├── .all-contributorsrc
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   ├── PULL_REQUEST_TEMPLATE
│   │   ├── bug_fix.md
│   │   ├── documentation_related.md
│   │   ├── engine_implementation.md
│   │   └── feature_implementation.md
│   └── workflows
│       ├── deploy.yml
│       └── test.yml
├── .gitignore
├── .pylintrc
├── .readthedocs.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── assets
│   ├── animate.gif
│   └── example.gif
├── docs
│   ├── Makefile
│   ├── documentation.md
│   ├── engines.md
│   ├── faq.md
│   ├── make.bat
│   ├── source
│   │   ├── conf.py
│   │   ├── index.rst
│   │   ├── modules.rst
│   │   ├── search_engine_parser.core.engines.rst
│   │   ├── search_engine_parser.core.rst
│   │   ├── search_engine_parser.rst
│   │   └── search_engine_parser.tests.rst
│   └── supported_engines.md
├── fixtures
│   ├── aol-Hello-synopsis.yaml
│   ├── ask-Hello-synopsis.yaml
│   ├── baidu-Hello-synopsis.yaml
│   ├── bing-Hello-synopsis.yaml
│   ├── coursera-Hello-synopsis.yaml
│   ├── duckduckgo-Hello-synopsis.yaml
│   ├── github-Hello-synopsis.yaml
│   ├── google-Hello-synopsis.yaml
│   ├── google-test-diff-synopsis.yaml
│   ├── googlenews-Hello-synopsis.yaml
│   ├── googlescholar-Hello-synopsis.yaml
│   ├── myanimelist-Hello-synopsis.yaml
│   ├── stackoverflow-Hello-synopsis.yaml
│   ├── yahoo-Hello-synopsis.yaml
│   ├── yahoo-test-diff-synopsis.yaml
│   ├── yandex-Hello-synopsis.yaml
│   └── youtube-Hello-synopsis.yaml
├── requirements
│   ├── cli.txt
│   ├── dev.txt
│   └── main.txt
├── scripts
│   ├── docs.sh
│   ├── post_deploy_test.sh
│   └── pre_deploy_test.sh
├── search_engine_parser
│   ├── .gitignore
│   ├── __init__.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── cli.py
│   │   ├── engines
│   │   │   ├── __init__.py
│   │   │   ├── aol.py
│   │   │   ├── ask.py
│   │   │   ├── baidu.py
│   │   │   ├── bing.py
│   │   │   ├── coursera.py
│   │   │   ├── duckduckgo.py
│   │   │   ├── github.py
│   │   │   ├── google.py
│   │   │   ├── googlenews.py
│   │   │   ├── googlescholar.py
│   │   │   ├── myanimelist.py
│   │   │   ├── stackoverflow.py
│   │   │   ├── yahoo.py
│   │   │   ├── yandex.py
│   │   │   └── youtube.py
│   │   ├── exceptions.py
│   │   └── utils.py
│   └── tests
│       ├── __init__.py
│       ├── test_base.py
│       └── test_cli.py
└── setup.py
/.all-contributorsrc:
--------------------------------------------------------------------------------
1 | {
2 | "files": [
3 | "README.md"
4 | ],
5 | "imageSize": 100,
6 | "commit": false,
7 | "contributors": [
8 | {
9 | "login": "Rexogamer",
10 | "name": "Ed Luff",
11 | "avatar_url": "https://avatars0.githubusercontent.com/u/42586271?v=4",
12 | "profile": "https://github.com/Rexogamer",
13 | "contributions": [
14 | "code"
15 | ]
16 | },
17 | {
18 | "login": "deven96",
19 | "name": "Diretnan Domnan",
20 | "avatar_url": "https://avatars3.githubusercontent.com/u/23453888?v=4",
21 | "profile": "http://diretnandomnan.webnode.com",
22 | "contributions": [
23 | "infra",
24 | "test",
25 | "tool",
26 | "code"
27 | ]
28 | },
29 | {
30 | "login": "MeNsaaH",
31 | "name": "MeNsaaH",
32 | "avatar_url": "https://avatars3.githubusercontent.com/u/24734308?v=4",
33 | "profile": "http://mensaah.github.io",
34 | "contributions": [
35 | "infra",
36 | "test",
37 | "tool",
38 | "code"
39 | ]
40 | },
41 | {
42 | "login": "PalAditya",
43 | "name": "Aditya Pal",
44 | "avatar_url": "https://avatars2.githubusercontent.com/u/25523604?v=4",
45 | "profile": "https://github.com/PalAditya",
46 | "contributions": [
47 | "test",
48 | "code",
49 | "doc"
50 | ]
51 | },
52 | {
53 | "login": "AvinashReddy3108",
54 | "name": "Avinash Reddy",
55 | "avatar_url": "https://avatars1.githubusercontent.com/u/27774996?v=4",
56 | "profile": "http://energized.pro",
57 | "contributions": [
58 | "bug"
59 | ]
60 | },
61 | {
62 | "login": "Iamdavidonuh",
63 | "name": "David Onuh",
64 | "avatar_url": "https://avatars3.githubusercontent.com/u/37768509?v=4",
65 | "profile": "https://github.com/Iamdavidonuh",
66 | "contributions": [
67 | "code",
68 | "test"
69 | ]
70 | },
71 | {
72 | "login": "sp1thas",
73 | "name": "Panagiotis Simakis",
74 | "avatar_url": "https://avatars2.githubusercontent.com/u/8322266?v=4",
75 | "profile": "http://simakis.me",
76 | "contributions": [
77 | "code",
78 | "test"
79 | ]
80 | },
81 | {
82 | "login": "reiarthur",
83 | "name": "reiarthur",
84 | "avatar_url": "https://avatars2.githubusercontent.com/u/20190646?v=4",
85 | "profile": "https://github.com/reiarthur",
86 | "contributions": [
87 | "code"
88 | ]
89 | },
90 | {
91 | "login": "ashokkumarta",
92 | "name": "Ashokkumar TA",
93 | "avatar_url": "https://avatars0.githubusercontent.com/u/5450267?v=4",
94 | "profile": "http://ashokkumarta.blogspot.com/",
95 | "contributions": [
96 | "code"
97 | ]
98 | },
99 | {
100 | "login": "ateuber",
101 | "name": "Andreas Teuber",
102 | "avatar_url": "https://avatars2.githubusercontent.com/u/44349054?v=4",
103 | "profile": "https://github.com/ateuber",
104 | "contributions": [
105 | "code"
106 | ]
107 | },
108 | {
109 | "login": "mi096684",
110 | "name": "mi096684",
111 | "avatar_url": "https://avatars3.githubusercontent.com/u/22032932?v=4",
112 | "profile": "https://github.com/mi096684",
113 | "contributions": [
114 | "bug"
115 | ]
116 | },
117 | {
118 | "login": "devajithvs",
119 | "name": "devajithvs",
120 | "avatar_url": "https://avatars1.githubusercontent.com/u/29475282?v=4",
121 | "profile": "https://github.com/devajithvs",
122 | "contributions": [
123 | "code"
124 | ]
125 | },
126 | {
127 | "login": "zakaryan2004",
128 | "name": "Geg Zakaryan",
129 | "avatar_url": "https://avatars3.githubusercontent.com/u/29994884?v=4",
130 | "profile": "https://github.com/zakaryan2004",
131 | "contributions": [
132 | "code",
133 | "bug"
134 | ]
135 | },
136 | {
137 | "login": "redrussianarmy",
138 | "name": "Hakan Boğan",
139 | "avatar_url": "https://avatars1.githubusercontent.com/u/24498747?v=4",
140 | "profile": "https://www.hakanbogan.com",
141 | "contributions": [
142 | "bug"
143 | ]
144 | },
145 | {
146 | "login": "NicKoehler",
147 | "name": "NicKoehler",
148 | "avatar_url": "https://avatars3.githubusercontent.com/u/53040044?v=4",
149 | "profile": "https://github.com/NicKoehler",
150 | "contributions": [
151 | "bug",
152 | "code"
153 | ]
154 | },
155 | {
156 | "login": "chris4540",
157 | "name": "ChrisLin",
158 | "avatar_url": "https://avatars1.githubusercontent.com/u/12794588?v=4",
159 | "profile": "https://github.com/chris4540",
160 | "contributions": [
161 | "bug",
162 | "code"
163 | ]
164 | },
165 | {
166 | "login": "pgrandinetti",
167 | "name": "Pietro",
168 | "avatar_url": "https://avatars.githubusercontent.com/u/10454135?v=4",
169 | "profile": "http://pete.world",
170 | "contributions": [
171 | "code",
172 | "bug"
173 | ]
174 | }
175 | ],
176 | "contributorsPerLine": 7,
177 | "projectName": "search-engine-parser",
178 | "projectOwner": "bisoncorps",
179 | "repoType": "github",
180 | "repoHost": "https://github.com",
181 | "skipCi": true
182 | }
183 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: 'bug'
6 | assignees: '@deven96'
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Open python console to
16 | 2. Import search_engine_parser
17 | 3. Search using .... Engine
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. Windows]
28 | - Python Version [e.g. 3.6.5]
29 | - Search-engine-parser version [e.g. 0.5.1]
30 |
31 |
32 | **Additional context**
33 | Add any other context about the problem here.
34 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: 'enhancement'
6 | assignees: '@deven96'
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/bug_fix.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Fix
3 | title: ''
4 | labels: 'patch', 'needs-review'
5 | assignees: '@MeNsaaH'
6 |
7 | ---
8 |
9 | **Issue relating to the bug**
10 | Issue number relating to the bug e.g #13
11 |
12 | **Simple summary of steps Taken to fix the bug**
13 | A clear and concise description of what the fix is. Ex. I added a browser header to the base engine `search_engine_parser/core/engines/base` to prevent captchas.
14 |
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions you've considered.
17 |
18 | **Additional context**
19 | Add any other context or screenshots about the fix here.
20 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/documentation_related.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Documentation Related
3 | about: Added documentation to the project
4 | title: ''
5 | labels: 'documentation', 'needs-review'
6 | assignees: '@MenSaaH'
7 |
8 | ---
9 |
10 | **Describe the change to the documentation**
11 | A clear and concise description of what the change/addition is.
12 |
13 | **Issue fix?**
14 | Issue number that this documentation PR fixes.
15 |
16 | **Screenshots**
17 | If applicable, add screenshots of the sphinx documentation rendered on your local machine.
18 |
19 | **Additional context**
20 | Add any other context about the PR here.
21 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/engine_implementation.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Engine Implementation
3 | about: Implemented a new engine
4 | title: 'Name of Engine goes here'
5 | labels: 'engine', 'needs-review'
6 | assignees: '@deven96', '@MenSaaH'
7 |
8 | ---
9 |
10 | **Issue relating to the engine request**
11 | Issue number relating to the engine e.g #13
12 |
13 | **Summary of steps Taken to implement the engine**
14 | A clear and concise description of what the engine is.
15 |
16 | ```t
17 | Ex. I added the GitHub engine, `github.py` to the `search_engine_parser/core/engines` directory and made the necessary imports.
18 | This engine integrates GitHub search capabilities and returns stars, repository info, descriptions, links and titles.
19 | ```
20 |
21 | **Describe any issues you've faced or inconsistencies in the engine**
22 | A clear and concise description of any issues you've faced. Ex. I was unable to parse 10 results per page due to [...]
23 |
24 | **Additional context**
25 | Add any other context or screenshots about the engine here.
26 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/feature_implementation.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature Implementation
3 | about: ''
4 | title: ''
5 | labels: 'feature', 'needs-review'
6 | assignees: '@deven96', '@MenSaaH'
7 |
8 | ---
9 |
10 | **Issue relating to the feature**
11 | Issue number relating to the feature e.g #13
12 |
13 | **Summary of steps Taken to implement the feature**
14 | A clear and concise description of what the feature is.
15 |
16 | ```t
17 | Ex. I added a browser header to the base engine `search_engine_parser/core/engines/base` to prevent captchas.
18 | ```
19 |
20 | **Describe any issues you've faced or inconsistencies in implementing the feature**
21 | A clear and concise description of any issues you've faced. Ex. Captchas still occur after a certain amount of usage [...]
22 |
23 | **Additional context**
24 | Add any other context or screenshots about the feature here.
25 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy to Pypi
2 | on:
3 | push:
4 | tags:
5 | - 'v*.*.*'
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v1
12 |
13 | - name: Set up Python 3.7
14 | uses: actions/setup-python@v1
15 | with:
16 | python-version: 3.7
17 |
18 | - name: Install Dependencies
19 | run: pip install -r requirements/dev.txt
20 |
21 | - name: Set env
22 | run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
23 |
24 | - name: update Package version
25 | run: sed -i "s/.*__version__.*/__version__ = \"${{ env.RELEASE_VERSION }}\"/g" search_engine_parser/__init__.py
26 |
27 | - name: Install pypa/build
28 | run: python -m pip install build --user
29 |
30 | - name: Build a binary wheel and a source tarball
31 | run: python -m build --sdist --wheel --outdir dist/ .
32 |
33 | - name: Build Changelog
34 | id: github_release
35 | uses: mikepenz/release-changelog-builder-action@v3
36 | env:
37 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
38 |
39 | - name: Create Release
40 | uses: softprops/action-gh-release@v0.1.14
41 | with:
42 | body: ${{steps.github_release.outputs.changelog}}
43 |
44 | - name: Publish package
45 | uses: pypa/gh-action-pypi-publish@release/v1
46 | with:
47 | user: __token__
48 | password: ${{ secrets.PYPI_API_TOKEN }}
49 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 | on:
3 | push:
4 | branches:
5 | - master
6 | paths:
7 | - '**.py'
8 | - 'requirements/**'
9 | pull_request:
10 | branches:
11 | - master
12 | paths:
13 | - '**.py'
14 | - 'requirements/**'
15 |
16 | jobs:
17 | test:
18 | strategy:
19 | matrix:
20 | python: ["3.6", "3.7", "3.8", "3.9"]
21 | runs-on: ubuntu-latest
22 | steps:
23 | - uses: actions/checkout@v1
24 |
25 | - name: Set up Python 3.7
26 | uses: actions/setup-python@v1
27 | with:
28 | python-version: ${{ matrix.python }}
29 |
30 | - name: Install Dependencies
31 | run: pip install -r requirements/dev.txt
32 |
33 | - name: Run tests
34 | run: pytest -s
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | #search_engine_parser cache
107 | **/cache/**
108 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 |
3 | # A comma-separated list of package or module names from where C extensions may
4 | # be loaded. Extensions are loading into the active Python interpreter and may
5 | # run arbitrary code.
6 | extension-pkg-whitelist=
7 |
8 | # Add files or directories to the blacklist. They should be base names, not
9 | # paths.
10 | ignore=CVS
11 |
12 | # Add files or directories matching the regex patterns to the blacklist. The
13 | # regex matches against base names, not paths.
14 | ignore-patterns=
15 |
16 | # Python code to execute, usually for sys.path manipulation such as
17 | # pygtk.require().
18 | #init-hook=
19 |
20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
21 | # number of processors available to use.
22 | jobs=1
23 |
24 | # Control the amount of potential inferred values when inferring a single
25 | # object. This can help the performance when dealing with large functions or
26 | # complex, nested conditions.
27 | limit-inference-results=100
28 |
29 | # List of plugins (as comma separated values of python modules names) to load,
30 | # usually to register additional checkers.
31 | load-plugins=
32 |
33 | # Pickle collected data for later comparisons.
34 | persistent=yes
35 |
36 | # Specify a configuration file. #rcfile=
37 |
38 | # When enabled, pylint would attempt to guess common misconfiguration and emit
39 | # user-friendly hints instead of false-positive error messages.
40 | suggestion-mode=yes
41 |
42 | # Allow loading of arbitrary C extensions. Extensions are imported into the
43 | # active Python interpreter and may run arbitrary code.
44 | unsafe-load-any-extension=no
45 |
46 |
47 | [MESSAGES CONTROL]
48 |
49 | # Only show warnings with the listed confidence levels. Leave empty to show
50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
51 | confidence=
52 |
53 | # Disable the message, report, category or checker with the given id(s). You
54 | # can either give multiple identifiers separated by comma (,) or put this
55 | # option multiple times (only on the command line, not in the configuration
56 | # file where it should appear only once). You can also use "--disable=all" to
57 | # disable everything first and then reenable specific checks. For example, if
58 | # you want to run only the similarities checker, you can use "--disable=all
59 | # --enable=similarities". If you want to run only the classes checker, but have
60 | # no Warning level messages displayed, use "--disable=all --enable=classes
61 | # --disable=W".
62 | disable=print-statement,
63 | parameter-unpacking,
64 | unpacking-in-except,
65 | old-raise-syntax,
66 | backtick,
67 | long-suffix,
68 | old-ne-operator,
69 | old-octal-literal,
70 | import-star-module-level,
71 | non-ascii-bytes-literal,
72 | raw-checker-failed,
73 | bad-inline-option,
74 | locally-disabled,
75 | file-ignored,
76 | suppressed-message,
77 | useless-suppression,
78 | deprecated-pragma,
79 | use-symbolic-message-instead,
80 | apply-builtin,
81 | basestring-builtin,
82 | buffer-builtin,
83 | cmp-builtin,
84 | coerce-builtin,
85 | execfile-builtin,
86 | file-builtin,
87 | long-builtin,
88 | raw_input-builtin,
89 | reduce-builtin,
90 | standarderror-builtin,
91 | unicode-builtin,
92 | xrange-builtin,
93 | coerce-method,
94 | delslice-method,
95 | getslice-method,
96 | setslice-method,
97 | no-absolute-import,
98 | old-division,
99 | dict-iter-method,
100 | dict-view-method,
101 | next-method-called,
102 | metaclass-assignment,
103 | indexing-exception,
104 | raising-string,
105 | reload-builtin,
106 | oct-method,
107 | hex-method,
108 | nonzero-method,
109 | cmp-method,
110 | input-builtin,
111 | round-builtin,
112 | missing-docstring,
113 | intern-builtin,
114 | unichr-builtin,
115 | map-builtin-not-iterating,
116 | zip-builtin-not-iterating,
117 | range-builtin-not-iterating,
118 | filter-builtin-not-iterating,
119 | using-cmp-argument,
120 | eq-without-hash,
121 | div-method,
122 | idiv-method,
123 | rdiv-method,
124 | exception-message-attribute,
125 | invalid-str-codec,
126 | sys-max-int,
127 | bad-python3-import,
128 | deprecated-string-function,
129 | deprecated-str-translate-call,
130 | deprecated-itertools-function,
131 | deprecated-types-field,
132 | next-method-defined,
133 | dict-items-not-iterating,
134 | dict-keys-not-iterating,
135 | dict-values-not-iterating,
136 | deprecated-operator-function,
137 | deprecated-urllib-function,
138 | xreadlines-attribute,
139 | deprecated-sys-function,
140 | exception-escape,
141 | comprehension-escape,
142 | R0801
143 |
144 | # Enable the message, report, category or checker with the given id(s). You can
145 | # either give multiple identifier separated by comma (,) or put this option
146 | # multiple time (only on the command line, not in the configuration file where
147 | # it should appear only once). See also the "--disable" option for examples.
148 | enable=c-extension-no-member
149 |
150 |
151 | [REPORTS]
152 |
153 | # Python expression which should return a note less than 10 (10 is the highest
154 | # note). You have access to the variables errors warning, statement which
155 | # respectively contain the number of errors / warnings messages and the total
156 | # number of statements analyzed. This is used by the global evaluation report
157 | # (RP0004).
158 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
159 |
160 | # Template used to display messages. This is a python new-style format string
161 | # used to format the message information. See doc for all details.
162 | #msg-template=
163 |
164 | # Set the output format. Available formats are text, parseable, colorized, json
165 | # and msvs (visual studio). You can also give a reporter class, e.g.
166 | # mypackage.mymodule.MyReporterClass.
167 | output-format=text
168 |
169 | # Tells whether to display a full report or only the messages.
170 | reports=no
171 |
172 | # Activate the evaluation score.
173 | score=yes
174 |
175 |
176 | [REFACTORING]
177 |
178 | # Maximum number of nested blocks for function / method body
179 | max-nested-blocks=5
180 |
181 | # Complete name of functions that never returns. When checking for
182 | # inconsistent-return-statements if a never returning function is called then
183 | # it will be considered as an explicit return statement and no message will be
184 | # printed.
185 | never-returning-functions=sys.exit
186 |
187 |
188 | [MISCELLANEOUS]
189 |
190 | # List of note tags to take in consideration, separated by a comma.
191 | notes=FIXME,
192 | XXX,
193 | TODO
194 |
195 |
196 | [LOGGING]
197 |
198 | # Format style used to check logging format string. `old` means using %
199 | # formatting, while `new` is for `{}` formatting.
200 | logging-format-style=old
201 |
202 | # Logging modules to check that the string format arguments are in logging
203 | # function parameter format.
204 | logging-modules=logging
205 |
206 |
207 | [STRING]
208 |
209 | # This flag controls whether the implicit-str-concat-in-sequence should
210 | # generate a warning on implicit string concatenation in sequences defined over
211 | # several lines.
212 | check-str-concat-over-line-jumps=no
213 |
214 |
215 | [SPELLING]
216 |
217 | # Limits count of emitted suggestions for spelling mistakes.
218 | max-spelling-suggestions=4
219 |
220 | # Spelling dictionary name. Available dictionaries: none. To make it working
221 | # install python-enchant package..
222 | spelling-dict=
223 |
224 | # List of comma separated words that should not be checked.
225 | spelling-ignore-words=
226 |
227 | # A path to a file that contains private dictionary; one word per line.
228 | spelling-private-dict-file=
229 |
230 | # Tells whether to store unknown words to indicated private dictionary in
231 | # --spelling-private-dict-file option instead of raising a message.
232 | spelling-store-unknown-words=no
233 |
234 |
235 | [FORMAT]
236 |
237 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
238 | expected-line-ending-format=
239 |
240 | # Regexp for a line that is allowed to be longer than the limit.
241 | ignore-long-lines=^\s*(# )??$
242 |
243 | # Number of spaces of indent required inside a hanging or continued line.
244 | indent-after-paren=4
245 |
246 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
247 | # tab).
248 | indent-string=' '
249 |
250 | # Maximum number of characters on a single line.
251 | max-line-length=100
252 |
253 | # Maximum number of lines in a module.
254 | max-module-lines=1000
255 |
256 | # List of optional constructs for which whitespace checking is disabled. `dict-
257 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
258 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
259 | # `empty-line` allows space-only lines.
260 | no-space-check=trailing-comma,
261 | dict-separator
262 |
263 | # Allow the body of a class to be on the same line as the declaration if body
264 | # contains single statement.
265 | single-line-class-stmt=no
266 |
267 | # Allow the body of an if to be on the same line as the test if there is no
268 | # else.
269 | single-line-if-stmt=no
270 |
271 |
272 | [BASIC]
273 |
274 | # Naming style matching correct argument names.
275 | argument-naming-style=snake_case
276 |
277 | # Regular expression matching correct argument names. Overrides argument-
278 | # naming-style.
279 | #argument-rgx=
280 |
281 | # Naming style matching correct attribute names.
282 | attr-naming-style=snake_case
283 |
284 | # Regular expression matching correct attribute names. Overrides attr-naming-
285 | # style.
286 | #attr-rgx=
287 |
288 | # Bad variable names which should always be refused, separated by a comma.
289 | bad-names=foo,
290 | bar,
291 | baz,
292 | toto,
293 | tutu,
294 | tata
295 |
296 | # Naming style matching correct class attribute names.
297 | class-attribute-naming-style=any
298 |
299 | # Regular expression matching correct class attribute names. Overrides class-
300 | # attribute-naming-style.
301 | #class-attribute-rgx=
302 |
303 | # Naming style matching correct class names.
304 | class-naming-style=PascalCase
305 |
306 | # Regular expression matching correct class names. Overrides class-naming-
307 | # style.
308 | #class-rgx=
309 |
310 | # Naming style matching correct constant names.
311 | const-naming-style=UPPER_CASE
312 |
313 | # Regular expression matching correct constant names. Overrides const-naming-
314 | # style.
315 | #const-rgx=
316 |
317 | # Minimum line length for functions/classes that require docstrings, shorter
318 | # ones are exempt.
319 | docstring-min-length=-1
320 |
321 | # Naming style matching correct function names.
322 | function-naming-style=snake_case
323 |
324 | # Regular expression matching correct function names. Overrides function-
325 | # naming-style.
326 | #function-rgx=
327 |
328 | # Good variable names which should always be accepted, separated by a comma.
329 | good-names=i,
330 | j,
331 | k,
332 | ex,
333 | Run,
334 | _
335 |
336 | # Include a hint for the correct naming format with invalid-name.
337 | include-naming-hint=no
338 |
339 | # Naming style matching correct inline iteration names.
340 | inlinevar-naming-style=any
341 |
342 | # Regular expression matching correct inline iteration names. Overrides
343 | # inlinevar-naming-style.
344 | #inlinevar-rgx=
345 |
346 | # Naming style matching correct method names.
347 | method-naming-style=snake_case
348 |
349 | # Regular expression matching correct method names. Overrides method-naming-
350 | # style.
351 | #method-rgx=
352 |
353 | # Naming style matching correct module names.
354 | module-naming-style=snake_case
355 |
356 | # Regular expression matching correct module names. Overrides module-naming-
357 | # style.
358 | #module-rgx=
359 |
360 | # Colon-delimited sets of names that determine each other's naming style when
361 | # the name regexes allow several styles.
362 | name-group=
363 |
364 | # Regular expression which should only match function or class names that do
365 | # not require a docstring.
366 | no-docstring-rgx=^_
367 |
368 | # List of decorators that produce properties, such as abc.abstractproperty. Add
369 | # to this list to register other decorators that produce valid properties.
370 | # These decorators are taken in consideration only for invalid-name.
371 | property-classes=abc.abstractproperty
372 |
373 | # Naming style matching correct variable names.
374 | variable-naming-style=snake_case
375 |
376 | # Regular expression matching correct variable names. Overrides variable-
377 | # naming-style.
378 | #variable-rgx=
379 |
380 |
381 | [TYPECHECK]
382 |
383 | # List of decorators that produce context managers, such as
384 | # contextlib.contextmanager. Add to this list to register other decorators that
385 | # produce valid context managers.
386 | contextmanager-decorators=contextlib.contextmanager
387 |
388 | # List of members which are set dynamically and missed by pylint inference
389 | # system, and so shouldn't trigger E1101 when accessed. Python regular
390 | # expressions are accepted.
391 | generated-members=
392 |
393 | # Tells whether missing members accessed in mixin class should be ignored. A
394 | # mixin class is detected if its name ends with "mixin" (case insensitive).
395 | ignore-mixin-members=yes
396 |
397 | # Tells whether to warn about missing members when the owner of the attribute
398 | # is inferred to be None.
399 | ignore-none=yes
400 |
401 | # This flag controls whether pylint should warn about no-member and similar
402 | # checks whenever an opaque object is returned when inferring. The inference
403 | # can return multiple potential results while evaluating a Python object, but
404 | # some branches might not be evaluated, which results in partial inference. In
405 | # that case, it might be useful to still emit no-member and other checks for
406 | # the rest of the inferred objects.
407 | ignore-on-opaque-inference=yes
408 |
409 | # List of class names for which member attributes should not be checked (useful
410 | # for classes with dynamically set attributes). This supports the use of
411 | # qualified names.
412 | ignored-classes=optparse.Values,thread._local,_thread._local
413 |
414 | # List of module names for which member attributes should not be checked
415 | # (useful for modules/projects where namespaces are manipulated during runtime
416 | # and thus existing member attributes cannot be deduced by static analysis. It
417 | # supports qualified module names, as well as Unix pattern matching.
418 | ignored-modules=
419 |
420 | # Show a hint with possible names when a member name was not found. The aspect
421 | # of finding the hint is based on edit distance.
422 | missing-member-hint=yes
423 |
424 | # The minimum edit distance a name should have in order to be considered a
425 | # similar match for a missing member name.
426 | missing-member-hint-distance=1
427 |
428 | # The total number of similar names that should be taken in consideration when
429 | # showing a hint for a missing member.
430 | missing-member-max-choices=1
431 |
432 |
433 | [VARIABLES]
434 |
435 | # List of additional names supposed to be defined in builtins. Remember that
436 | # you should avoid defining new builtins when possible.
437 | additional-builtins=
438 |
439 | # Tells whether unused global variables should be treated as a violation.
440 | allow-global-unused-variables=yes
441 |
442 | # List of strings which can identify a callback function by name. A callback
443 | # name must start or end with one of those strings.
444 | callbacks=cb_,
445 | _cb
446 |
447 | # A regular expression matching the name of dummy variables (i.e. expected to
448 | # not be used).
449 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
450 |
451 | # Argument names that match this expression will be ignored. Default to name
452 | # with leading underscore.
453 | ignored-argument-names=_.*|^ignored_|^unused_
454 |
455 | # Tells whether we should check for unused import in __init__ files.
456 | init-import=no
457 |
458 | # List of qualified module names which can have objects that can redefine
459 | # builtins.
460 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
461 |
462 |
463 | [SIMILARITIES]
464 |
465 | # Ignore comments when computing similarities.
466 | ignore-comments=yes
467 |
468 | # Ignore docstrings when computing similarities.
469 | ignore-docstrings=yes
470 |
471 | # Ignore imports when computing similarities.
472 | ignore-imports=no
473 |
474 | # Minimum lines number of a similarity.
475 | min-similarity-lines=4
476 |
477 |
478 | [IMPORTS]
479 |
480 | # Allow wildcard imports from modules that define __all__.
481 | allow-wildcard-with-all=no
482 |
483 | # Analyse import fallback blocks. This can be used to support both Python 2 and
484 | # 3 compatible code, which means that the block might have code that exists
485 | # only in one or another interpreter, leading to false positives when analysed.
486 | analyse-fallback-blocks=no
487 |
488 | # Deprecated modules which should not be used, separated by a comma.
489 | deprecated-modules=optparse,tkinter.tix
490 |
491 | # Create a graph of external dependencies in the given file (report RP0402 must
492 | # not be disabled).
493 | ext-import-graph=
494 |
495 | # Create a graph of every (i.e. internal and external) dependencies in the
496 | # given file (report RP0402 must not be disabled).
497 | import-graph=
498 |
499 | # Create a graph of internal dependencies in the given file (report RP0402 must
500 | # not be disabled).
501 | int-import-graph=
502 |
503 | # Force import order to recognize a module as part of the standard
504 | # compatibility libraries.
505 | known-standard-library=
506 |
507 | # Force import order to recognize a module as part of a third party library.
508 | known-third-party=enchant
509 |
510 |
511 | [DESIGN]
512 |
513 | # Maximum number of arguments for function / method.
514 | max-args=5
515 |
516 | # Maximum number of attributes for a class (see R0902).
517 | max-attributes=7
518 |
519 | # Maximum number of boolean expressions in an if statement.
520 | max-bool-expr=5
521 |
522 | # Maximum number of branch for function / method body.
523 | max-branches=12
524 |
525 | # Maximum number of locals for function / method body.
526 | max-locals=15
527 |
528 | # Maximum number of parents for a class (see R0901).
529 | max-parents=7
530 |
531 | # Maximum number of public methods for a class (see R0904).
532 | max-public-methods=20
533 |
534 | # Maximum number of return / yield for function / method body.
535 | max-returns=6
536 |
537 | # Maximum number of statements in function / method body.
538 | max-statements=50
539 |
540 | # Minimum number of public methods for a class (see R0903).
541 | min-public-methods=2
542 |
543 |
544 | [CLASSES]
545 |
546 | # List of method names used to declare (i.e. assign) instance attributes.
547 | defining-attr-methods=__init__,
548 | __new__,
549 | setUp
550 |
551 | # List of member names, which should be excluded from the protected access
552 | # warning.
553 | exclude-protected=_asdict,
554 | _fields,
555 | _replace,
556 | _source,
557 | _make
558 |
559 | # List of valid names for the first argument in a class method.
560 | valid-classmethod-first-arg=cls
561 |
562 | # List of valid names for the first argument in a metaclass class method.
563 | valid-metaclass-classmethod-first-arg=cls
564 |
565 |
566 | [EXCEPTIONS]
567 |
568 | # Exceptions that will emit a warning when being caught. Defaults to
569 | # "BaseException, Exception".
570 | overgeneral-exceptions=BaseException,
571 | Exception
572 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/source/conf.py
11 |
12 | # Build documentation with MkDocs
13 | #mkdocs:
14 | # configuration: mkdocs.yml
15 |
16 | # Optionally build your docs in additional formats such as PDF and ePub
17 | formats: all
18 |
19 | # Optionally set the version of Python and requirements required to build your docs
20 | python:
21 | version: 3.7
22 | install:
23 | - requirements: requirements/dev.txt
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at diretnan.bisoncorps@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | ## How to contribute to Search Engine Parser
2 |
3 | All contributions to the code base or documentation must be made on a branch with an intuitive name, e.g. `aol-#13-patch` or `yandex-engine-implementation`
4 |
5 | #### **Did you find a bug?**
6 |
7 |
8 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues).
9 |
10 | * If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/bisoncorps/search-engine-parser/issues/new). If possible, be sure to make use of the [bug template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/bug_report.md) with label `bug`
11 |
12 | * Ensure the issue description clearly describes the bug. Include the relevant issue number if applicable.
13 |
14 | #### **Did you write a patch that fixes a bug?**
15 |
16 | * Ensure the bug has already been reported by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues) using the label `bug`
17 |
18 | * If no such issue exists, open one with the [bug report template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/bug_report.md)
19 |
20 | * Open a new GitHub pull request with the patch using [bug fix template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/bug_fix.md).
21 |
22 | * Ensure the PR description clearly describes the solution. Include the relevant issue number if applicable.
23 |
24 |
25 | #### **Do you intend to add a new feature or change an existing one?**
26 |
27 | * **Ensure the feature was not already requested** by searching on GitHub under [Issues](https://github.com/bisoncorps/search-engine-parser/issues). Search using the `enhancement` or `feature` labels
28 |
29 | * Suggest your feature/change in the [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser) and start writing code.
30 |
31 | * Do not open an issue on GitHub until you have collected positive feedback about the change.
32 |
33 | * Raise an issue using the [feature request template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/feature_request.md) with labels `enhancement`
34 |
35 | * Upon implementing the feature, make a PR using the [feature implementation template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/feature_implementation.md)
36 |
37 | ##### **Engines**
38 |
39 | * Refer to the [SearchEngineParser Engines Documentation](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/engines.md) for help on implementing Engines
40 |
41 | * If an issue for the Engine does not already exist under [Issues](https://github.com/bisoncorps/search-engine-parser/issues), suggest the engine in the [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser)
42 |
43 | * If the Engine to be included is accepted, raise an issue using the [feature template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/ISSUE_TEMPLATE/feature_request.md) and labels `enhancement` and `engine`
44 |
45 | * Upon implementing the Engine, make a PR using the [engine implementation template](https://github.com/bisoncorps/search-engine-parser/blob/master/.github/PULL_REQUEST_TEMPLATE/engine_implementation.md)
46 |
47 |
48 | #### **Do you have questions about the source code?**
49 |
50 | * Ask any question about how to use SearchEngineParser [search-engine-parser mailing list](https://groups.google.com/forum/?fromgroups#!forum/searchengineparser).
51 |
52 | #### **Do you want to contribute to the search-engine-parser documentation?**
53 |
54 | * Please read [Contributing to the SearchEngineParser Documentation](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md).
55 |
56 |
57 |
58 | `NOTE: There are exceptions in every case and we know that too!`
59 |
60 | SearchEngineParser is a volunteer effort. We encourage you to pitch in and [join the team](https://github.com/bisoncorps/search-engine-parser/blob/master/README.md#contributors)!
61 |
62 |
63 | Thanks!
64 |
65 | Bisoncorps Team - `B`uilding `I`nteresting `S`oftware `O`pensourced for huma`NS` :heart: :heart:
66 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 bison_corps/search-engine-parser
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | include requirements/*.txt
3 | include README.md
4 | recursive-include search_engine_parser *.py
5 | prune docs/
6 | prune scripts/
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Search Engine Parser
2 |
3 | "If it is a search engine, then it can be parsed" - some random guy
4 |
5 | 
6 |
7 | [](https://www.python.org/downloads/)
8 | [](https://pypi.org/project/search-engine-parser/)
9 | [](https://pypi.org/project/search-engine-parser/)
10 | [](https://github.com/bisohns/search-engine-parser/actions/workflows/deploy.yml)
11 | [](https://github.com/bisohns/search-engine-parser/actions/workflows/test.yml)
12 | [](https://search-engine-parser.readthedocs.io/en/latest/?badge=latest)
13 | [](https://opensource.org/licenses/MIT)
14 | [](#contributors)
15 |
16 |
17 | search-engine-parser is a package that lets you query popular search engines and scrape for result titles, links, descriptions and more. It aims to scrape the widest range of search engines.
18 | View all supported engines [here.](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/supported_engines.md)
19 |
20 | - [Search Engine Parser](#search-engine-parser)
21 | - [Popular Supported Engines](#popular-supported-engines)
22 | - [Installation](#installation)
23 | - [Development](#development)
24 | - [Code Documentation](#code-documentation)
25 | - [Running the tests](#running-the-tests)
26 | - [Usage](#usage)
27 | - [Code](#code)
28 | - [Command line](#command-line)
29 | - [FAQ](docs/faq.md)
30 | - [Code of Conduct](#code-of-conduct)
31 | - [Contribution](#contribution)
32 | - [License (MIT)](#license-mit)
33 |
34 | ## Popular Supported Engines
35 | Popular search engines supported include:
36 |
37 | - Google
38 | - DuckDuckGo
39 | - GitHub
40 | - StackOverflow
41 | - Baidu
42 | - YouTube
43 |
44 | View all supported engines [here.](docs/supported_engines.md)
45 |
46 | ## Installation
47 | Install from PyPi:
48 |
49 | ```bash
50 | # install only package dependencies
51 | pip install search-engine-parser
52 | # Installs `pysearch` cli tool
53 | pip install "search-engine-parser[cli]"
54 | ```
55 |
56 | or from master:
57 | ```bash
58 | pip install git+https://github.com/bisoncorps/search-engine-parser
59 | ```
60 |
61 | ## Development
62 | Clone the repository:
63 |
64 | ```bash
65 | git clone git@github.com:bisoncorps/search-engine-parser.git
66 | ```
67 |
68 | Then create a virtual environment and install the required packages:
69 |
70 | ```bash
71 | mkvirtualenv search_engine_parser
72 | pip install -r requirements/dev.txt
73 | ```
74 |
75 |
76 | ## Code Documentation
77 | Code docs can be found on [Read the Docs](https://search-engine-parser.readthedocs.io/en/latest).
78 |
79 | ## Running the tests
80 | ```bash
81 | pytest
82 | ```
83 |
84 | ## Usage
85 |
86 | ### Code
87 | Query results can be scraped from popular search engines, as shown in the example snippet below.
88 |
89 | ```python
90 | import pprint
91 |
92 | from search_engine_parser.core.engines.bing import Search as BingSearch
93 | from search_engine_parser.core.engines.google import Search as GoogleSearch
94 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch
95 |
96 | search_args = ('preaching to the choir', 1)
97 | gsearch = GoogleSearch()
98 | ysearch = YahooSearch()
99 | bsearch = BingSearch()
100 | gresults = gsearch.search(*search_args)
101 | yresults = ysearch.search(*search_args)
102 | bresults = bsearch.search(*search_args)
103 | a = {
104 | "Google": gresults,
105 | "Yahoo": yresults,
106 | "Bing": bresults
107 | }
108 |
109 | # pretty print the result from each engine
110 | for k, v in a.items():
111 | print(f"-------------{k}------------")
112 | for result in v:
113 | pprint.pprint(result)
114 |
115 | # print first title from google search
116 | print(gresults["titles"][0])
117 | # print 10th link from yahoo search
118 | print(yresults["links"][9])
119 | # print 6th description from bing search
120 | print(bresults["descriptions"][5])
121 |
122 | # print first result containing links, descriptions and title
123 | print(gresults[0])
124 | ```
125 |
126 | For localization, you can pass the `url` keyword with a localized URL. The same engine's parser is then used to query and parse the localized site:
127 | ```python
128 | # Use google.de instead of google.com
129 | results = gsearch.search(*search_args, url="google.de")
130 | ```
131 |
132 | If you need results in a specific language, you can pass the `hl` keyword and a two-letter language code (here's a [handy list](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)):
133 | ```python
134 | # Use 'it' to receive Italian results
135 | results = gsearch.search(*search_args, hl="it")
136 | ```
137 |
138 | #### Cache
139 | Results from engine searches are cached automatically. You can either bypass the cache by passing `cache=False` to the `search` or `async_search` method, or clear the engine's cache before searching:
140 | ```python
141 | from search_engine_parser.core.engines.github import Search as GitHub
142 | github = GitHub()
143 | # bypass the cache
144 | github.search("search-engine-parser", cache=False)
145 |
146 | #OR
147 | # clear cache before search
148 | github.clear_cache()
149 | github.search("search-engine-parser")
150 | ```
151 |
152 | #### Proxy
153 | To use a proxy, pass the proxy details to the search function:
154 | ```python
155 | from search_engine_parser.core.engines.github import Search as GitHub
156 | github = GitHub()
157 | github.search("search-engine-parser",
158 | # http proxies supported only
159 | proxy='http://123.12.1.0',
160 | proxy_auth=('username', 'password'))
161 | ```
162 |
163 |
164 | #### Async
165 | search-engine-parser supports `async`:
166 | ```python
167 | results = await gsearch.async_search(*search_args)
168 | ```
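
A minimal sketch of driving the async call from a plain script, assuming the same `GoogleSearch` engine and query used in the snippets above (the surrounding `asyncio` boilerplate here is illustrative, not prescribed by the library):

```python
import asyncio

from search_engine_parser.core.engines.google import Search as GoogleSearch

async def main():
    # illustrative sketch: async_search mirrors search() but is awaitable
    gsearch = GoogleSearch()
    results = await gsearch.async_search('preaching to the choir', 1)
    print(results["titles"][0])

asyncio.run(main())
```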
169 |
170 | #### Results
171 | The `search` method returns a `SearchResults` object:
172 | ```python
173 | >>> results = gsearch.search("preaching to the choir", 1)
174 | >>> results
175 |
176 | # the object supports retrieving individual results by iteration or just by type (links, descriptions, titles)
177 | >>> results[0] # returns the first
178 | >>> results[0]["description"] # gets the description of the first item
179 | >>> results[0]["link"] # gets the link of the first item
180 | >>> results["descriptions"] # returns a list of all descriptions from all results
181 | ```
182 | It can be iterated like a normal list to return individual `SearchItem`s.
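For example, a short sketch of looping over the results, assuming each `SearchItem` exposes the same `title`, `link` and `description` keys used in the indexing examples above:

```python
# iterate the SearchResults object like a list; each element is a SearchItem
for item in results:
    print(item["title"])        # item title
    print(item["link"])         # item link
    print(item["description"])  # item description
```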
183 |
184 | ### Command line
185 |
186 | search-engine-parser comes with a CLI tool known as `pysearch`. You can use it as such:
187 |
188 | ```bash
189 | pysearch --engine bing --type descriptions "Preaching to the choir"
190 | ```
191 |
192 | Result:
193 |
194 | ```bash
195 | 'Preaching to the choir' originated in the USA in the 1970s. It is a variant of the earlier 'preaching to the converted', which dates from England in the late 1800s and has the same meaning. Origin - the full story 'Preaching to the choir' (also sometimes spelled quire) is of US origin.
196 | ```
197 |
198 | 
199 |
200 | ```bash
201 | usage: pysearch [-h] [-V] [-e ENGINE] [--show-summary] [-u URL] [-p PAGE]
202 | [-t TYPE] [-cc] [-r RANK] [--proxy PROXY]
203 | [--proxy-user PROXY_USER] [--proxy-password PROXY_PASSWORD]
204 | query
205 |
206 | SearchEngineParser
207 |
208 | positional arguments:
209 | query Query string to search engine for
210 |
211 | optional arguments:
212 | -h, --help show this help message and exit
213 | -V, --version show program's version number and exit
214 | -e ENGINE, --engine ENGINE
215 | Engine to use for parsing the query e.g google, yahoo,
216 | bing,duckduckgo (default: google)
217 | --show-summary Shows the summary of an engine
218 | -u URL, --url URL A custom link to use as base url for search e.g
219 | google.de
220 | -p PAGE, --page PAGE Page of the result to return details for (default: 1)
221 | -t TYPE, --type TYPE Type of detail to return i.e full, links, descriptions
222 | or titles (default: full)
223 | -cc, --clear-cache Clear cache of engine before searching
224 | -r RANK, --rank RANK ID of Detail to return e.g 5 (default: 0)
225 | --proxy PROXY Proxy address to make use of
226 | --proxy-user PROXY_USER
227 | Proxy user to make use of
228 | --proxy-password PROXY_PASSWORD
229 | Proxy password to make use of
230 | ```
231 |
232 |
233 |
234 | ## Code of Conduct
235 | Make sure to adhere to the [code of conduct](CODE_OF_CONDUCT.md) at all times.
236 |
237 | ## Contribution
238 | Before making any contributions, please read the [contribution guide](CONTRIBUTING.md).
239 |
240 | ## License (MIT)
241 | This project is licensed under the [MIT License](LICENSE), which allows very broad use for both academic and commercial purposes.
242 |
243 | ## Contributors ✨
244 |
245 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
246 |
247 |
248 |
249 |
250 |
275 |
276 |
277 |
278 |
279 |
280 |
281 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
282 |
--------------------------------------------------------------------------------
/assets/animate.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/assets/animate.gif
--------------------------------------------------------------------------------
/assets/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/assets/example.gif
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = source
8 | BUILDDIR = build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/documentation.md:
--------------------------------------------------------------------------------
1 | ## Documentation
2 |
3 | The documentation for this project is generated by [Sphinx](https://sphinx-doc.org) and is hosted at [Read the Docs](https://search-engine-parser.readthedocs.io).
4 | On the root of the project, there exists a `docs` directory housing the sphinx configuration and rst files.
5 |
6 | ### Understanding Sphinx
7 |
8 | If you have not made use of Sphinx before, take a look at this explanatory [blog post](https://medium.com/@richdayandnight/a-simple-tutorial-on-how-to-document-your-python-project-using-sphinx-and-rinohtype-177c22a15b5b).
9 |
10 |
11 | ### Documenting an Engine
12 |
13 | Write the appropriate summary, and document the class and every function as follows
14 |
15 | ```python
16 | """@desc
17 | # This is the module documentation
18 | Parser for FakeEngine search results
19 | """
20 |
21 |
22 | class Search(BaseSearch):
23 |     """
24 |     Searches FakeEngine for string
25 |     """
26 |     name = "FakeEngine"
27 |     summary = "\t Here lies the summary for a fake engine"
28 | 
29 |     def fake_function(self, input_1, input_2):
30 |         """
31 |         Describe function here
32 |         :param input_1: describe input 1
33 |         :type input_1: str
34 |         :param input_2: describe input 2
35 |         :type input_2: int
36 |         :return: this is an example return
37 |         :rtype: str
38 |         """
39 | ```
40 |
41 | ### Generating the files
42 |
43 | After including the necessary documentation
44 |
45 | * Go to the root of the project and then
46 |
47 | ```bash
48 | cd docs/
49 | ```
50 |
51 | * Ensure your virtualenv is activated and all the dev requirements listed in [requirements/dev.txt](https://github.com/bisoncorps/search-engine-parser/blob/master/requirements/dev.txt) are installed
52 |
53 | * Run the command
54 |
55 | ```bash
56 | sphinx-apidoc -f -o source/ ../search_engine_parser
57 | ```
58 |
59 | * Write an appropriate commit message
60 |
61 | ```t
62 | Ex. Included documentation for the Yandex Engine
63 | ```
64 |
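* To preview the result locally, build the HTML output with the Sphinx Makefile (a minimal sketch; it assumes the dev requirements above are installed)

```bash
# still inside docs/
make html
# then open build/html/index.html in a browser
```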
--------------------------------------------------------------------------------
/docs/engines.md:
--------------------------------------------------------------------------------
1 | ## Engines
2 |
3 | This document is dedicated to helping developers better understand how to add engines to the search-engine-parser project.
4 |
5 | ### What Search Engines are accepted
6 |
7 | This project was started primarily for general purpose search engines like Google and Bing.
8 | It has since surpassed that and aims to include all useful sites (termed `custom engines`).
9 | These custom engines include sites like YouTube, GitHub, StackOverflow, etc.
10 | Basically, any site that is popular enough to be searched and to return links qualifies.
11 |
12 | ### Skills Needed
13 |
14 | - Python (obviously)
15 | - Sphinx
16 | - Regular Expressions
17 | - Beautiful Soup
18 |
19 | ### Implementing an Engine
20 |
21 | The engine modules are in the [search_engine_parser/core/engines/](https://github.com/bisoncorps/search-engine-parser/blob/master/search_engine_parser/core/engines) directory
22 |
23 | * Create module for the new search engine
24 |
25 | * Create class for the Engine
26 |
27 | * Class should import from the base engine
28 |
29 | * An example for a fake engine is shown below
30 |
31 | ```python
32 | 
33 | # fake.py
34 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
35 | 
36 | 
37 | class Search(BaseSearch):
38 |     # name of the engine to be displayed on the CLI, preferably PascalCase
39 |     name = "FakeEngine"
40 |     # engine url to be searched, with parameters to be formatted e.g. query, page
41 |     search_url = "https://search.fake.com/fake/search"
42 |     # a short 2 or 3 line summary of the engine with some statistics, preferably obtained from Wikipedia
43 |     summary = "\t According to netmarketshare, this site is balderdash among "\
44 |               "search engines with a market share that is close to 100%. "\
45 |               "The fake engine includes many popular features but was solely created to show you an example."
46 | 
47 | 
48 |     # This function should return the dict of params to be passed to the search_url
49 |     def get_params(self, query=None, page=None, offset=None, **kwargs):
50 |         params = {}
51 |         params["q"] = query
52 |         params["page"] = page
53 |         return params
54 | 
55 |     # This function should use Beautiful Soup (combined with regex if necessary)
56 |     # to return all the divs containing results
57 |     def parse_soup(self, soup):
58 |         return soup.find_all('div', class_='fake-result-div')
59 | 
60 |     # This function should parse each result soup to return the title, link and description
61 |     # NOTE: The implementation may not be as straightforward as shown below
62 |     def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
63 |         rdict = SearchItem()
64 |         title_div = single_result.find('div', class_='fake-title')
65 |         title = title_div.text
66 |         link_tag = title_div.find('a')
67 |         link = link_tag.get('href')
68 |         desc_span = single_result.find('span', class_='fake-description')
69 |         desc = desc_span.text
70 |         rdict["titles"] = title
71 |         rdict["links"] = link
72 |         rdict["descriptions"] = desc
73 |         return rdict
74 | 
75 | ```
76 |
77 | * Import the engine by adding it to the following files
78 |
79 | [search_engine_parser/__init__.py](https://github.com/bisoncorps/search-engine-parser/blob/master/search_engine_parser/__init__.py)
80 |
81 | ```python
82 | ...
83 | from search_engine_parser.core.engines.fake import Search as FakeEngineSearch
84 | ```
85 |
86 |
87 | * Make sure to write code documentation by following the [documentation guide](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md#documenting-an-engine)
88 |
89 | * [Generate the RST file](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/documentation.md#generating-the-files)
90 |
91 | * Add Engine to Supported Engines in [supported engines](https://github.com/bisoncorps/search-engine-parser/blob/master/docs/supported_engines.md)
92 |
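* Once all the steps above are done, the new engine can be exercised like any built-in engine (a minimal sketch based on the hypothetical `fake` module above)

```python
from search_engine_parser.core.engines.fake import Search as FakeEngineSearch

engine = FakeEngineSearch()
results = engine.search("hello world", 1)
print(results["titles"], results["links"])
```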
--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
1 | # FAQ
2 |
3 | ## Why do I get `RuntimeError: This event loop is already running` when running in Jupyter Notebook?
4 |
5 | This is a well-known issue with [Jupyter Notebook](https://github.com/jupyter/notebook/issues/5663). Possible solutions:
6 | - try `pip install --upgrade ipykernel ipython`, which upgrades ipykernel to a recent version where the issue is resolved
7 | - or add this to your notebook to allow nested asyncio loops
8 | ```bash
9 | !pip install nest-asyncio
10 | ```
11 |
12 | ```python
13 | import nest_asyncio
14 | nest_asyncio.apply()
15 | ```
16 |
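Alternatively, because the notebook already runs an event loop, you can await the async API directly instead of the blocking `search()` call. A minimal sketch (top-level `await` works in recent IPython/Jupyter versions):

```python
from search_engine_parser.core.engines.google import Search as GoogleSearch

gsearch = GoogleSearch()
# await directly in the notebook cell instead of calling gsearch.search(...)
results = await gsearch.async_search("preaching to the choir", 1)
results["titles"]
```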
17 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file does only contain a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 |
9 | # -- Path setup --------------------------------------------------------------
10 |
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | import os
16 | import sys
17 | sys.path.insert(0, os.path.abspath('../..'))
18 | from search_engine_parser import __version__ as VERSION
19 |
20 | # -- Project information -----------------------------------------------------
21 |
22 | project = 'Search Engine Parser'
23 | copyright = '2019, BisonCorps'
24 | author = 'Diretnan Domnan, Mmadu Manasseh'
25 |
26 | # The short X.Y version
27 | version = ''
28 | # The full version, including alpha/beta/rc tags
29 | release = VERSION
30 |
31 |
32 | # -- General configuration ---------------------------------------------------
33 |
34 | # If your documentation needs a minimal Sphinx version, state it here.
35 | #
36 | # needs_sphinx = '1.0'
37 |
38 | # Add any Sphinx extension module names here, as strings. They can be
39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
40 | # ones.
41 | extensions = [
42 | 'sphinx.ext.autodoc',
43 | 'sphinx.ext.todo',
44 | 'sphinx.ext.viewcode',
45 | 'sphinx.ext.githubpages',
46 | 'm2r',
47 | ]
48 |
49 | # Add any paths that contain templates here, relative to this directory.
50 | templates_path = ['_templates']
51 |
52 | # The suffix(es) of source filenames.
53 | # You can specify multiple suffix as a list of string:
54 | #
55 | # source_suffix = ['.rst', '.md']
56 | source_suffix = ['.rst', '.md']
57 |
58 | # The master toctree document.
59 | master_doc = 'index'
60 |
61 | # The language for content autogenerated by Sphinx. Refer to documentation
62 | # for a list of supported languages.
63 | #
64 | # This is also used if you do content translation via gettext catalogs.
65 | # Usually you set "language" from the command line for these cases.
66 | language = None
67 |
68 | # List of patterns, relative to source directory, that match files and
69 | # directories to ignore when looking for source files.
70 | # This pattern also affects html_static_path and html_extra_path.
71 | exclude_patterns = []
72 |
73 | # The name of the Pygments (syntax highlighting) style to use.
74 | pygments_style = None
75 |
76 |
77 | # -- Options for HTML output -------------------------------------------------
78 |
79 | # The theme to use for HTML and HTML Help pages. See the documentation for
80 | # a list of builtin themes.
81 | #
82 | html_theme = 'sphinx_rtd_theme'
83 |
84 | # Theme options are theme-specific and customize the look and feel of a theme
85 | # further. For a list of options available for each theme, see the
86 | # documentation.
87 | #
88 | # html_theme_options = {}
89 |
90 | # Add any paths that contain custom static files (such as style sheets) here,
91 | # relative to this directory. They are copied after the builtin static files,
92 | # so a file named "default.css" will overwrite the builtin "default.css".
93 | html_static_path = ['_static']
94 |
95 | # Custom sidebar templates, must be a dictionary that maps document names
96 | # to template names.
97 | #
98 | # The default sidebars (for documents that don't match any pattern) are
99 | # defined by theme itself. Builtin themes are using these templates by
100 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
101 | # 'searchbox.html']``.
102 | #
103 | # html_sidebars = {}
104 |
105 |
106 | # -- Options for HTMLHelp output ---------------------------------------------
107 |
108 | # Output file base name for HTML help builder.
109 | htmlhelp_basename = 'SearchEngineParserdoc'
110 |
111 |
112 | # -- Options for LaTeX output ------------------------------------------------
113 |
114 | latex_elements = {
115 | # The paper size ('letterpaper' or 'a4paper').
116 | #
117 | # 'papersize': 'letterpaper',
118 |
119 | # The font size ('10pt', '11pt' or '12pt').
120 | #
121 | # 'pointsize': '10pt',
122 |
123 | # Additional stuff for the LaTeX preamble.
124 | #
125 | # 'preamble': '',
126 |
127 | # Latex figure (float) alignment
128 | #
129 | # 'figure_align': 'htbp',
130 | }
131 |
132 | # Grouping the document tree into LaTeX files. List of tuples
133 | # (source start file, target name, title,
134 | # author, documentclass [howto, manual, or own class]).
135 | latex_documents = [
136 | (master_doc, 'SearchEngineParser.tex', 'Search Engine Parser Documentation',
137 | 'Diretnan Domnan, Mmadu Manasseh', 'manual'),
138 | ]
139 |
140 |
141 | # -- Options for manual page output ------------------------------------------
142 |
143 | # One entry per manual page. List of tuples
144 | # (source start file, name, description, authors, manual section).
145 | man_pages = [
146 | (master_doc, 'searchengineparser', 'Search Engine Parser Documentation',
147 | [author], 1)
148 | ]
149 |
150 |
151 | # -- Options for Texinfo output ----------------------------------------------
152 |
153 | # Grouping the document tree into Texinfo files. List of tuples
154 | # (source start file, target name, title, author,
155 | # dir menu entry, description, category)
156 | texinfo_documents = [
157 | (master_doc, 'SearchEngineParser', 'Search Engine Parser Documentation',
158 | author, 'SearchEngineParser', 'One line description of project.',
159 | 'Miscellaneous'),
160 | ]
161 |
162 |
163 | # -- Options for Epub output -------------------------------------------------
164 |
165 | # Bibliographic Dublin Core info.
166 | epub_title = project
167 |
168 | # The unique identifier of the text. This can be a ISBN number
169 | # or the project homepage.
170 | #
171 | # epub_identifier = ''
172 |
173 | # A unique identification for the text.
174 | #
175 | # epub_uid = ''
176 |
177 | # A list of files that should not be packed into the epub file.
178 | epub_exclude_files = ['search.html']
179 |
180 |
181 | # -- Extension configuration -------------------------------------------------
182 |
183 | # -- Options for todo extension ----------------------------------------------
184 |
185 | # If true, `todo` and `todoList` produce output, else they produce nothing.
186 | todo_include_todos = True
187 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Search Engine Parser documentation master file, created by
2 | sphinx-quickstart on Fri Feb 1 23:05:55 2019.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to Search Engine Parser's documentation!
7 | ================================================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 |
14 | .. mdinclude:: ../../README.md
15 |
16 | Indices and tables
17 | ==================
18 |
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 |
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | search_engine_parser
2 | ====================
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | search_engine_parser
8 |
--------------------------------------------------------------------------------
/docs/source/search_engine_parser.core.engines.rst:
--------------------------------------------------------------------------------
1 | search\_engine\_parser.core.engines package
2 | ===========================================
3 |
4 | Submodules
5 | ----------
6 |
7 | search\_engine\_parser.core.engines.aol module
8 | ----------------------------------------------
9 |
10 | .. automodule:: search_engine_parser.core.engines.aol
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | search\_engine\_parser.core.engines.ask module
16 | ----------------------------------------------
17 |
18 | .. automodule:: search_engine_parser.core.engines.ask
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | search\_engine\_parser.core.engines.baidu module
24 | ------------------------------------------------
25 |
26 | .. automodule:: search_engine_parser.core.engines.baidu
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | search\_engine\_parser.core.engines.bing module
32 | -----------------------------------------------
33 |
34 | .. automodule:: search_engine_parser.core.engines.bing
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | search\_engine\_parser.core.engines.coursera module
40 | ---------------------------------------------------
41 |
42 | .. automodule:: search_engine_parser.core.engines.coursera
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | search\_engine\_parser.core.engines.duckduckgo module
48 | -----------------------------------------------------
49 |
50 | .. automodule:: search_engine_parser.core.engines.duckduckgo
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 | search\_engine\_parser.core.engines.github module
56 | -------------------------------------------------
57 |
58 | .. automodule:: search_engine_parser.core.engines.github
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 |
63 | search\_engine\_parser.core.engines.google module
64 | -------------------------------------------------
65 |
66 | .. automodule:: search_engine_parser.core.engines.google
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
71 | search\_engine\_parser.core.engines.googlescholar module
72 | --------------------------------------------------------
73 |
74 | .. automodule:: search_engine_parser.core.engines.googlescholar
75 | :members:
76 | :undoc-members:
77 | :show-inheritance:
78 |
79 | search\_engine\_parser.core.engines.myanimelist module
80 | ------------------------------------------------------
81 |
82 | .. automodule:: search_engine_parser.core.engines.myanimelist
83 | :members:
84 | :undoc-members:
85 | :show-inheritance:
86 |
87 | search\_engine\_parser.core.engines.stackoverflow module
88 | --------------------------------------------------------
89 |
90 | .. automodule:: search_engine_parser.core.engines.stackoverflow
91 | :members:
92 | :undoc-members:
93 | :show-inheritance:
94 |
95 | search\_engine\_parser.core.engines.yahoo module
96 | ------------------------------------------------
97 |
98 | .. automodule:: search_engine_parser.core.engines.yahoo
99 | :members:
100 | :undoc-members:
101 | :show-inheritance:
102 |
103 | search\_engine\_parser.core.engines.yandex module
104 | -------------------------------------------------
105 |
106 | .. automodule:: search_engine_parser.core.engines.yandex
107 | :members:
108 | :undoc-members:
109 | :show-inheritance:
110 |
111 | search\_engine\_parser.core.engines.youtube module
112 | --------------------------------------------------
113 |
114 | .. automodule:: search_engine_parser.core.engines.youtube
115 | :members:
116 | :undoc-members:
117 | :show-inheritance:
118 |
119 |
120 | Module contents
121 | ---------------
122 |
123 | .. automodule:: search_engine_parser.core.engines
124 | :members:
125 | :undoc-members:
126 | :show-inheritance:
127 |
--------------------------------------------------------------------------------
/docs/source/search_engine_parser.core.rst:
--------------------------------------------------------------------------------
1 | search\_engine\_parser.core package
2 | ===================================
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | search_engine_parser.core.engines
10 |
11 | Submodules
12 | ----------
13 |
14 | search\_engine\_parser.core.base module
15 | ---------------------------------------
16 |
17 | .. automodule:: search_engine_parser.core.base
18 | :members:
19 | :undoc-members:
20 | :show-inheritance:
21 |
22 | search\_engine\_parser.core.cli module
23 | --------------------------------------
24 |
25 | .. automodule:: search_engine_parser.core.cli
26 | :members:
27 | :undoc-members:
28 | :show-inheritance:
29 |
30 | search\_engine\_parser.core.exceptions module
31 | ---------------------------------------------
32 |
33 | .. automodule:: search_engine_parser.core.exceptions
34 | :members:
35 | :undoc-members:
36 | :show-inheritance:
37 |
38 |
39 | Module contents
40 | ---------------
41 |
42 | .. automodule:: search_engine_parser.core
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
--------------------------------------------------------------------------------
/docs/source/search_engine_parser.rst:
--------------------------------------------------------------------------------
1 | search\_engine\_parser package
2 | ==============================
3 |
4 | Subpackages
5 | -----------
6 |
7 | .. toctree::
8 |
9 | search_engine_parser.core
10 | search_engine_parser.tests
11 |
12 | Module contents
13 | ---------------
14 |
15 | .. automodule:: search_engine_parser
16 | :members:
17 | :undoc-members:
18 | :show-inheritance:
19 |
--------------------------------------------------------------------------------
/docs/source/search_engine_parser.tests.rst:
--------------------------------------------------------------------------------
1 | search\_engine\_parser.tests package
2 | ====================================
3 |
4 | Submodules
5 | ----------
6 |
7 | search\_engine\_parser.tests.base module
8 | ----------------------------------------
9 |
10 | .. automodule:: search_engine_parser.tests.base
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | search\_engine\_parser.tests.test\_search module
16 | ------------------------------------------------
17 |
18 | .. automodule:: search_engine_parser.tests.test_search
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 |
24 | Module contents
25 | ---------------
26 |
27 | .. automodule:: search_engine_parser.tests
28 | :members:
29 | :undoc-members:
30 | :show-inheritance:
31 |
--------------------------------------------------------------------------------
/docs/supported_engines.md:
--------------------------------------------------------------------------------
1 | ## Supported Engines
2 |
3 | Below is a list of supported engines and what they return.
4 |
5 |
6 | |No|Engine|Returns|
7 | |------|------|-----|
8 | |1|Google|titles, links, descriptions|
9 | |2|Yahoo|titles, links, descriptions|
10 | |3|Bing|titles, links, descriptions|
11 | |4|DuckDuckGo|titles, links, descriptions|
12 | |5|Baidu|titles, links, descriptions|
13 | |6|Yandex|titles, links, descriptions|
14 | |7|Aol|titles, links, descriptions|
15 | |8|StackOverflow|titles, links, descriptions|
16 | |9|GitHub|titles, links, descriptions, stars, languages|
17 | |10|Ask|titles, links, descriptions|
18 | |11|YouTube|titles, links, descriptions, channels, [single videos only: durations, views, upload_dates]|
19 | |12|MyAnimeList|titles, links, descriptions, number of episodes, type of result (OVA, series, movie, etc.), ratings|
20 | |13|GoogleScholar|titles, links, descriptions, type of results ([BOOK], [CITATION], etc.), links of files|
21 | |14|GoogleNews|titles, links, descriptions, image links, date, news source|
22 | |15|Coursera|titles, links, ratings count, ratings average, partners, difficulties, enrolment numbers|
23 |
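Engine-specific fields are exposed as extra keys on each result item. A hedged sketch using the GitHub engine (the extra key names follow the table above, e.g. `stars` and `languages`):

```python
from search_engine_parser.core.engines.github import Search as GithubSearch

results = GithubSearch().search("search-engine-parser", 1)
first = results[0]
# 'stars' and 'languages' are the additional fields listed for GitHub above
print(first["titles"], first["links"], first["stars"])
```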
--------------------------------------------------------------------------------
/requirements/cli.txt:
--------------------------------------------------------------------------------
1 | blessed >=1.15.0, < 2
2 |
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | -r main.txt
2 | blessed==1.17.9
3 | m2r==0.2.1
4 | parameterized==0.7.4
5 | pylint==2.5.3
6 | pytest==5.4.3
7 | sphinx==3.1.2
8 | sphinx-rtd-theme==0.5.0
9 | vcrpy==4.0.2
10 |
--------------------------------------------------------------------------------
/requirements/main.txt:
--------------------------------------------------------------------------------
1 | lxml >=4.6.5, <5
2 | aiohttp >=3.6.2,<4
3 | beautifulsoup4 >=4.9.1,<5
4 | fake-useragent >=0.1.11, <0.2
5 |
--------------------------------------------------------------------------------
/scripts/docs.sh:
--------------------------------------------------------------------------------
1 | cd ./docs
2 | sphinx-apidoc -f -o source/ ../search_engine_parser
3 | if [ $? -ne 0 ]; then
4 | echo "Failed to run sphinx-apidoc"
5 | exit 1
6 | fi
7 | make html
8 | if [ $? -ne 0 ]; then
9 | echo "Failed to make html"
10 | exit 1
11 | fi
12 | cd ..
13 | git commit -am "make html"
14 | git config --global push.default simple
15 | git config --global user.email "travis@travis-ci.com"
16 | git config --global user.name "Travis CI"
17 |
18 |
19 | #remove existing files except html
20 | shopt -s extglob
21 | rm -r ./!(docs)/
22 |
23 | #copy contents of html to root
24 | cp -R ${TRAVIS_BUILD_DIR}/docs/build/html/. ${TRAVIS_BUILD_DIR}/
25 |
26 | #remove html and accompanying docs
27 | rm -r ./docs
28 | echo "Viewing current files in directory"
29 | ls -lah
30 | # Checkout to gh-pages
31 | git checkout gh-pages
32 | if [ $? -eq 0 ]; then
33 | echo "Checked out to existing gh-pages branch"
34 | else
35 | git checkout -b gh-pages
36 | echo "Creating gh-pages branch"
37 | fi
38 | git add .
39 | git commit -am "rebuilt docs"
40 | git remote add origin-pages https://${GITHUB_TOKEN}@github.com/bisoncorps/search_engine_parser.git
41 | git push -u origin-pages gh-pages --force
42 |
43 | # echo if docs were successfully pushed
44 | if [ $? -eq 0 ]; then
45 | echo "Docs successfully pushed to Github Pages"
46 | else
47 | echo "Failed to push docs"
48 | exit 1
49 | fi
50 |
--------------------------------------------------------------------------------
/scripts/post_deploy_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # get current version
4 | VERSION="$(python setup.py --version)"
5 | echo "${VERSION}"
6 |
7 | # install python package
8 | pip uninstall search-engine-parser -y
9 | pip install search-engine-parser=="${VERSION}"
10 | python -c "import search_engine_parser"
11 |
12 | pip uninstall search-engine-parser -y
13 |
14 | pip install "search-engine-parser[cli]==${VERSION}"
15 |
16 | # run the cli version to get a result
17 | python -m search_engine_parser.core.cli --engine bing --type descriptions "Preaching to the choir"
18 |
19 | # run cli with pysearch
20 | pysearch -e youtube "NoCopyrightSounds"
21 |
22 | if [ $? -eq 0 ]; then
23 | echo "Package works as expected"
24 | else
25 | echo "CLI handler of the package failed to execute"
26 | exit 1
27 | fi
28 |
--------------------------------------------------------------------------------
/scripts/pre_deploy_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # change directory
4 | cd search_engine_parser/
5 |
6 | python tests/__init__.py
7 |
--------------------------------------------------------------------------------
/search_engine_parser/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python template
2 | # Byte-compiled / optimized / DLL files
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | cover/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | .pybuilder/
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | # For a library or package, you might want to ignore these files since the code is
88 | # intended to run in multiple environments; otherwise, check them in:
89 | # .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
135 | # pytype static type analyzer
136 | .pytype/
137 |
138 | # Cython debug symbols
139 | cython_debug/
140 |
280 |
281 | #idea
282 | .idea/*
283 |
--------------------------------------------------------------------------------
/search_engine_parser/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | @author
3 | Domnan Diretnan
4 | Artificial Intelligence Enthusiast & Software Engineer.
5 | Email: diretnandomnan@gmail.com
6 | Github: https://github.com/deven96
7 | GitLab: https://gitlab.com/Deven96
8 |
9 | Mmadu Manasseh
10 | Email: mmadumanasseh@gmail.com
11 | Github: https://github.com/mensaah
12 | GitLab: https://gitlab.com/mensaah
13 |
14 | @project
15 | @create date 2019-02-01 22:15:44
16 | @modify date 2019-02-01 22:15:44
17 |
18 | @license
19 | MIT License
20 | Copyright (c) 2018. Domnan Diretnan. All rights reserved
21 |
22 | """
23 |
24 | # Allow import using `search_engine_parser.engines`
25 | from search_engine_parser.core import engines
26 | # Support for older versions of imports
27 | # DEPRECATION_WARNING: These imports will be removed in later versions
28 | from search_engine_parser.core.engines.aol import Search as AolSearch
29 | from search_engine_parser.core.engines.ask import Search as AskSearch
30 | from search_engine_parser.core.engines.baidu import Search as BaiduSearch
31 | from search_engine_parser.core.engines.bing import Search as BingSearch
32 | from search_engine_parser.core.engines.duckduckgo import \
33 | Search as DuckDuckGoSearch
34 | from search_engine_parser.core.engines.github import Search as GithubSearch
35 | from search_engine_parser.core.engines.google import Search as GoogleSearch
36 | from search_engine_parser.core.engines.googlescholar import \
37 | Search as GoogleScholarSearch
38 | from search_engine_parser.core.engines.stackoverflow import \
39 | Search as StackOverflowSearch
40 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch
41 |
42 | name = "search-engine-parser" # pylint: disable=invalid-name
43 | __version__ = "0.6.3"
44 |
--------------------------------------------------------------------------------
/search_engine_parser/core/__init__.py:
--------------------------------------------------------------------------------
1 | import search_engine_parser.core.engines
2 |
--------------------------------------------------------------------------------
/search_engine_parser/core/base.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Base class inherited by every search engine
3 | """
4 |
5 | import asyncio
6 | import random
7 | from abc import ABCMeta, abstractmethod
8 | from contextlib import suppress
9 | from enum import Enum, unique
10 | from urllib.parse import urlencode, urlparse
11 |
12 | import aiohttp
13 | from bs4 import BeautifulSoup
14 |
15 | from search_engine_parser.core import utils
16 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError
17 |
18 |
19 | @unique
20 | class ReturnType(Enum):
21 | FULL = "full"
22 | TITLE = "titles"
23 | DESCRIPTION = "descriptions"
24 | LINK = "links"
25 |
26 |
27 | # All results returned are each items of search
28 | class SearchItem(dict):
29 | """
30 | SearchItem is a dict of results containing keys (titles, descriptions, links and other
31 |     additional keys depending on the engine)
32 | >>> result
33 |
34 | >>> result["description"]
35 | Some description
36 | >>> result["descriptions"]
37 | Same description
38 | """
39 | def __getitem__(self, value):
40 | """ Allow getting by index and by type ('descriptions', 'links'...)"""
41 | try:
42 | return super().__getitem__(value)
43 | except KeyError:
44 | pass
45 | if not value.endswith('s'):
46 | value += 's'
47 | return super().__getitem__(value)
48 |
49 |
50 | class SearchResult():
51 | """
52 | The SearchResults after the searching
53 |
54 | >>> results = gsearch.search("preaching the choir", 1)
55 | >>> results
56 |
57 |
58 |     The object supports retrieving individual results by iteration or just by type
59 | >>> results[0] # Returns the first result
60 | >>> results["descriptions"] # Returns a list of all descriptions from all results
61 |
62 | It can be iterated like a normal list to return individual SearchItem
63 | """
64 |
65 | def __init__(self):
66 | self.results = []
67 |
68 | def append(self, value):
69 | self.results.append(value)
70 |
71 | def __getitem__(self, value):
72 | """ Allow getting by index and by type ('descriptions', 'links'...)"""
73 | if isinstance(value, int):
74 | return self.results[value]
75 | l = []
76 | for x in self.results:
77 | with suppress(KeyError):
78 | l.append(x[value])
79 | return l
80 |
81 | def keys(self):
82 | keys = {}
83 | with suppress(IndexError):
84 | x = self.results[0]
85 | keys = x.keys()
86 | return keys
87 |
88 | def __len__(self):
89 | return len(self.results)
90 |
91 |     def __repr__(self):
92 |         return "<SearchResult: {} items>".format(len(self.results))
93 |
94 |
95 | class BaseSearch:
96 |
97 | __metaclass__ = ABCMeta
98 |
99 | """
100 | Search base to be extended by search parsers
101 |     Every subclass must implement the two methods `parse_soup` and `parse_single_result`
102 | """
103 | # Summary of engine
104 | summary = None
105 | # Search Engine Name
106 | name = None
107 | # Search Engine unformatted URL
108 | search_url = None
109 | # The url after all query params have been set
110 | _parsed_url = None
111 | # boolean that indicates cache hit or miss
112 | _cache_hit = False
113 |
114 | @abstractmethod
115 | def parse_soup(self, soup):
116 | """
117 | Defines the results contained in a soup
118 | """
119 |         raise NotImplementedError("subclasses must define method parse_soup")
120 |
121 | @abstractmethod
122 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
123 | """
124 | Every div/span containing a result is passed here to retrieve
125 | `title`, `link` and `descr`
126 | """
127 | raise NotImplementedError(
128 |             "subclasses must define method parse_single_result")
129 |
130 | def get_cache_handler(self):
131 | """ Return Cache Handler to use"""
132 |
133 | return utils.CacheHandler()
134 |
135 | @property
136 | def cache_handler(self):
137 | return self.get_cache_handler()
138 |
139 | def parse_result(self, results, **kwargs):
140 | """
141 | Runs every entry on the page through parse_single_result
142 |
143 | :param results: Result of main search to extract individual results
144 | :type results: list[`bs4.element.ResultSet`]
145 |         :returns: SearchResult. Containing lists of titles, links, descriptions and other possible\
146 |             returns.
147 |         :rtype: SearchResult
148 | """
149 | search_results = SearchResult()
150 | for each in results:
151 | rdict = self.parse_single_result(each, **kwargs)
152 | if rdict is not None:
153 | search_results.append(rdict)
154 | return search_results
155 |
156 | def get_params(self, query=None, page=None, offset=None, **kwargs):
157 | """ This function should be overwritten to return a dictionary of query params"""
158 | return {'q': query, 'page': page}
159 |
160 | def headers(self):
161 | headers = {
162 | "Cache-Control": 'no-cache',
163 | "Connection": "keep-alive",
164 | "User-Agent": utils.get_rand_user_agent()
165 | }
166 | return headers
167 |
168 | def clear_cache(self, all_cache=False):
169 | """
170 | Triggers the clear cache function for a particular engine
171 |
172 | :param all_cache: if True, deletes for all engines
173 | """
174 | if all_cache:
175 | return self.cache_handler.clear()
176 | return self.cache_handler.clear(self.name)
177 |
178 | async def get_source(self, url, cache=True, proxy=None, proxy_auth=None):
179 | """
180 | Returns the source code of a webpage.
181 | Also sets the _cache_hit if cache was used
182 |
183 | :rtype: string
184 |         :param url: URL to pull its source code from
185 |         :param proxy: proxy address to make use of
186 | :type proxy: str
187 | :param proxy_auth: (user, password) tuple to authenticate proxy
188 | :type proxy_auth: (str, str)
189 | :return: html source code of a given URL.
190 | """
191 | try:
192 | html, cache_hit = await self.cache_handler.get_source(self.name, url, self.headers(), cache, proxy, proxy_auth)
193 | except Exception as exc:
194 | raise Exception('ERROR: {}\n'.format(exc))
195 | self._cache_hit = cache_hit
196 | return html
197 |
198 | async def get_soup(self, url, cache, proxy, proxy_auth):
199 | """
200 | Get the html soup of a query
201 |         :param url: url to obtain soup from
202 | :type url: str
203 | :param cache: cache request or not
204 | :type cache: bool
205 |         :param proxy: proxy address to make use of
206 | :type proxy: str
207 | :param proxy_auth: (user, password) tuple to authenticate proxy
208 | :type proxy_auth: (str, str)
209 |
210 | :rtype: `bs4.element.ResultSet`
211 | """
212 | html = await self.get_source(url, cache, proxy, proxy_auth)
213 | return BeautifulSoup(html, 'lxml')
214 |
215 | def get_search_url(self, query=None, page=None, **kwargs):
216 | """
217 | Return a formatted search url
218 | """
219 | # Some URLs use offsets
220 | offset = (page * 10) - 9
221 | params = self.get_params(
222 | query=query, page=page, offset=offset, **kwargs)
223 | url = urlparse(self.search_url)
224 | # For localization purposes, custom urls can be parsed for the same engine
225 | # such as google.de and google.com
226 | if kwargs.get("url"):
227 | new_url = urlparse(kwargs.pop("url"))
228 | # When passing url without scheme e.g google.de, url is parsed as path
229 | if not new_url.netloc:
230 | url = url._replace(netloc=new_url.path)
231 | else:
232 | url = url._replace(netloc=new_url.netloc)
233 | self.base_url = url.geturl()
234 | self._parsed_url = url._replace(query=urlencode(params))
235 |
236 | return self._parsed_url.geturl()
237 |
238 | def get_results(self, soup, **kwargs):
239 | """ Get results from soup"""
240 |
241 | search_results = None
242 | results = self.parse_soup(soup)
243 | # TODO Check if empty results is caused by traffic or answers to query
244 | # were not found
245 | if not results:
246 | print("ENGINE FAILURE: {}\n".format(self.name))
247 | raise NoResultsOrTrafficError(
248 | "The result parsing was unsuccessful. It is either your query could not be found"
249 | " or it was flagged as unusual traffic")
250 |
251 | try:
252 | search_results = self.parse_result(results, **kwargs)
253 | # AttributeError occurs as it cannot pass the returned soup
254 | except AttributeError as e:
255 | raise NoResultsOrTrafficError(
256 | "The returned results could not be parsed. This might be due to site updates or "
257 | "server errors. Drop an issue at https://github.com/bisoncorps/search-engine-parser"
258 | " if this persists"
259 | )
260 |
261 | return search_results
262 |
263 | def search(self, query=None, page=1, cache=True, proxy=None, proxy_auth=None, **kwargs):
264 | """
265 | Query the search engine
266 |
267 | :param query: the query to search for
268 | :type query: str
269 | :param page: Page to be displayed, defaults to 1
270 | :type page: int
271 |         :param proxy: proxy address to make use of
272 |         :type proxy: str
273 |         :param proxy_auth: (user, password) tuple to authenticate proxy
274 |         :type proxy_auth: (str, str)
275 |         :return: SearchResult object containing titles, links and descriptions.
276 | """
277 | # Pages can only be from 1-N
278 | if page <= 0:
279 | page = 1
280 | # Get search Page Results
281 | loop = asyncio.get_event_loop()
282 | url = self.get_search_url(
283 | query, page, **kwargs)
284 | soup = loop.run_until_complete(
285 | self.get_soup(url, cache=cache,
286 | proxy=proxy,
287 | proxy_auth=proxy_auth))
288 | return self.get_results(soup, **kwargs)
289 |
290 | async def async_search(self, query=None, page=1, cache=True, proxy=None, proxy_auth=None, **kwargs):
291 | """
292 | Query the search engine but in async mode
293 |
294 | :param query: the query to search for
295 | :type query: str
296 | :param page: Page to be displayed, defaults to 1
297 | :type page: int
298 |         :param proxy: proxy address to make use of
299 |         :type proxy: str
300 |         :param proxy_auth: (user, password) tuple to authenticate proxy
301 |         :type proxy_auth: (str, str)
302 |         :return: SearchResult object containing titles, links and descriptions.
303 | """
304 | # Pages can only be from 1-N
305 |         if page <= 0:
306 | page = 1
307 | soup = await self.get_soup(self.get_search_url(query, page, **kwargs), cache=cache, proxy=proxy, proxy_auth=proxy_auth)
308 | return self.get_results(soup, **kwargs)
309 |
--------------------------------------------------------------------------------
/search_engine_parser/core/cli.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Making use of the parser through cli
3 | """
4 | from __future__ import print_function
5 |
6 | import argparse
7 | import sys
8 | from datetime import datetime
9 | from importlib import import_module
10 |
11 | from blessed import Terminal
12 | from search_engine_parser import __version__
13 | from search_engine_parser.core.base import ReturnType
14 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError
15 |
16 |
17 | def display(results, term, args):
18 | """ Displays search results
19 | """
20 | def print_one(kwargs):
21 | """ Print one result to the console """
22 | # Header
23 | if kwargs.get("titles"):
24 | print("\t{}".format(term.magenta(kwargs.pop("titles"))))
25 | if kwargs.get("links"):
26 | print("\t{}".format(kwargs.pop("links")))
27 | print("\t-----------------------------------------------------")
28 | if kwargs.get("descriptions"):
29 | print(kwargs.pop("descriptions"))
30 | if kwargs.values():
31 | for k, v in kwargs.items():
32 | if v:
33 | print(k.strip(), " : ", v)
34 | print("\n")
35 |
36 | if args.rank and args.rank > 10:
37 | sys.exit(
38 | "Results are only limited to 10, specify a different page number instead")
39 |
40 | if not args.rank:
41 | for i in results:
42 | print_one(i)
43 | else:
44 | rank = args.rank
45 | print_one(results[rank])
46 |
47 |
48 | def get_engine_class(engine):
49 | """ Return the Engine Class """
50 | try:
51 | module = import_module(
52 | "search_engine_parser.core.engines.{}".format(
53 | engine.lower()))
54 | return getattr(module, "Search")
55 | except (ImportError, ModuleNotFoundError):
56 | sys.exit('Engine < {} > does not exist'.format(engine))
57 |
58 |
59 | def show_summary(term, engine_class):
60 | """ Show the summary of an Engine"""
61 | print("\t{}".format(term.magenta(engine_class.name)))
62 | print("\t-----------------------------------------------------")
63 | print(engine_class.summary)
64 |
65 |
66 | def main(args): # pylint: disable=too-many-branches
67 | """
68 | Executes logic from parsed arguments
69 | """
70 | term = Terminal()
71 | engine_class = get_engine_class(args.engine)
72 |
73 | if args.show_summary:
74 | show_summary(term, engine_class)
75 | return
76 |
77 | if not args.query:
78 | print("--show-summary or --query argument must be passed")
79 | sys.exit(1)
80 |
81 | # Initialize search Engine with required params
82 | engine = engine_class()
83 | try:
84 | if args.clear_cache:
85 | engine.clear_cache()
86 | # Display full details: Header, Link, Description
87 | start = datetime.now()
88 | results = engine.search(
89 | args.query, args.page, return_type=ReturnType(args.type), url=args.url, proxy=args.proxy, proxy_auth=(args.proxy_user, args.proxy_password))
90 | duration = datetime.now() - start
91 | display(results, term, args)
92 |         print("Total search took -> %s seconds" % (duration.total_seconds()))
93 | except NoResultsOrTrafficError as exc:
94 | print('\n', '{}'.format(term.red(str(exc))))
95 |
96 |
97 | def create_parser():
98 | """
99 | runner that handles parsing logic
100 | """
101 | parser = argparse.ArgumentParser(description='SearchEngineParser', prog="pysearch")
102 |
103 | parser.add_argument('-V', '--version', action="version", version="%(prog)s v" + __version__)
104 |
105 | parser.add_argument(
106 | '-e', '--engine',
107 |         help='Engine to use for parsing the query e.g. google, yahoo, bing, '
108 | 'duckduckgo (default: google)',
109 | default='google')
110 |
111 | parser.add_argument(
112 | '--show-summary',
113 | action='store_true',
114 | help='Shows the summary of an engine')
115 |
116 | parser.add_argument(
117 | '-u',
118 | '--url',
119 | help='A custom link to use as base url for search e.g google.de')
120 |
121 | parser.add_argument(
122 | '-p',
123 | '--page',
124 | type=int,
125 | help='Page of the result to return details for (default: 1)',
126 | default=1)
127 |
128 | parser.add_argument(
129 | '-t', '--type',
130 |         help='Type of detail to return i.e. full, links, descriptions or titles (default: full)',
131 | default="full")
132 |
133 | parser.add_argument(
134 | '-cc', '--clear-cache',
135 | action='store_true',
136 | help='Clear cache of engine before searching')
137 |
138 | parser.add_argument(
139 | '-r',
140 | '--rank',
141 | type=int,
142 | help='ID of Detail to return e.g 5 (default: 0)')
143 |
144 | parser.add_argument(
145 | '--proxy',
146 | required=False,
147 | help='Proxy address to make use of')
148 |
149 | parser.add_argument(
150 | '--proxy-user',
151 | required='--proxy' in sys.argv,
152 | help='Proxy user to make use of')
153 |
154 | parser.add_argument(
155 | '--proxy-password',
156 | required='--proxy' in sys.argv,
157 | help='Proxy password to make use of')
158 |
159 | parser.add_argument(
160 | 'query', type=str, nargs='?',
161 | help='Query string to search engine for')
162 |
163 | return parser
164 |
165 |
166 | def runner():
167 | parser = create_parser()
168 | args = parser.parse_args(sys.argv[1:])
169 | main(args)
170 |
171 |
172 | if __name__ == '__main__':
173 | runner()
174 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/search_engine_parser/core/engines/__init__.py
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/aol.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for AOL search results
3 | """
4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
5 |
6 |
7 | class Search(BaseSearch):
8 | """
9 | Searches Aol for string
10 | """
11 | name = "AOL"
12 | search_url = "https://search.aol.com/aol/search?"
13 | summary = "\t According to netmarketshare, the old time famous AOL is still in the top 10 "\
14 | "search engines with a market share that is close to 0.06%. "\
15 |         "The AOL network includes many popular web sites like engadget.com, techcrunch.com and "\
16 | "the huffingtonpost.com. \nOn June 23, 2015, AOL was acquired by Verizon Communications."
17 |
18 | def parse_soup(self, soup):
19 | """
20 | Parses AOL for a search query
21 | """
22 | # find all divs
23 | return soup.find_all('div', class_='algo-sr')
24 |
25 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
26 | """
27 | Parses the source code to return
28 |
29 | :param single_result: single result found in
30 | :type single_result: `bs4.element.ResultSet`
31 | :return: parsed title, link and description of single result
32 | :rtype: dict
33 | """
34 | rdict = SearchItem()
35 | h3_tag = single_result.find('h3')
36 | link_tag = h3_tag.find('a')
37 | if return_type in (ReturnType.FULL, return_type.TITLE):
38 | # Get the text and link
39 | rdict["titles"] = link_tag.text
40 |
41 | if return_type in (ReturnType.FULL, ReturnType.LINK):
42 | rdict["links"] = link_tag.get("href")
43 |
44 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
45 | caption = single_result.find('div', class_='compText aAbs')
46 | desc = caption.find('p', class_='lh-16')
47 | rdict["descriptions"] = desc.text
48 |
49 | return rdict
50 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/ask.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for ask search results
3 | """
4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
5 |
6 |
7 | class Search(BaseSearch):
8 | """
9 | Searches Ask for string
10 | """
11 | name = "Ask"
12 |
13 | search_url = "https://www.ask.com/web?"
14 |
15 | summary = "\t Formerly known as Ask Jeeves, Ask.com receives approximately 0.42% of the search"\
16 | " share. ASK is based on a question/answer format where most questions are answered by "\
17 | "other users or are in the form of polls.\nIt also has the general search functionality "\
18 | "but the results returned lack quality compared to Google or even Bing and Yahoo."
19 |
20 | def get_params(self, query=None, page=None, offset=None, **kwargs):
21 | params = {}
22 | params["o"] = 0
23 | params["l"] = "dir"
24 | params["qo"] = "pagination"
25 | params["q"] = query
26 | params["qsrc"] = 998
27 | params["page"] = page
28 | return params
29 |
30 | def parse_soup(self, soup):
31 | """
32 | Parses Ask Search Soup for results
33 | """
34 | # find all class_='PartialSearchResults-item' => each result
35 | return soup.find_all('div', class_="PartialSearchResults-item")
36 |
37 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
38 | """
39 | Parses the source code to return
40 |
41 | :param single_result: single result found in
42 | :type single_result: `bs4.element.ResultSet`
43 | :return: parsed title, link and description of single result
44 | :rtype: str, str, str
45 | """
46 |
47 | rdict = SearchItem()
48 | if return_type in (ReturnType.FULL, return_type.TITLE):
49 | rdict["titles"] = single_result.find('a').text
50 |
51 |         if return_type in (ReturnType.FULL, ReturnType.LINK):
52 | rdict["links"] = single_result.a["href"]
53 |
54 |         if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
55 | rdict["descriptions"] = single_result.find(
56 | 'p', class_="PartialSearchResults-item-abstract").text
57 |
58 | return rdict
59 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/baidu.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for Baidu search results
3 | """
4 |
5 | import re
6 |
7 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
8 |
9 |
10 | class Search(BaseSearch):
11 | """
12 | Searches Baidu for string
13 | """
14 | name = "Baidu"
15 | search_url = "https://www.baidu.com/s?"
16 | summary = "\tBaidu, Inc. is a Chinese multinational technology company specializing in"\
17 | " Internet-related services and products and artificial intelligence (AI), headquartered"\
18 | " in Beijing's Haidian District.\n\tIt is one of the largest AI and internet"\
19 | " companies in the world.\n\tBaidu offers various services, including a"\
20 | " Chinese search engine, as well as a mapping service called Baidu Maps."
21 |
22 | """Override get_search_url"""
23 |
24 | def get_params(self, query=None, page=None, offset=None, **kwargs):
25 | params = {}
26 | params["wd"] = query
27 | params["pn"] = (page - 1) * 10
28 | params["oq"] = query
29 | return params
30 |
31 | def parse_soup(self, soup):
32 | """
33 | Parses Baidu for a search query
34 | """
35 |
36 | # Baidu search can be made deterministic via an id
37 | # Hence, a regex is used to match all eligible ids
38 |
39 | return soup.find_all('div', {'id': re.compile(r"^\d{1,2}")}, class_="c-container")
40 |
41 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
42 | """
43 | Parses the source code to return
44 |
45 | :param single_result: single result found in div with a numeric id
46 | :type single_result: `bs4.element.Tag`
47 | :return: parsed title, link and description of single result
48 | :rtype: dict
49 | """
50 | rdict = SearchItem()
51 | if return_type in (ReturnType.FULL, return_type.TITLE):
52 | h3_tag = single_result.find('h3')
53 |
54 | # sometimes h3 tag is not found
55 | if h3_tag:
56 |                 rdict["titles"] = h3_tag.text
57 |
58 | if return_type in (ReturnType.FULL, ReturnType.LINK):
59 | link_tag = single_result.find('a')
60 | # Get the text and link
61 | rdict["links"] = link_tag.get('href')
62 |
63 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
64 | desc = single_result.find('div', class_='c-abstract')
65 |             rdict["descriptions"] = desc.text if desc else ''
66 | return rdict
67 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/bing.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for Bing search results
3 | """
4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
5 |
6 |
7 | class Search(BaseSearch):
8 | """
9 | Searches Bing for string
10 | """
11 | name = "Bing"
12 | search_url = "https://www.bing.com/search?"
13 | summary = "\tBing is Microsoft’s attempt to challenge Google in search, but despite their "\
14 | "efforts they still did not manage to convince users that their search engine can be"\
15 | " an alternative to Google.\n\tTheir search engine market share is constantly below "\
16 | "10%, even though Bing is the default search engine on Windows PCs."
17 |
18 | def get_params(self, query=None, page=None, offset=None, **kwargs):
19 | params = {}
20 | params["q"] = query
21 | params["offset"] = 0
22 | params["first"] = offset
23 | params["count"] = 10
24 | params["FORM"] = "PERE"
25 | return params
26 |
27 | def parse_soup(self, soup):
28 | """
29 | Parses Bing for a search query.
30 | """
31 | # find all li tags
32 | return soup.find_all('li', class_='b_algo')
33 |
34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
35 | """
36 | Parses the source code to return
37 |
38 |         :param single_result: single result found in the search results page
39 | :type single_result: `bs4.element.ResultSet`
40 | :return: parsed title, link and description of single result
41 | :rtype: dict
42 | """
43 | rdict = SearchItem()
44 | h2_tag = single_result.find('h2')
45 | link_tag = h2_tag.find('a')
46 |
47 | if return_type in (ReturnType.FULL, return_type.TITLE):
48 | rdict["titles"] = link_tag.text
49 |
50 | if return_type in (ReturnType.FULL, return_type.LINK):
51 | link = link_tag.get('href')
52 | rdict["links"] = link
53 |
54 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
55 | caption = single_result.find('div', class_='b_caption')
56 | desc = caption.find('p')
57 | rdict["descriptions"] = desc.text
58 |
59 | return rdict
60 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/coursera.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for coursera search results
3 | """
4 |
5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
6 | from urllib.parse import urljoin
7 |
8 |
9 | class Search(BaseSearch):
10 | """
11 | Searches Coursera for string
12 | """
13 | name = "Coursera"
14 | search_url = "https://www.coursera.org/search?"
15 | summary = "\tCoursera is an American online learning platform founded by Stanford professors Andrew Ng and " \
16 | "Daphne Koller that offers massive open online courses, specializations, and degrees."
17 |
18 | def get_params(self, query=None, page=None, offset=None, **kwargs):
19 | params = {}
20 |         params["query"] = query
21 | params["page"] = page
22 | return params
23 |
24 | def parse_soup(self, soup):
25 | """
26 | Parses Coursera Search Soup for results
27 | """
28 | # find all class_='gs_r gs_or gs_scl' => each result
29 | return soup.find_all('li', class_='ais-InfiniteHits-item')
30 |
31 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
32 | """
33 | Parses the source code to return
34 |
35 |         :param single_result: single result found in the search results page
36 | :type single_result: `bs4.element.ResultSet`
37 | :return: parsed title, link, description, file link, result type of single result
38 | :rtype: dict
39 | """
40 | rdict = SearchItem()
41 |
42 | if return_type in (ReturnType.FULL, return_type.LINK):
43 | link = single_result.find('a', class_='rc-DesktopSearchCard anchor-wrapper').get('href')
44 |
45 | rdict["links"] = urljoin('https://www.coursera.org', link)
46 |
47 | if return_type in (ReturnType.FULL, return_type.TITLE):
48 | title = single_result.find('h2', class_="card-title").text
49 | rdict["titles"] = title
50 |
51 | if return_type in (ReturnType.FULL,):
52 | partner_elem = single_result.find('span', class_='partner-name')
53 | partner = ''
54 | if partner_elem:
55 | partner = partner_elem.text
56 |
57 | rating_avg_elem = single_result.find('span', class_='ratings-text')
58 | rating_avg = None
59 | if rating_avg_elem:
60 | rating_avg = float(rating_avg_elem.text)
61 |
62 | enrollment_elem = single_result.find('span', class_='enrollment-number')
63 | enrolment_number = None
64 |
65 | if enrollment_elem:
66 | enr_cl_txt = enrollment_elem.text.lower().replace(',', '').replace('.', '')\
67 | .replace('m', '0' * 6).replace('k', '0' * 3)
68 | if enr_cl_txt.isdigit():
69 | enrolment_number = int(enr_cl_txt)
70 |
71 | difficulty_elem = single_result.find('span', class_='difficulty')
72 | difficulty = ''
73 | if difficulty_elem:
74 | difficulty = difficulty_elem.text
75 |
76 | rating_count_elem = single_result.find('span', class_='ratings-count')
77 | rating_count = None
78 | if rating_count_elem:
79 | rating_count_elem = rating_count_elem.find('span')
80 | rating_count_cl = rating_count_elem.text.replace(',', '')
81 | if rating_count_cl.isdigit():
82 | rating_count = int(rating_count_cl)
83 |
84 | rdict.update({
85 | "partners": partner,
86 | "ratings_avg": rating_avg,
87 | "ratings_count": rating_count,
88 | "enrolments_numbers": enrolment_number,
89 | "difficulties": difficulty,
90 | })
91 | return rdict
92 |
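
A standalone sketch of the enrolment-text normalisation performed in `parse_single_result` above; the sample strings are hypothetical, not taken from live Coursera markup:

    def normalise_enrolment(text):
        # Strip separators and expand the 'k'/'m' suffixes into zeros,
        # exactly as the engine does above
        cleaned = (text.lower().replace(',', '').replace('.', '')
                   .replace('m', '0' * 6).replace('k', '0' * 3))
        return int(cleaned) if cleaned.isdigit() else None

    print(normalise_enrolment("310k"))     # 310000
    print(normalise_enrolment("25,000"))   # 25000
    print(normalise_enrolment("n/a"))      # None (non-numeric text is ignored)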
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/duckduckgo.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for DuckDuckGo search results
3 | """
4 | import re
5 |
6 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
7 |
8 |
9 | class Search(BaseSearch):
10 | """
11 | Searches DuckDuckGo for string
12 | """
13 | name = "DuckDuckGo"
14 | base_url = "https://www.duckduckgo.com"
15 | search_url = "https://www.duckduckgo.com/html/?"
16 | summary = "\tHas a number of advantages over the other search engines. \n\tIt has a clean "\
17 | "interface, it does not track users, it is not fully loaded with ads and has a number "\
18 | "of very nice features (only one page of results, you can search directly other web "\
19 | "sites etc).\n\tAccording to DuckDuckGo traffic stats [December, 2018], they are "\
20 | "currently serving more than 30 million searches per day."
21 |
22 | def get_params(self, query=None, page=None, offset=None, **kwargs):
23 | params = {}
24 | params["q"] = query
25 | params["s"] = 0 if (page < 2) else (((page-1) * 50) - 20)
26 | params["dc"] = offset
27 | params["o"] = "json"
28 | params["api"] = "d.js"
29 | return params
30 |
31 | def parse_soup(self, soup):
32 | """
33 | Parses DuckDuckGo Search Soup for a query results
34 | """
35 | # find all div tags
36 | return soup.find_all('div', class_='result')
37 |
38 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
39 | """
40 | Parses the source code to return
41 |
42 |         :param single_result: single result found in the search results page
43 | :type single_result: `bs4.element.ResultSet`
44 | :return: parsed title, link and description of single result
45 | :rtype: dict
46 | """
47 |
48 | rdict = SearchItem()
49 |
50 | if return_type in (ReturnType.FULL, return_type.TITLE):
51 | h2 = single_result.find(
52 | 'h2', class_="result__title") # pylint: disable=invalid-name
53 | # Get the text and link
54 | rdict["titles"] = h2.text.strip()
55 |
56 | if return_type in (ReturnType.FULL, ReturnType.LINK):
57 | link = None
58 | link_tag = single_result.find('a', class_="result__a")
59 | if link_tag is not None:
60 | rdict["links"] = link_tag.get('href')
61 | else:
62 | rdict['links'] = None
63 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
64 | desc = single_result.find(class_='result__snippet')
65 | if desc is not None:
66 | rdict["descriptions"] = desc.text
67 | else:
68 | rdict["descriptions"] = ""
69 |         if 'links' in rdict and rdict['links'] is None:
70 | rdict = None
71 |
72 | return rdict
73 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/github.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for GitHub search results
3 | """
4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
5 | from search_engine_parser.core.exceptions import IncorrectKeyWord
6 |
7 |
8 | class Search(BaseSearch):
9 | """
10 | Searches GitHub for string
11 | """
12 | name = "GitHub"
13 | base_url = "https://github.com"
14 | search_url = base_url + "/search?"
15 | summary = "\tGitHub is an American company that provides hosting for software development "\
16 | "version control using Git. It is a subsidiary of Microsoft, which acquired the company "\
17 | "in 2018 for $7.5 billion.\n\tIt offers all of the distributed version control and source"\
18 | " code management (SCM) functionality of Git as well as adding its own features."\
19 | "\n\tAs of May 2019, GitHub reports having over 37 million users and more than 100 million"\
20 | " repositories (including at least 28 million public repositories), making it the largest "\
21 | "host of source code in the world."
22 |
23 | def get_params(self, query=None, page=None, offset=None, **kwargs):
24 | params = {}
25 | params["q"] = query
26 | params["p"] = page
27 | params["type"] = kwargs.get("type_", None)
28 | self.type = params["type"]
29 | return params
30 |
31 | def parse_soup(self, soup):
32 | """
33 | Parses GitHub for a search query.
34 | """
35 | allowed_types = (
36 | None,
37 | "Repositories",
38 | "Wikis",
39 | "Users",
40 | "Topics",
41 | "Marketplace",
42 | "RegistryPackages",
43 | "Issues",
44 | "Commits",
45 | "Code")
46 | if self.type not in allowed_types:
47 | raise IncorrectKeyWord(
48 | "No type <{type_}> exists".format(type_=self.type))
49 | # find all li tags
50 | if self.type in (None, "Repositories"):
51 | return soup.find_all('li', class_='repo-list-item')
52 | elif self.type == "RegistryPackages":
53 | return soup.find_all("div", class_='hx_hit-package')
54 | # find all user divs
55 | elif self.type == "Users":
56 | return soup.find_all('div', class_='user-list-item')
57 | elif self.type == "Wikis":
58 | return soup.find_all('div', class_='hx_hit-wiki')
59 | elif self.type == "Topics":
60 | return soup.find_all('div', class_='topic-list-item')
61 | elif self.type == "Issues":
62 | return soup.find_all('div', class_='issue-list-item')
63 | elif self.type == "Marketplace":
64 | return soup.find_all('div', class_='hx_hit-marketplace')
65 | elif self.type == "Commits":
66 | return soup.find_all('div', class_='commits-list-item')
67 |
68 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
69 | """
70 | Parses the source code to return
71 |
72 | :param single_result: single result found in container element
73 | :type single_result: `bs4.element.ResultSet`
74 | :return: parsed title, link and description of single result
75 | :rtype: dict
76 | """
77 | rdict = SearchItem()
78 | if self.type in (None, "Repositories"):
79 | h3 = single_result.find(
80 | 'div', class_='f4') # pylint: disable=invalid-name
81 | link_tag = h3.find('a')
82 | # Get the text and link
83 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
84 | title = link_tag.text
85 | rdict["titles"] = title
86 |
87 | if return_type in (ReturnType.FULL, ReturnType.LINK):
88 | ref_link = link_tag.get('href')
89 | link = self.base_url + ref_link
90 | rdict["links"] = link
91 |
92 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
93 | desc = single_result.find('p', class_="mb-1")
94 | rdict["descriptions"] = getattr(desc, 'text', '')
95 |
96 | if return_type in (ReturnType.FULL,):
97 | stars_and_lang_div = single_result.find(
98 | 'div', class_='d-flex')
99 | lang = stars_and_lang_div.find(
100 | 'span', itemprop="programmingLanguage")
101 | stars = single_result.find('div', class_='mr-3').find(
102 | 'a')
103 | updated_on = single_result.find("relative-time").get("title")
104 | rdict.update({
105 | "stars": "" if not stars else stars.text.strip(),
106 | "languages": lang.text if lang else "",
107 | "updated_on": updated_on,
108 | })
109 |
110 | if self.type == "Users":
111 | title_tag = single_result.find('div', class_='f4')
112 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
113 | title = title_tag.text
114 | rdict["titles"] = title
115 |
116 | if return_type in (ReturnType.FULL, ReturnType.LINK):
117 | ref_link = title_tag.find('a').get('href')
118 | link = self.base_url + ref_link
119 | rdict["links"] = link
120 |
121 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
122 | desc_tag = single_result.find('p', class_='mb-1')
123 | desc = None
124 | if desc_tag:
125 | desc = desc_tag.text.strip(' \n')
126 | rdict["descriptions"] = desc
127 |
128 | if return_type in (ReturnType.FULL, ):
129 | location_div = single_result.find('div', class_='d-flex')
130 | location_and_email = location_div.find_all(
131 | 'div', class_='mr-3')
132 | location = email = None
133 | for single in location_and_email:
134 |                     if single.get('href') is None:
135 | location = single.text.strip(' \n')
136 | else:
137 | email = single.text
138 |
139 | rdict.update({
140 | "locations": location,
141 | "emails": email,
142 | })
143 |
144 | if self.type == "Wikis":
145 | title_tag = single_result.find('a', class_=None)
146 |
147 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
148 | title = title_tag.get('title')
149 |                 rdict["titles"] = title
150 |
151 | if return_type in (ReturnType.FULL, ReturnType.LINK):
152 | ref_link = title_tag.get('href')
153 | link = self.base_url + ref_link
154 | rdict["links"] = link
155 |
156 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
157 | desc = single_result.find('p', class_="mb1").text
158 | rdict["descriptions"] = desc
159 |
160 | if return_type in (ReturnType.FULL, ):
161 | last_updated = single_result.find(
162 | 'relative-time').get('title')
163 | repository = single_result.find('a', class_='muted-link').text
164 | rdict.update({
165 | "repositories": repository,
166 | "last_updated": last_updated,
167 | })
168 |
169 | if self.type == "Topics":
170 | title_div = single_result.find('div', class_='f4')
171 | title_tag = title_div.find('a', class_=None)
172 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
173 | rdict["titles"] = title_tag.text
174 | if return_type in (ReturnType.FULL, ReturnType.LINK):
175 | ref_link = title_tag.get('href')
176 | link = self.base_url + ref_link
177 | rdict["links"] = link
178 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
179 | desc = None
180 | desc_tag = single_result.find('p', class_=None)
181 | if desc_tag:
182 | desc = desc_tag.text
183 | rdict["descriptions"] = desc
184 |
185 | if self.type == "Marketplace":
186 | title_tag = single_result.find('a', class_='no-underline')
187 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
188 | title = title_tag.get('title')
189 | rdict["titles"] = title_tag.text
190 | if return_type in (ReturnType.FULL, ReturnType.LINK):
191 | link = title_tag.get('href')
192 | rdict["links"] = link
193 |
194 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
195 | desc = None
196 | desc_tag = single_result.find('text-gray')
197 | if desc_tag:
198 | desc = desc_tag.text
199 | rdict["descriptions"] = desc
200 |
201 | if return_type in (ReturnType.FULL, ):
202 | categories = list()
203 | categories_tags = single_result.find_all('a', class_='Label')
204 | if categories_tags:
205 | for i in categories_tags:
206 | categories.append(str(i).strip('\n '))
207 | rdict["categories"] = categories
208 |
209 | if self.type == "RegistryPackages":
210 | title_tag = single_result.find('a', class_='h4')
211 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
212 | title = title_tag.text
213 | rdict["titles"] = title_tag.text
214 |
215 | if return_type in (ReturnType.FULL, ReturnType.LINK):
216 | ref_link = title_tag.get('href')
217 | link = self.base_url + ref_link
218 | rdict["links"] = link
219 |
220 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
221 | desc = single_result.find(
222 | 'p', class_='mb-1').text.strip('\n ')
223 | rdict["descriptions"] = desc
224 |
225 | if self.type == "Issues":
226 | title_tag = single_result.find('a', class_=None)
227 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
228 | title = title_tag.text
229 | rdict["titles"] = title_tag.text
230 |
231 | if return_type in (ReturnType.FULL, ReturnType.LINK):
232 | ref_link = title_tag.get('href')
233 | link = self.base_url + ref_link
234 | rdict["links"] = link
235 |
236 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
237 | desc = single_result.find('p', class_='mb-0').text
238 | rdict["descriptions"] = desc
239 |
240 | if return_type in (ReturnType.FULL, ):
241 | repository = single_result.find(
242 | 'div', class_='ml-1').find('a', 'text-bold').text
243 | opened_by = self.base_url + \
244 | single_result.find(
245 | 'div', class_='mr-3').find('a').get('href')
246 | opened_on = single_result.find('relative-time').get("title")
247 | rdict.update({
248 | "opened_by": opened_by,
249 | "opened_on": opened_on,
250 |                     "repositories": repository,
251 | })
252 |
253 | if self.type == "Commits":
254 | title_p = single_result.find('div', class_="f4")
255 | title_tag = title_p.find('a')
256 |
257 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
258 | title = title_tag.get('aria-label').strip("\n ")
259 | rdict["titles"] = title_tag.text
260 |
261 | if return_type in (ReturnType.FULL, ReturnType.LINK):
262 | ref_link = title_tag.get('href')
263 | if ref_link.startswith("http"):
264 | link = ref_link
265 | else:
266 | link = self.base_url + ref_link
267 | rdict["links"] = link
268 |
269 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
270 | opened_on = None
271 | author = None
272 | if single_result.find('relative-time'):
273 | opened_on = single_result.find(
274 | 'relative-time').get("title")
275 | desc = None
276 | if single_result.find('a', class_='commit-author'):
277 | author_tag = single_result.find(
278 | 'a', class_='commit-author')
279 | author = author_tag.text
280 | div = single_result.find('div', class_='d-flex')
281 | repo = div.find('a').text
282 | desc = "Committed to {}".format(repo)
283 | rdict["descriptions"] = desc
284 | if return_type == ReturnType.FULL:
285 | rdict.update({
286 | "authors": author,
287 | "opened_on": opened_on,
288 | })
289 | return rdict
290 |
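
A usage sketch for the `type_` keyword handled by `get_params` above, assuming the base `search` method forwards extra keyword arguments to `get_params` (the query value is illustrative); an unsupported value raises `IncorrectKeyWord` when `parse_soup` runs:

    from search_engine_parser.core.engines.github import Search as GitHubSearch

    engine = GitHubSearch()
    # Restrict the search to repository results
    results = engine.search("search engine parser", 1, type_="Repositories")
    print(results["titles"][:3])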
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/google.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for google search results
3 | """
4 | import sys
5 | from urllib.parse import (
6 | urljoin,
7 | parse_qs,
8 | unquote
9 | )
10 | import urllib.parse as urlparse
11 |
12 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
13 |
14 |
15 | EXTRA_PARAMS = ('hl', 'tbs')
16 |
17 |
18 | class Search(BaseSearch):
19 | """
20 | Searches Google for string
21 | """
22 | name = "Google"
23 | base_url = "https://www.google.com/"
24 | summary = "\tNo need for further introductions. The search engine giant holds the first "\
25 | "place in search with a stunning difference of 65% from second in place Bing.\n"\
26 | "\tAccording to the latest netmarketshare report (November 2018) 73% of searches "\
27 | "were powered by Google and only 7.91% by Bing.\n\tGoogle is also dominating the "\
28 | "mobile/tablet search engine market share with 81%!"
29 |
30 | def __init__(self):
31 | super().__init__()
32 | self.search_url = urljoin(self.base_url, "search")
33 |
34 | def get_params(self, query=None, offset=None, page=None, **kwargs):
35 | params = {}
36 | params["start"] = (page-1) * 10
37 | params["q"] = query
38 | params["gbv"] = 1
39 | # additional parameters will be considered
40 | for param in EXTRA_PARAMS:
41 | if kwargs.get(param):
42 | params[param] = kwargs[param]
43 | return params
44 |
45 | def parse_url(self, url):
46 | return self.clean_url(urljoin(self.base_url, url))
47 |
48 | def parse_soup(self, soup):
49 | """
50 | Parses Google Search Soup for results
51 | """
52 | # find all class_='g' => each result
53 | return soup.find_all('div', class_="Gx5Zad fP1Qef xpd EtOod pkphOe")
54 |
55 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
56 | """
57 | Parses the source code to return
58 |
59 |         :param single_result: single result found in the search results page
60 | :type single_result: `bs4.element.ResultSet`
61 | :return: parsed title, link and description of single result
62 | :rtype: dict
63 | """
64 |         # Some unneeded details, such as suggestion boxes, should be ignored
65 | if (single_result.find("h2", class_="wITvVb") and single_result.find("div", class_="LKSyXe"))\
66 | or single_result.find("div", class_="X7NTVe"):
67 | return
68 |
69 | results = SearchItem()
70 | els = single_result.find_all('div', class_='kCrYT')
71 | if len(els) < 2:
72 | return
73 |
74 | # First div contains title and url
75 | r_elem = els[0]
76 |
77 | # Get the text and link
78 | if return_type in (ReturnType.FULL, ReturnType.TITLE):
79 | link_tag = r_elem.find('a')
80 | if link_tag:
81 | title = link_tag.find('h3').text
82 | else:
83 | r_elem = els[1]
84 | title = r_elem.find('div', class_='BNeawe').text
85 | results['titles'] = title
86 |
87 | if return_type in (ReturnType.FULL, ReturnType.LINK):
88 | link_tag = r_elem.find('a')
89 | if link_tag:
90 | raw_link = link_tag.get('href')
91 | raw_url = urljoin(self.base_url, raw_link)
92 | results['raw_urls'] = raw_url
93 | results['links'] = self.clean_url(raw_url)
94 |
95 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
96 | # Second Div contains Description
97 | desc_tag = els[1]
98 | if return_type in (ReturnType.FULL, ReturnType.LINK) and not results.get('links'):
99 | link_tag = desc_tag.find('a')
100 | if link_tag:
101 | desc_tag = els[0]
102 | raw_link = link_tag.get('href')
103 | raw_url = urljoin(self.base_url, raw_link)
104 | results['raw_urls'] = raw_url
105 | results['links'] = self.clean_url(raw_url)
106 | desc = desc_tag.text
107 | results['descriptions'] = desc
108 | return results
109 |
110 | def clean_url(self, url):
111 | """
112 | Extract clean URL from the SERP URL.
113 |
114 | >clean_url('https://www.google.com/url?q=https://english.stackexchange.com/questions/140710/what-is-the-opposite-of-preaching-to-the-choir&sa=U&ved=2ahUKEwi31MGyzvnuAhXyyDgGHXXACOYQFnoECAkQAg&usg=AOvVaw1GdXON-JIWGu-dGjHfgljl')
115 | https://english.stackexchange.com/questions/140710/what-is-the-opposite-of-preaching-to-the-choir
116 | """
117 | parsed = urlparse.urlparse(url)
118 | url_qs = parse_qs(parsed.query)
119 | if 'q' in url_qs:
120 | return unquote(url_qs['q'][0])
121 | elif 'url' in url_qs:
122 | return unquote(url_qs['url'][0])
123 | # Add more cases here.
124 | return url
125 |
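
The `EXTRA_PARAMS` hook above lets callers pass Google-specific options such as `hl` through to the query string; a quick sketch of the resulting parameters (values are illustrative):

    from search_engine_parser.core.engines.google import Search as GoogleSearch

    engine = GoogleSearch()
    params = engine.get_params(query="preaching to the choir", page=1, hl="en")
    print(params)  # {'start': 0, 'q': 'preaching to the choir', 'gbv': 1, 'hl': 'en'}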
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/googlenews.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for google news search results
3 | """
4 |
5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
6 |
7 |
8 | class Search(BaseSearch):
9 | """
10 | Searches Google News for string
11 | """
12 | name = "GoogleNews"
13 | search_url = "https://www.google.com/search?"
14 | summary = "\tGoogle News is a news aggregator app developed by Google. It presents a "\
15 | "continuous, customizable flow of articles organized from thousands of publishers "\
16 | "and magazines. Google News is available as an app on Android, iOS, and the Web. "\
17 | "Google released a beta version in September 2002 and the official app in January 2006."
18 |
19 | def get_params(self, query=None, offset=None, page=None, **kwargs):
20 | params = {}
21 | params["num"] = 10
22 | params["start"] = page
23 | params["q"] = query
24 | params["client"] = "ubuntu"
25 | params["tbm"] = "nws"
26 | return params
27 |
28 | def parse_soup(self, soup):
29 | """
30 | Parses Google News Search Soup for results
31 | """
32 | # find all class_='g' => each result
33 | return soup.find_all('div', class_='g')
34 |
35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
36 | """
37 | Parses the source code to return
38 |
39 |         :param single_result: single result found in the search results page
40 | :type single_result: `bs4.element.ResultSet`
41 |         :return: parsed title, link, description, image link, news source, date of single result
42 | :rtype: dict
43 | """
44 | rdict = SearchItem()
45 |
46 | if return_type in (ReturnType.FULL, return_type.TITLE):
47 | title_tag = single_result.find('h3')
48 | title = title_tag.text
49 | rdict["titles"] = title
50 |
51 | if return_type in (ReturnType.FULL, ReturnType.LINK):
52 | link_tag = single_result.find('a')
53 | rdict["links"] = link_tag.get('href')
54 |
55 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
56 | desc_tag = single_result.find('div', class_='st')
57 | rdict["descriptions"] = desc_tag.text
58 |
59 | if return_type in (ReturnType.FULL,):
60 | img_tag = single_result.find('img', class_='th')
61 | news_source_tag = single_result.find('span', class_='e8fRJf')
62 | date_tag = single_result.find('span', class_='f')
63 |
64 | rdict["image_url"] = img_tag.get('src')
65 | rdict["news_source"] = news_source_tag.text
66 | rdict["date"] = date_tag.text
67 | return rdict
68 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/googlescholar.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for google scholar search results
3 | """
4 |
5 | import re
6 |
7 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
8 |
9 |
10 | class Search(BaseSearch):
11 | """
12 | Searches Google Scholar for string
13 | """
14 | name = "GoogleScholar"
15 | search_url = "https://scholar.google.gr/scholar?"
16 | summary = "\tGoogle Scholar is a freely accessible web search engine that indexes the full "\
17 | "text or metadata of scholarly literature across an array of publishing formats and "\
18 | "disciplines."
19 |
20 | def get_params(self, query=None, offset=None, page=None, **kwargs):
21 | params = {}
22 | params["hl"] = "en"
23 | params["start"] = page
24 | params["q"] = query
25 | return params
26 |
27 | def parse_soup(self, soup):
28 | """
29 | Parses Google Scholar Search Soup for results
30 | """
31 | # find all class_='gs_r gs_or gs_scl' => each result
32 | return soup.find_all('div', class_='gs_r gs_or gs_scl')
33 |
34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
35 | """
36 | Parses the source code to return
37 |
38 |         :param single_result: single result found in the search results page
39 | :type single_result: `bs4.element.ResultSet`
40 | :return: parsed title, link, description, file link, result type of single result
41 | :rtype: dict
42 | """
43 | rdict = SearchItem()
44 | r_elem = single_result.find('h3', class_='gs_rt')
45 | if return_type in (ReturnType.FULL, ReturnType.LINK):
46 | link_tag = r_elem.find('a')
47 | if link_tag:
48 | raw_link = link_tag.get('href')
49 | else:
50 | raw_link = ''
51 | rdict["links"] = raw_link
52 |
53 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
54 | desc = single_result.find('div', class_='gs_rs')
55 | if desc:
56 | desc = desc.text
57 | else:
58 | desc = ''
59 | rdict["descriptions"] = desc
60 |
61 | if return_type in (ReturnType.FULL, return_type.TITLE):
62 | title = r_elem.text
63 | title = re.sub(r'^[\[\w+\]]+ ', '', title)
64 | rdict["titles"] = title
65 |
66 | if return_type == ReturnType.FULL:
67 | t_elem = single_result.find('span', class_='gs_ct1')
68 | if t_elem:
69 | result_type = t_elem.text
70 | else:
71 | result_type = ''
72 |
73 | f_elem = single_result.find('div', class_='gs_or_ggsm')
74 | if f_elem:
75 |                 flink_tag = f_elem.find('a')
76 | if flink_tag:
77 | file_link = flink_tag.get('href')
78 | else:
79 | file_link = ''
80 | else:
81 | file_link = ''
82 |
83 | rdict.update({
84 | "result_types": result_type,
85 | "files_links": file_link
86 | })
87 |
88 | return rdict
89 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/myanimelist.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for MyAnimeList search results
3 | """
4 |
5 | import math
6 | import sys
7 |
8 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
9 |
10 |
11 | class Search(BaseSearch):
12 | """
13 | Searches MyAnimeList for string
14 | """
15 | name = "MyAnimeList"
16 |
17 | search_url = "https://myanimelist.net/anime.php?"
18 |     summary = "\tMyAnimeList, often abbreviated as MAL, is an anime and manga social"\
19 |         " networking and social cataloging application website."\
20 |         "\n\tThe site provides its users with a list-like system to organize"\
21 |         " and score anime and manga.\n\tIt facilitates finding users who share"\
22 |         " similar tastes and provides a large database on anime and manga.\n\tThe"\
23 |         " site claims to have 4.4 million anime and 775,000 manga entries."\
24 |         "\n\tIn 2015, the site received over 120 million visitors a month."
25 |
26 | def get_params(self, query=None, page=None, offset=None, **kwargs):
27 | params = {}
28 | params["show"] = (math.ceil(page / 5) - 1) * 50
29 | params["q"] = query
30 | return params
31 |
32 | def parse_soup(self, soup):
33 | """
34 | Parses MyAnimeList for a search query
35 | """
36 |
37 | # The data is stored in table so find all table rows
38 | # The first row is table header
39 | res = soup.find('div', class_='js-categories-seasonal js-block-list list')
40 | if res:
41 | return res.find_all('tr')[1:]
42 |
43 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
44 | """
45 | Parses the source code to return
46 |
47 | :param single_result: single result found in div with a numeric id
48 | :type single_result: `bs4.element.Tag`
49 | :return: parsed title, link and description of single result
50 | :rtype: str, str, str
51 |         :rtype: dict
52 | rdict = SearchItem()
53 | link_tag = single_result.find('a', class_='fw-b')
54 |
55 | if return_type in (ReturnType.FULL, return_type.TITLE):
56 | title = link_tag.find('strong').text
57 | rdict["titles"] = title
58 |
59 | if return_type in (ReturnType.FULL, ReturnType.LINK):
60 | rdict["links"] = link_tag.get('href')
61 |
62 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
63 | desc = single_result.find('div', class_='pt4').text.strip()
64 | rdict["descriptions"] = desc
65 |
66 | if return_type == ReturnType.FULL:
67 | data = list(single_result.find_all('td', class_='ac'))
68 | animetype = data[0].text.strip()
69 | episodes = data[1].text.strip()
70 | score = data[2].text.strip()
71 |
72 | rdict.update({
73 | "episode_count": episodes,
74 | "animetypes": animetype,
75 | "ratings": score
76 | })
77 | return rdict
78 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/stackoverflow.py:
--------------------------------------------------------------------------------
1 | """@desc
2 |     Parser for StackOverflow search results
3 | """
4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
5 |
6 |
7 | class Search(BaseSearch):
8 | """
9 | Searches StackOverflow for string
10 | """
11 | name = "StackOverflow"
12 | base_url = "https://stackoverflow.com"
13 | search_url = base_url + "/search?"
14 | summary = "\tStack Overflow is a question and answer site for professional and enthusiast "\
15 | "programmers.\n\tIt is a privately held website, the flagship site of the Stack "\
16 | "Exchange Network, created in 2008 by Jeff Atwood and Joel Spolsky.\n\tIt features "\
17 | "questions and answers on a wide range of topics in computer programming. It was "\
18 | "created to be a more open alternative to earlier question and answer sites "\
19 | "such as Experts-Exchange"
20 |
21 | def get_params(self, query=None, offset=None, page=None, **kwargs):
22 | params = {}
23 | params["page"] = page
24 | params["q"] = query
25 | params["pagesize"] = 15
26 | return params
27 |
28 | def parse_soup(self, soup):
29 | """
30 | Parses StackOverflow for a search query
31 | """
32 | # find all divs
33 | return soup.find_all('div', class_='summary')
34 |
35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
36 | """
37 | Parses the source code to return
38 |
39 |         :param single_result: single result found in the search results page
40 | :type single_result: `bs4.element.ResultSet`
41 | :return: parsed title, link and description of single result
42 | :rtype: dict
43 | """
44 | rdict = SearchItem()
45 | h3 = single_result.find('h3') # pylint: disable=invalid-name
46 | link_tag = h3.find('a')
47 | if return_type in (ReturnType.FULL, return_type.TITLE):
48 | # Get the text and link
49 | rdict["titles"] = link_tag.text
50 |
51 | if return_type in (ReturnType.FULL, return_type.LINK):
52 | ref_link = link_tag.get('href')
53 | link = self.base_url + ref_link
54 | rdict["links"] = link
55 |
56 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
57 | caption = single_result.find('div', class_='excerpt')
58 | rdict["descriptions"] = caption.text
59 | return rdict
60 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/yahoo.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for Yahoo search results
3 | """
4 | import re
5 |
6 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
7 |
8 |
9 | class Search(BaseSearch):
10 | """
11 | Searches Yahoo for string
12 | """
13 | name = "Yahoo"
14 | search_url = "https://search.yahoo.com/search?"
15 |     summary = "\tYahoo is one of the most popular email providers and holds the fourth place in "\
16 | "search with 3.90% market share.\n\tFrom October 2011 to October 2015, Yahoo search "\
17 | "was powered exclusively by Bing. \n\tSince October 2015 Yahoo agreed with Google to "\
18 | "provide search-related services and since then the results of Yahoo are powered both "\
19 | "by Google and Bing. \n\tYahoo is also the default search engine for Firefox browsers "\
20 | "in the United States (since 2014)."
21 |
22 | def get_params(self, query=None, page=None, offset=None, **kwargs):
23 | params = {}
24 | params["p"] = query
25 | params["b"] = offset
26 | return params
27 |
28 | def parse_soup(self, soup):
29 | """
30 | Parses Yahoo for a search query
31 | """
32 | # find all divs
33 | return soup.find_all('div', class_='Sr')
34 |
35 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
36 | """
37 | Parses the source code to return
38 |
39 |         :param single_result: single result found in the search results page
40 | :type single_result: `bs4.element.ResultSet`
41 | :return: parsed title, link and description of single result
42 | :rtype: dict
43 | """
44 | rdict = SearchItem()
45 | h3_tag = single_result.find('h3', class_='title')
46 |
47 | if return_type in (ReturnType.FULL, return_type.TITLE):
48 | title = h3_tag.text
49 | rdict["titles"] = title
50 |
51 | if return_type in (ReturnType.FULL, ReturnType.LINK):
52 | link_tag = h3_tag.find('a')
53 | raw_link = link_tag.get('href')
54 | re_str = re.findall("/RU=(.+)/RK", raw_link)[0]
55 | re_str = re_str.replace("%3a", ":")
56 | link = re_str.replace("%2f", "/")
57 | rdict["links"] = link
58 |
59 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
60 | desc = single_result.find('span', class_='fc-falcon')
61 | rdict["descriptions"] = desc.text
62 |
63 | return rdict
64 |
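
A worked example of the `/RU=.../RK` extraction used above, with a made-up Yahoo redirect URL:

    import re

    raw_link = "https://r.search.yahoo.com/_ylt=abc/RU=https%3a%2f%2fexample.com%2fpage/RK=2/RS=xyz"
    re_str = re.findall("/RU=(.+)/RK", raw_link)[0]
    link = re_str.replace("%3a", ":").replace("%2f", "/")
    print(link)  # https://example.com/page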
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/yandex.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for Yandex search results
3 | """
4 |
5 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
6 |
7 |
8 | class Search(BaseSearch):
9 | """
10 | Searches Yandex for string
11 | """
12 | name = "Yandex"
13 | search_url = "https://yandex.com/search/?"
14 | summary = "\tYandex is the largest technology company in Russia and the"\
15 | " largest search engine on the internet in Russian"\
16 | ", with a market share of over 52%."\
17 | "\n\tThe Yandex.ru home page is the 4th most popular website in Russia."\
18 | "\n\tIt also has the largest market share of any search engine in the Commonwealth"\
19 | " of Independent States and is the 5th largest search engine worldwide"\
20 | " after Google, Baidu, Bing, and Yahoo!"
21 |
22 | def get_params(self, query=None, page=None, offset=None, **kwargs):
23 | params = {}
24 | params["text"] = query
25 | params["p"] = offset
26 | return params
27 |
28 | def parse_soup(self, soup):
29 | """
30 | Parses Yandex for a search query
31 | """
32 | return soup.find_all('li', class_="serp-item")
33 |
34 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
35 | """
36 | Parses the source code to return
37 |
38 |         :param single_result: single result found in the search results page
39 | :type single_result: `bs4.element.ResultSet`
40 | :return: parsed title, link and description of single result
41 | :rtype: str, str, str
42 |         :rtype: dict
43 | rdict = SearchItem()
44 | h3_tag = single_result.find('div', class_="organic__url-text")
45 |
46 | if return_type in (ReturnType.FULL, return_type.TITLE):
47 | # Get the text and link
48 | title = h3_tag.text
49 | # Handle read more type texts
50 | index = title.find("Read more")
51 | if index >= 0:
52 | title = title[0:int(index)]
53 | rdict["titles"] = title
54 |
55 | if return_type in (ReturnType.FULL, ReturnType.LINK):
56 | link_tag = single_result.find('a')
57 | link = link_tag.get('href')
58 | rdict["links"] = link
59 |
60 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
61 | desc = single_result.find('div', class_="organic__content-wrapper")
62 | desc = desc.text
63 | rdict["descriptions"] = desc
64 | return rdict
65 |
--------------------------------------------------------------------------------
/search_engine_parser/core/engines/youtube.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Parser for YouTube search results
3 | """
4 | from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem
5 |
6 |
7 | class Search(BaseSearch):
8 | """
9 | Searches YouTube for string
10 | """
11 | name = "YouTube"
12 | base_url = "https://youtube.com"
13 | search_url = base_url + "/results?"
14 | summary = "\tYouTube is an American video-sharing website headquartered in San Bruno, "\
15 | "California. Three former PayPal employees—Chad Hurley, Steve Chen, and Jawed "\
16 | "Karim—created the service in February 2005.\n\tGoogle bought the site in November "\
17 | "2006 for US$1.65 billion; YouTube now operates as one of Google's subsidiaries. "\
18 | "As of May 2019, more than 500 hours of video content are uploaded to YouTube every minute"
19 |
20 | def get_params(self, query=None, page=None, offset=None, **kwargs):
21 | params = {}
22 | params["search_query"] = query
23 | return params
24 |
25 | def parse_soup(self, soup):
26 | """
27 | Parses YouTube for a search query.
28 | """
29 |         # find all divs with class 'yt-lockup-content' (one per video result)
30 | return soup.find_all('div', class_='yt-lockup-content')
31 |
32 | def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
33 | """
34 | Parses the source code to return
35 |
36 |         :param single_result: single result found in the search results page
37 | :type single_result: `bs4.element.ResultSet`
38 | :return: parsed title, link and description of single result
39 | :rtype: dict
40 | """
41 | rdict = SearchItem()
42 | # pylint: disable=too-many-locals
43 | title_tag = single_result.find('a', class_='yt-uix-tile-link')
44 | channel_name = ""
45 |
46 | if return_type in (ReturnType.FULL, return_type.TITLE):
47 | # Get the text and link
48 | rdict["titles"] = title_tag.text
49 |
50 | # try for single videos
51 | try:
52 | if return_type in (ReturnType.FULL, ReturnType.LINK):
53 | ref_link = title_tag.get('href')
54 | link = self.base_url + ref_link
55 | rdict["links"] = link
56 |
57 | if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
58 | desc = single_result.find(
59 | 'div', class_="yt-lockup-description").text
60 | rdict["descriptions"] = desc
61 |
62 | if return_type in (ReturnType.FULL, ):
63 | duration = single_result.find(
64 | 'span', class_='accessible-description').text
65 | ul_tag = single_result.find('ul', class_='yt-lockup-meta-info')
66 |
67 | channel_name = single_result.find(
68 | 'a', class_='yt-uix-sessionlink spf-link').text
69 | views_and_upload_date = ul_tag.find_all('li')
70 | upload_date = views_and_upload_date[0].text
71 | views = views_and_upload_date[1].text
72 | rdict.update({
73 | "channels": channel_name,
74 | "durations": duration,
75 | "views": views,
76 | "upload_dates": upload_date,
77 | })
78 | except BaseException: # pylint: disable=broad-except
79 | link_tags = single_result.find_all(
80 | 'a', class_='yt-uix-sessionlink spf-link')
81 | # TODO Optimize calls here so that we don't assign ref_link and channel_name
82 | # when we don't need them
83 | for i in link_tags:
84 | if i.get("href").startswith("/playlist"):
85 | ref_link = i.get("href")
86 | elif i.get("href").startswith("/user"):
87 | channel_name = i.text
88 | if return_type in (ReturnType.FULL, ReturnType.LINK):
89 | link = self.base_url + ref_link
90 | rdict["links"] = link
91 |
92 | if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
93 | desc = single_result.find(
94 | 'span', class_='accessible-description').text
95 | rdict["descriptions"] = desc
96 | if return_type in (ReturnType.FULL,):
97 | rdict.update({
98 | "channels": channel_name,
99 | })
100 | return rdict
101 |
--------------------------------------------------------------------------------
/search_engine_parser/core/exceptions.py:
--------------------------------------------------------------------------------
1 | """@desc
2 | Exceptions
3 | """
4 |
5 |
6 | class NoResultsFound(Exception):
7 | pass
8 |
9 |
10 | class NoResultsOrTrafficError(Exception):
11 | """ When No results is returned or unusual traffic caused app to return empty results """
12 |
13 | class IncorrectKeyWord(Exception):
14 | """ When a wrong keyword argument is passed to the search function """
15 |
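
A sketch of how callers might handle these exceptions (the engine choice and query are illustrative); the test suite catches `NoResultsOrTrafficError` the same way and skips the affected engine:

    from search_engine_parser.core.engines.google import Search as GoogleSearch
    from search_engine_parser.core.exceptions import NoResultsOrTrafficError

    engine = GoogleSearch()
    try:
        results = engine.search("hello world", 1)
    except NoResultsOrTrafficError:
        results = None  # back off and retry later, or fall back to another engine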
--------------------------------------------------------------------------------
/search_engine_parser/core/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import pickle
4 | import hashlib
5 | import aiohttp
6 | from fake_useragent import UserAgent
7 |
8 | FILEPATH = os.path.dirname(os.path.abspath(__file__))
9 |
10 | # prevent caching
11 | USER_AGENT_LIST = [
12 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0",
13 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
14 | "Chrome/72.0.3626.121 Safari/537.36",
15 | "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0",
16 | "Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0",
17 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) "
18 | "Chrome/19.0.1084.46 Safari/536.5",
19 | "Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) "
20 | "Chrome/19.0.1084.46 Safari/536.5",
21 | ]
22 |
23 |
24 | def get_rand_user_agent():
25 | user_agent = random.choice(USER_AGENT_LIST)
26 | try:
27 | user_agent = UserAgent().random
28 |     except Exception:
29 | pass
30 | return user_agent
31 |
32 |
33 |
34 | class CacheHandler:
35 | def __init__(self):
36 | self.cache = os.path.join(FILEPATH, "cache")
37 | engine_path = os.path.join(FILEPATH, "engines")
38 | if not os.path.exists(self.cache):
39 | os.makedirs(self.cache)
40 | enginelist = os.listdir(engine_path)
41 | self.engine_cache = {i[:-3]: os.path.join(self.cache, i[:-3]) for i in enginelist if i not in
42 |                              ("__init__.py",)}
43 | for cache in self.engine_cache.values():
44 | if not os.path.exists(cache):
45 | os.makedirs(cache)
46 |
47 | async def get_source(self, engine, url, headers, cache=True,
48 | proxy=None, proxy_auth=None):
49 | """
50 | Retrieves source code of webpage from internet or from cache
51 |
52 | :rtype: str, bool
53 |         :param engine: name of the engine whose cache directory is used
54 | :type engine: str
55 | :param url: URL to pull source code from
56 | :type url: str
57 | :param headers: request headers to make use of
58 | :type headers: dict
59 | :param cache: use cache or not
60 | :type cache: bool
61 |         :param proxy: proxy address to make use of
62 | :type proxy: str
63 | :param proxy_auth: (user, password) tuple to authenticate proxy
64 | :type proxy_auth: (str, str)
65 | """
66 | encodedUrl = url.encode("utf-8")
67 | urlhash = hashlib.sha256(encodedUrl).hexdigest()
68 | engine = engine.lower()
69 | cache_path = os.path.join(self.engine_cache[engine], urlhash)
70 | if os.path.exists(cache_path) and cache:
71 | with open(cache_path, 'rb') as stream:
72 | return pickle.load(stream), True
73 | get_vars = { 'url':url, 'headers':headers }
74 | if proxy and proxy_auth:
75 | auth = aiohttp.BasicAuth(*proxy_auth)
76 | get_vars.update({'proxy':proxy, 'proxy_auth': auth})
77 |
78 | async with aiohttp.ClientSession() as session:
79 | async with session.get(**get_vars) as resp:
80 | html = await resp.text()
81 | with open(cache_path, 'wb') as stream:
82 | pickle.dump(str(html), stream)
83 | return str(html), False
84 |
85 | def clear(self, engine=None):
86 | """
87 | Clear the entire cache either by engine name
88 | or just all
89 |
90 | :param engine: engine to clear
91 | """
92 | if not engine:
93 | for engine_cache in self.engine_cache.values():
94 | for root, dirs, files in os.walk(engine_cache):
95 | for f in files:
96 | os.remove(os.path.join(engine_cache, f))
97 | else:
98 | engine_cache = self.engine_cache[engine.lower()]
99 | for _, _, files in os.walk(engine_cache):
100 | for f in files:
101 | os.remove(os.path.join(engine_cache, f))
102 |
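
A minimal sketch of driving `CacheHandler.get_source` directly with asyncio; the URL and headers are illustrative, and the engine name must match one of the modules in `core/engines`:

    import asyncio

    from search_engine_parser.core.utils import CacheHandler, get_rand_user_agent

    async def fetch():
        handler = CacheHandler()
        headers = {"User-Agent": get_rand_user_agent()}
        # Returns the page source plus a flag telling whether the cache was hit
        html, cache_hit = await handler.get_source(
            "google", "https://www.google.com/search?q=hello", headers)
        return cache_hit

    print(asyncio.run(fetch()))  # False on first run, True once the page is cached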
--------------------------------------------------------------------------------
/search_engine_parser/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bisohns/search-engine-parser/0c2f4bde7dd21c10e64c9204417d9a228e96c187/search_engine_parser/tests/__init__.py
--------------------------------------------------------------------------------
/search_engine_parser/tests/test_base.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from importlib import import_module
4 | from urllib.parse import urlparse
5 | from unittest.mock import patch, MagicMock
6 | import vcr
7 | from parameterized import parameterized_class
8 |
9 | from search_engine_parser.core.exceptions import NoResultsOrTrafficError
10 |
11 | SEARCH_ARGS = ('Hello', 1)
12 |
13 |
14 | def get_engines():
15 | """ Returns a list of all engines for tests """
16 | engines = []
17 |
18 | base_dir = os.getcwd()
19 | engines_dir = os.path.join(base_dir, 'search_engine_parser', 'core', 'engines')
20 |
21 | for filename in os.listdir(engines_dir):
22 | if os.path.isfile(os.path.join(engines_dir, filename)) and filename.endswith('.py') \
23 | and filename != '__init__.py':
24 | engine = filename.split('.py')[0]
25 | module = import_module("search_engine_parser.core.engines.{}".format(engine.lower()))
26 | engine_class = getattr(module, "Search")
27 | engines.append([engine, engine_class(),])
28 | return engines
29 |
30 |
31 | def validate_url(url):
32 | """ Checks if a url is valid
33 | urls must contain scheme, netloc and path
34 | """
35 | try:
36 | result = urlparse(url)
37 | return all([result.scheme, result.netloc, result.path])
38 | except BaseException: # pylint: disable=broad-except
39 | print("URL: %s\n" % url)
40 | return False
41 |
42 |
43 | # pylint: disable=no-member
44 | class EngineBaseTest(unittest.TestCase):
45 | """ Testbase for Engines
46 |
47 | provides tests for engine methods
48 | """
49 |
50 | def setUp(self):
51 | from search_engine_parser.core.engines.google import Search # pylint: disable=import-outside-toplevel
52 | self.engine = Search()
53 |
54 | @patch('search_engine_parser.core.engines.google.Search.get_results')
55 | @patch('search_engine_parser.core.engines.google.Search.get_soup')
56 |     async def test_urls(self, get_soup_mock, get_results_mock):
57 | """ Test that url updates work fine """
58 | await self.engine.search(query="hello", url="google.com.tr")
59 | first_url = self.engine._parsed_url.geturl()
60 | self.assertTrue(validate_url(first_url))
61 |
62 | self.engine.search(query="World", url="https://google.com.tr")
63 | second_url = self.engine._parsed_url.geturl()
64 | self.assertTrue(validate_url(second_url))
65 |
66 | self.assertNotEqual(second_url, first_url)
67 |
68 | # Test for https://github.com/bisoncorps/search-engine-parser/issues/92
69 | def test_two_queries_different_results(self):
70 | """ Test that url updates work fine """
71 | from search_engine_parser.core.engines.google import Search as GoogleSearch # pylint: disable=import-outside-toplevel
72 | from search_engine_parser.core.engines.yahoo import Search as YahooSearch # pylint: disable=import-outside-toplevel
73 | gengine = GoogleSearch()
74 | yahoo_engine = YahooSearch()
75 | gresults = None
76 |         yresults = None
77 | with vcr.use_cassette('fixtures/google-test-diff-synopsis.yaml', record_mode='once'):
78 | gresults = gengine.search(query="What's up from this side")
79 | with vcr.use_cassette('fixtures/yahoo-test-diff-synopsis.yaml', record_mode='once'):
80 | yresults = yahoo_engine.search(query="this is example Bob")
81 | for key in gresults[0]:
82 | self.assertNotEqual(gresults[0].get(key, "GSearch"), yresults[0].get(key, "Ysearch"))
83 |
84 | self.assertNotEqual(gresults, yresults)
85 |
86 | # pylint: disable=no-member
87 | @parameterized_class(('name', 'engine'), get_engines())
88 | class TestScraping(unittest.TestCase):
89 | """ Testbase for Engines
90 |
91 | provides tests for titles, description and return urls
92 | """
93 | engine_class = None
94 |
95 | @classmethod
96 | def setUpClass(cls):
97 | super().setUpClass()
98 |
99 | try:
100 | cls.vcr_search(*SEARCH_ARGS)
101 | except NoResultsOrTrafficError:
102 | raise unittest.SkipTest(
103 | '{} failed due to traffic'.format(
104 | cls.engine))
105 |
106 | @classmethod
107 | def vcr_search(cls, *args, **kwargs):
108 | print(cls.name)
109 |         with vcr.use_cassette('fixtures/{}-{}-synopsis.yaml'.format(cls.name, args[0].replace(" ", "-")), record_mode="once"):
110 | cls.results = cls.engine.search(*args, **kwargs)
111 |
112 | @classmethod
113 | def test_cache_used(cls):
114 | """
115 | Test that the cache was used
116 | """
117 | try:
118 | cls.vcr_search(*SEARCH_ARGS, cache=True)
119 | if cls.engine._cache_hit == False:
120 | assert False, "{} cache - unexpected miss".format(
121 | cls.engine.name)
122 | except NoResultsOrTrafficError:
123 | raise unittest.SkipTest(
124 | '{} failed due to traffic'.format(
125 | cls.engine))
126 |
127 | @classmethod
128 | def test_cache_not_used(cls):
129 | """
130 |         Test that the cache was not used
131 | """
132 | try:
133 | cls.vcr_search(*SEARCH_ARGS, cache=False)
134 | if cls.engine._cache_hit == True:
135 | assert False, "{} cache - unexpected hit".format(
136 | cls.engine.name)
137 | except NoResultsOrTrafficError:
138 | raise unittest.SkipTest(
139 | '{} failed due to traffic'.format(
140 | cls.engine))
141 |
142 | @classmethod
143 | def test_cache_bypassed(cls):
144 | """
145 | Test that cache was bypassed
146 | """
147 | # wrongly set cls.engine._cache_hit
148 | cls.engine._cache_hit = True
149 | try:
150 | cls.vcr_search(*SEARCH_ARGS, cache=False)
151 | if cls.engine._cache_hit == True:
152 | assert False, "{} cache - not bypassed".format(
153 | cls.engine.name)
154 | except NoResultsOrTrafficError:
155 | raise unittest.SkipTest(
156 | '{} failed due to traffic'.format(
157 | cls.engine))
158 |
159 | def test_search_urls(self):
160 | """
161 | Test that the search urls generated are valid
162 | """
163 | self.assertTrue(validate_url(self.engine._parsed_url.geturl()))
164 |
165 | def test_returned_results(self):
166 | """
167 |         Test that the returned results have valid data. 4 is just a chosen floor, as most
168 |         search engines return more results than that
169 | """
170 | self.assertTrue(len(self.results['titles']) >= 4)
171 | self.assertTrue(len(self.results['links']) >= 4)
172 | # coursera does not return descriptions for
173 | # Preaching to the choir
174 | if not self.engine.name.lower() == "coursera":
175 | self.assertTrue(len(self.results['descriptions']) >= 4)
176 | else:
177 | self.assertTrue(len(self.results["difficulties"]) >= 4)
178 |
179 | def test_links(self):
180 | for link in self.results['links']:
181 | print("{}:::::{}".format(self.name, link))
182 | # Sometimes googlescholar returns empty links for citation type results
183 | if not link and self.name.lower() == "googlescholar":
184 | continue
185 | self.assertTrue(validate_url(link))
186 |
187 | def test_results_length_are_the_same(self):
188 | """ Tests if returned result items are equal.
189 | :param args: a list/tuple of other keys returned
190 | """
191 | # Different engines have different keys which may be returned or not returned
192 | # So if all keys are not the same length check that the titles and links length are
193 | # the same
194 | default_keys = ["titles", "links"]
195 | default_keys_set = set(map(lambda x: len(self.results[x]), default_keys))
196 |
197 | items = self.results.keys()
198 | items_set = set(map(lambda x: len(self.results[x]), items))
199 |
200 | self.assertTrue(len(items_set) == 1 or len(default_keys_set) == 1)
201 |
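
The helpers above can also be reused outside the test runner; a small sketch (run from the repository root, since `get_engines` resolves paths relative to the working directory, and it requires the dev test dependencies to import):

    from search_engine_parser.tests.test_base import get_engines, validate_url

    print(validate_url("https://www.google.com/search"))   # True
    print([name for name, _ in get_engines()])             # one entry per engine module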
--------------------------------------------------------------------------------
/search_engine_parser/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from unittest.mock import patch, MagicMock
4 |
5 | from search_engine_parser.core import cli
6 |
7 | engine_class_mock = MagicMock()
8 | engine_class_mock.name = "Random Engine Name"
9 | engine_class_mock.clear_cache = MagicMock()
10 | engine_class_mock.search = MagicMock()
11 |
12 | class CliTests(unittest.TestCase):
13 |
14 | def setUp(self):
15 | self.parser = cli.create_parser()
16 |
17 | def test_show_summary(self):
18 | args = self.parser.parse_args(["-e", "google", "--show-summary"])
19 | # If it executes properly it should return None
20 | self.assertTrue(cli.main(args) is None)
21 |
22 | @patch('search_engine_parser.core.cli.get_engine_class', return_value=engine_class_mock)
23 | def test_query(self, engine_class):
24 | args = self.parser.parse_args(["-e", "google", "Preach"])
25 | # If it executes properly it should return None
26 | self.assertTrue(cli.main(args) is None)
27 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import re
2 | import setuptools
3 |
4 | REQUIRED_PYTHON = (3, 5)
5 |
6 | # Load requirements
7 | REQUIREMENTS = 'requirements/main.txt'
8 | CLI_REQUIREMENTS = 'requirements/cli.txt'
9 | REQUIREMENTS = [line.strip('\n') for line in open(REQUIREMENTS).readlines()]
10 | CLI_REQUIREMENTS = [line.strip('\n') for line in open(CLI_REQUIREMENTS).readlines()]
11 |
12 | with open("README.md", "r", encoding="utf8") as fh:
13 | LONG_DESCRIPTION = fh.read()
14 |
15 | # Trying to load version directly from `search-engine-parser` module attempts
16 | # to load __init__.py which will try to load other libraries not yet installed
17 | with open("search_engine_parser/__init__.py", "rt", encoding="utf8") as f:
18 | VERSION = re.search(r'__version__ = "(.*?)"', f.read(), re.M).group(1)
19 |
20 | setuptools.setup(
21 | name="search-engine-parser",
22 | version=VERSION,
23 | author='Domnan Diretnan, Mmadu Manasseh',
24 | author_email="diretnandomnan@gmail.com",
25 | description="scrapes search engine pages for query titles, descriptions and links",
26 | url="https://github.com/bisoncorps/search-engine-parser",
27 | project_urls={
28 | "Documentation":"https://search-engine-parser.readthedocs.io/en/latest",
29 | "Source": "https://github.com/bisoncorps/search-engine-parser",
30 | },
31 | packages=setuptools.find_packages(),
32 | install_requires=REQUIREMENTS,
33 | long_description=LONG_DESCRIPTION,
34 | long_description_content_type="text/markdown",
35 | license="MIT",
36 | keywords='\
37 | search-engine \
38 | search \
39 | parser \
40 | google \
41 | yahoo \
42 | bing \
43 | yandex \
44 | stackoverflow \
45 | github \
46 | baidu ',
47 | entry_points={'console_scripts': [
48 | 'pysearch=search_engine_parser.core.cli:runner'
49 | ]},
50 | classifiers=[
51 | "Programming Language :: Python :: 3",
52 | "License :: OSI Approved :: MIT License",
53 | "Operating System :: OS Independent",
54 | ],
55 | package_data={
56 | '': ['*.*'],
57 | 'requirements': ['*.*'],
58 | },
59 | include_package_data=True,
60 | extras_require={
61 | 'cli': CLI_REQUIREMENTS
62 | },
63 | python_requires='>={}.{}'.format(*REQUIRED_PYTHON),
64 | )
65 |
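
The `pysearch` console script declared above maps onto `search_engine_parser.core.cli:runner`; a programmatic equivalent, with argument values borrowed from the CLI tests, looks roughly like this:

    from search_engine_parser.core import cli

    parser = cli.create_parser()
    args = parser.parse_args(["-e", "google", "--show-summary"])
    cli.main(args)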
--------------------------------------------------------------------------------