├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       ├── heroku-deploy.yml
│       ├── pip-upload.yml
│       └── python-test.yml
├── .gitignore
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── bin
│   └── websearch
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   └── test.py
└── websearch
    ├── __init__.py
    ├── __main__.py
    ├── extension.json
    └── script.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: gaetan1903
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior
15 |
16 | **Expected behavior**
17 | A clear and concise description of what you expected to happen.
18 |
19 | **Screenshots**
20 | If applicable, add screenshots to help explain your problem.
21 |
22 | **Desktop (please complete the following information):**
23 | - OS: [e.g. Linux, Windows]
24 | - Python Version [e.g. 3.6, 3.9]
25 | - Version [e.g. 22]
26 |
27 |
28 | **Additional context**
29 | Add any other context about the problem here.
30 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[Features]"
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/workflows/heroku-deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy on Heroku [CD]
2 |
3 | on:
4 | release:
5 | types:
6 | - created
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v2
13 | - uses: akhileshns/heroku-deploy@v3.12.12
14 | with:
15 | heroku_api_key: ${{secrets.HEROKU_API_KEY}}
16 | heroku_app_name: "websearch-python"
17 | heroku_email: "gaetan.s118@gmail.com"
18 | usedocker: true
19 |
--------------------------------------------------------------------------------
/.github/workflows/pip-upload.yml:
--------------------------------------------------------------------------------
1 | name: CD Publish
2 | on:
3 | release:
4 |     types:
5 | - created
6 | jobs:
7 | publish:
8 | runs-on: ubuntu-latest
9 | steps:
10 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
11 | - uses: actions/checkout@v2
12 |
13 | # Sets up python
14 | - uses: actions/setup-python@v2
15 | with:
16 | python-version: 3.8
17 |
18 | # Install dependencies
19 | - name: "Installs dependencies"
20 | run: |
21 | python3 -m pip install --upgrade pip
22 | python3 -m pip install setuptools wheel twine
23 |
24 | # Build and upload to PyPI
25 | - name: "Builds and uploads to PyPI"
26 | run: |
27 | python3 setup.py sdist bdist_wheel
28 | python3 -m twine upload dist/*
29 | env:
30 | TWINE_USERNAME: __token__
31 | TWINE_PASSWORD: ${{ secrets.PIP_TOKEN }}
32 |
33 |
--------------------------------------------------------------------------------
/.github/workflows/python-test.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: CI Python Test
5 |
6 | on:
7 | push:
8 | branches: [ main ]
9 | pull_request:
10 | branches: [ main ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | - uses: actions/checkout@v2
19 | - name: Set up Python 3.8
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: 3.8
23 | - name: Install dependencies
24 | run: |
25 | python -m pip install --upgrade pip
26 | pip install flake8
27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28 | - name: Lint with flake8
29 | run: |
30 | # stop the build if there are Python syntax errors or undefined names
31 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
33 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
34 | - name: Run UnitTest
35 | run: |
36 | python -m unittest -v
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | env/
132 |
133 | .vscode/
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | **To contribute to this project**, make sure you update these 3 files:
2 | - the code file
3 | - the test file
4 | - the doc file (README)
5 |
6 | The steps are therefore:
7 | - Add your contribution
8 | - Write the unit test following the model already present
9 | - Update the README by specifying the new functionality
10 | - Open your PULL REQUEST
11 |
12 |
13 | Note: these links can help when adding a new extension
14 | - `https://developer.mozilla.org/fr/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types`
15 | - `https://support.google.com/webmasters/answer/35287`
16 |
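As an illustration, here is a minimal, hedged sketch of what "adding a new extension" can look like (the `epub` extension below is hypothetical and not part of the project): add its MIME type to `websearch/extension.json`, then exercise it through `custom()`:

```python
from websearch import WebSearch

# Assumes the entry "epub": "application/epub+zip" has been added to
# websearch/extension.json; custom() looks the MIME type up automatically
# when it is not passed explicitly.
web = WebSearch('programming')
ebooks = web.custom('epub')
for link in ebooks[:3]:
    print(link)
```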
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-buster
2 |
3 | RUN pip install --no-cache-dir websearch-python
4 |
5 | CMD ["sh", "-c", "websearch --host 0.0.0.0 --port $PORT"]
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | « Copyright © 2021, iTeam-$ Community
2 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”),
3 | to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
4 | and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | The Software is provided “as is”, without warranty of any kind, express or implied, including but not limited to the warranties of merchantability,
9 | fitness for a particular purpose and noninfringement. In no event shall the authors or copyright holders X be liable for any claim, damages or other liability,
10 | whether in an action of contract, tort or otherwise, arising from, out of or in connection with the software or the use or other dealings in the Software.
11 |
12 | Except as contained in this notice, the name of the iTeam-$ Community shall not be used in advertising or otherwise to promote the sale,
13 | use or other dealings in this Software without prior written authorization from the iTeam-$ Community Manager. »
14 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.json
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WebSearch
2 |
3 |
4 |
5 | > Python module allowing you to do various searches for links on the Web.
6 |
7 |
8 | [](https://github.com/iTeam-S/WebSearch/actions/workflows/python-test.yml)
9 | [](https://github.com/iTeam-S/WebSearch/actions/workflows/pip-upload.yml)
10 |
11 | [](https://pypi.org/project/websearch-python/)
12 | [](https://pypi.org/project/websearch-python/)
13 |
14 |
15 |
16 | ## Installation
17 |
18 | ```s
19 | pip3 install websearch-python
20 | ```
21 | **OR** you can install the dev version
22 | ```s
23 | pip3 install https://github.com/iTeam-S/WebSearch/archive/refs/heads/main.zip
24 | ```
25 |
26 | ## Use
27 |
28 | ### Quick Start as Module
29 |
30 | ```python
31 | from websearch import WebSearch as web
32 | for page in web('iTeam-$').pages[:2]:
33 | print(page)
34 | ```
35 |
36 | ```
37 | [RESULTS]
38 |
39 | https://iteam-s.mg/
40 | https://github.com/iTeam-S
41 | ```
42 |
43 |
44 | ### Quick Start as Webserver
45 |
46 | ```s
47 | # run webserver
48 | websearch --host 0.0.0.0 --port 7845
49 | ```
50 |
51 | **OR**
52 |
53 | ```s
54 | # run webserver
55 | python -m websearch --host 0.0.0.0 --port 7845
56 | ```
57 |
58 | ```s
59 | # requests contents
60 | curl http://0.0.0.0:7845/pages/botoravony+arleme
61 | ```
62 |
63 | ```json
64 | [
65 | "https://portfolio.iteam-s.mg/?id=2",
66 | "https://portfolio.iteam-s.mg/libs/cv/arleme.pdf",
67 | "https://madagascar.webcup.fr/team-webcup/iteams"
68 | ]
69 | ```
70 |
71 | ### Use Deployed Version
72 | ```s
73 | curl https://websearch-python.herokuapp.com/pages/botoravony+arleme
74 | ```
75 |
76 | __________________________
77 |
78 |
79 | FULL DOCUMENTATION
80 |
81 | ### Initialization
82 |
83 | ```python
84 | from websearch import WebSearch
85 | web = WebSearch('Gaetan Jonathan BAKARY')
86 | ```
87 | You can pass a `list` for multiple keywords.
88 |
89 | ```python
90 | web = WebSearch(['Gaetan Jonathan BAKARY', 'iTeam-S'])
91 | ```
92 | You can also restrict the search to a specific website with the `site` parameter.
93 |
94 | ```python
95 | web = WebSearch('Gaetan Jonathan', site='iteam-s.mg')
96 | ```
97 |
98 |
99 | ### Webpages results
100 |
101 | ```python
102 | from websearch import WebSearch
103 | web = WebSearch('Gaetan Jonathan BAKARY')
104 | webpages = web.pages
105 | for wp in webpages[:5]:
106 | print(wp)
107 | ```
108 |
109 | ```
110 | [RESULTS]
111 |
112 | https://mg.linkedin.com/in/gaetanj
113 | https://portfolio.iteam-s.mg/?u=gaetan
114 | https://github.com/gaetan1903
115 | https://medium.com/@gaetan1903
116 | https://gitlab.com/gaetan1903
117 | ```
118 |
119 |
120 | ### Images results
121 |
122 | ```python
123 | from websearch import WebSearch
124 | web = WebSearch('Gaetan Jonathan BAKARY')
125 | webimages = web.images
126 | for im in webimages[:5]:
127 | print(im)
128 | ```
129 |
130 | ```
131 | [RESULTS]
132 |
133 | https://tse3.mm.bing.net/th?id=OIP.-K25y8TqkOi9UG_40Ti8bgAAAA
134 | https://tse1.mm.bing.net/th?id=OIP.yJPVcDx6znFSOewLdQBbHgHaJA
135 | https://tse3.mm.bing.net/th?id=OIP.7rO2T_nDAS0bXm4tQ4LKQAHaJA
136 | https://tse2.mm.bing.net/th?id=OIP.IUIEkGQVzYRKaDA7WeeV7QHaEF
137 | https://tse3.explicit.bing.net/th?id=OIP.OmvVnMIVu2ZdNZHZzJK_hgAAAA
138 | ```
139 |
140 |
141 | ### PDF results
142 |
143 | ```python
144 | from websearch import WebSearch
145 | web = WebSearch('Math 220')
146 | pdfs = web.pdf
147 | for pdf in pdfs[:5]:
148 | print(pdf)
149 | ```
150 |
151 | ```
152 | [RESULTS]
153 |
154 | https://www.coconino.edu/resources/files/pdfs/registration/curriculum/course-outlines/m/mat/mat_220.pdf
155 | https://www.jmu.edu/mathstat/Files/ALEKSmatrix.pdf
156 | https://www.jjc.edu/sites/default/files/Academics/Math/M220%20Master%20Syllabus%20SP18.pdf
157 | https://www.sonoma.edu/sites/www/files/2018-19cat-11math.pdf
158 | https://www.svsd.net/cms/lib5/PA01001234/Centricity/Domain/1009/3.3-3.3B-Practice-KEY.pdf
159 | ```
160 |
161 | To skip format verification when searching for attachments, set `verif=False` (it is `True` by default).
162 |
163 | Format verification is presented [here](https://github.com/iTeam-S/WebSearch/pull/4)
164 |
165 | ```python
166 | from websearch import WebSearch
167 | web = WebSearch('Math 220', verif=False)
168 | ```
169 |
170 |
171 | ### DOCX results
172 | ```python
173 | from websearch import WebSearch
174 | web = WebSearch('python')
175 | words = web.docx
176 | for word in words[:3]:
177 | print(word)
178 | ```
179 |
180 | ```
181 | [RESULTS]
182 |
183 | https://www.ocr.org.uk/Images/572953-j277-programming-techniques-python.docx
184 | https://www.niu.edu/brown/_pdf/physics374_spring2021/l1-19-21.docx
185 | https://ent2d.ac-bordeaux.fr/disciplines/mathematiques/wp-content/uploads/sites/3/2017/09/de-Scratch-%C3%A0-Python.docx
186 | ```
187 |
188 |
189 | ### XLSX results
190 | ```python
191 | from websearch import WebSearch
192 | web = WebSearch('datalist')
193 | excels = web.xlsx
194 | for excel in excels[:3]:
195 | print(excel)
196 | ```
197 |
198 | ```
199 | [RESULTS]
200 |
201 | https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/979255/Detailed_Single_Data_List_-_2021-2022.xlsx
202 | https://www.jaist.ac.jp/top/data/list-achievement-research-e.xlsx
203 | https://img1.wsimg.com/blobby/go/bed8f8d7-d6c2-488d-9aa3-5910e18aa8d2/downloads/Datalist.xlsx
204 | ```
205 |
206 |
207 | ### PPTX results
208 | ```python
209 | from websearch import WebSearch
210 | web = WebSearch('Leadership')
211 | powerpoints = web.pptx
212 | for powerpoint in powerpoints[:3]:
213 | print(powerpoint)
214 | ```
215 |
216 | ```
217 | [RESULTS]
218 |
219 | https://www.plainviewisd.org/cms/lib6/TX01918200/Centricity/Domain/853/Leadership%20Behav.%20Styles.pptx
220 | https://www.yorksandhumberdeanery.nhs.uk/sites/default/files/leadership_activity_and_msf.pptx
221 | https://www.itfglobal.org/sites/default/files/node/resources/files/Stage%203.1%20Powerpoint.pptx
222 | ```
223 |
224 |
225 | ### ODT results
226 | ```python
227 | from websearch import WebSearch
228 | web = WebSearch('Finance')
229 | documents = web.odt
230 | for doc in documents[:2]:
231 | print(doc)
232 | ```
233 |
234 | ```
235 | [RESULTS]
236 | https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/970748/Green_Finance_Report.odt
237 | https://iati.fcdo.gov.uk/iati_documents/3678707.odt
238 |
239 | ```
240 |
241 | ### ODS results
242 | ```python
243 | from websearch import WebSearch
244 | web = WebSearch('Commerce')
245 | documents = web.ods
246 | for doc in documents[:2]:
247 | print(doc)
248 | ```
249 |
250 | ```
251 | [RESULTS]
252 | http://www.justice.gouv.fr/art_pix/Stat_RSJ_12.7_Civil_Les_tribunaux_de_commerce.ods
253 | https://www.insee.fr/fr/metadonnees/source/fichier/Precision-principaux-indicateurs-crise-sanitaire-2020.ods
254 | ```
255 |
256 | ### ODP results
257 | ```python
258 | from websearch import WebSearch
259 | web = WebSearch('Renaissance')
260 | documents = web.odp
261 | for doc in documents[:2]:
262 | print(doc)
263 | ```
264 |
265 | ```
266 | [RESULTS]
267 | http://ekladata.com/9sHTcbLYfwbNGKU9cpnZXjlsbfA/17-Art-Renaissance.odp
268 | https://www.college-yvescoppens-malestroit.ac-rennes.fr/sites/college-yvescoppens-malestroit.ac-rennes.fr/IMG/odp/diapo-presentation-voyage-5e.odp
269 | ```
270 |
271 | ### KML results
272 | ```python
273 | from websearch import WebSearch
274 | web = WebSearch('Madagascar')
275 | maps = web.kml
276 | for map in maps[:3]:
277 | print(map)
278 | ```
279 |
280 | ```
281 | [RESULTS]
282 | http://www.hydrosciences.fr/sierem/kmz_files/MGPLGRA.kml
283 | https://www.ngoaidmap.org/downloads?doc=kml&name=association-intercooperation-madagascar-aim_projects&partners%5B%5D=6160§ors%5B%5D=1&status=active
284 | https://ngoaidmap.org/downloads?doc=kml&name=nemp-madagascar-cyclone-enawo-response_projects&projects%5B%5D=20655&status=active
285 | ```
286 |
287 | ### CUSTOM results
288 |
289 | For extensions not listed above, use the `custom` function.
290 | 
291 | The second argument (the MIME type) can be found [here](https://developer.mozilla.org/fr/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types)
292 |
293 | ```python
294 | from websearch import WebSearch
295 | web = WebSearch('Biologie')
296 | ps_documents = web.custom('ps', 'application/postscript')
297 | for doc in ps_documents[:3]:
298 | print(doc)
299 | ```
300 |
301 | ```
302 | [RESULTS]
303 |
304 | http://irma.math.unistra.fr/~fbertran/Master1_2020_2/L3Court.ps
305 | http://jfla.inria.fr/2002/actes/10-michel.ps
306 | https://www.crstra.dz/telechargement/pnr/ps/environnement/fadel-djamel.ps
307 | ```
308 |
309 |
310 | ### Webserver
311 |
312 | You can deploy it as a webserver and send HTTP requests.
313 |
314 | ```s
315 | python -m websearch --host [host] --port [port]
316 | [*] default host : 0.0.0.0
317 | [*] default port : 7845
318 | ```
319 | Example for pages:
320 | ```s
321 | curl http://<host>:<port>/pages/botoravony+arleme
322 |
323 |
324 | [
325 |
326 | "https://portfolio.iteam-s.mg/?id=2",
327 | "https://portfolio.iteam-s.mg/libs/cv/arleme.pdf",
328 | "https://madagascar.webcup.fr/team-webcup/iteams"
329 | ]
330 | ```
331 |
332 | Example for images:
333 | ```s
334 | curl http://<host>:<port>/images/one+piece
335 |
336 |
337 | [
338 | "https://tse1.mm.bing.net/th?id=OIP.GlNk7idD3RCI_SYLiVzSBAHaE7",
339 | "https://tse2.mm.bing.net/th?id=OIP.uePUN5rwpB-7wicu1uxQcgHaFj",
340 | "https://tse2.mm.bing.net/th?id=OIP.dwWBU-A_6KPvvEYsL2nhVgHaFc",
341 | "https://tse1.mm.bing.net/th?id=OIP.5M8tKIhIWvbqGO1prhUGfAHaJ4",
342 | .....
343 | "https://tse4.mm.bing.net/th?id=OIP.uvp3efwHRLDJnUWZ5KLWCwHaE8",
344 | "https://tse3.mm.bing.net/th?id=OIP.d_uUoc-8R13RZ1bb76yhZgHaKp",
345 | "https://tse1.mm.bing.net/th?id=OIP.cBWDvspBM036p6h4DS6RTAHaFj"
346 | ]
347 | ```
348 |
349 | Search by extension: `curl http://<host>:<port>/<extension>/<query>`
350 |
351 | Where extension is from this list:
352 |
353 | ```
354 | swf, pdf, ps, dwf, kml, kmz, gpx, hwp, htm, html, xls, xlsx,
355 | ppt, pptx, doc, docx, odp, ods, odt, rtf, svg, tex, txt, text,
356 | bas, c, cc, cpp, cxx, h, hpp, cs, java, pl, py, wml, wap, xml
357 | ```
358 |
359 | Example:
360 | ```s
361 | curl http://<host>:<port>/kml/madagascar+antananarivo
362 |
363 |
364 | [
365 | "https://ifl.francophonelibre.org/atelier/ActionOSMMG2019/wms/kml?layers=ActionOSMMG2019:MG_Antananarivo_pharmacy_point_OSM_20190427"
366 | ]
367 | ```
368 |
369 | You can use the `limit` parameter to limit the number of results:
370 | ```
371 | curl http://<host>:<port>/images/one+piece?limit=4
372 |
373 |
374 | [
375 | "https://tse1.mm.bing.net/th?id=OIP.GlNk7idD3RCI_SYLiVzSBAHaE7",
376 | "https://tse2.mm.bing.net/th?id=OIP.uePUN5rwpB-7wicu1uxQcgHaFj",
377 | "https://tse2.mm.bing.net/th?id=OIP.dwWBU-A_6KPvvEYsL2nhVgHaFc",
378 | "https://tse1.mm.bing.net/th?id=OIP.5M8tKIhIWvbqGO1prhUGfAHaJ4"
379 | ]
380 |
381 | ```
382 | ##### Note: the module's `site` and `verif` parameters can also be passed as URL query parameters
383 | `curl "http://<host>:<port>/pdf/statut?verif=false&site=iteam-s.mg"`
384 |
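For programmatic access, a minimal sketch (assuming the webserver is running locally on the default port 7845) that mirrors the `curl` example above using `requests`:

```python
import requests

# Query the local websearch webserver for PDF results, passing the same
# URL parameters as the curl example above, plus a result limit.
response = requests.get(
    "http://0.0.0.0:7845/pdf/statut",
    params={"limit": 5, "verif": "false", "site": "iteam-s.mg"},
)
print(response.json())
```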
385 |
386 |
387 |
388 |
389 | _____________________________________________________________________
390 |
391 |
392 | ## Show your support
393 | Give a star 🌟 if this project helped you!
394 |
395 | [](https://www.buymeacoffee.com/gaetan1903)
396 |
397 |
398 | ## License
399 |
400 | MIT License
401 |
402 | Copyright (c) 2021 [iTeam-$](https://iteam-s.mg)
403 |
404 |
405 | ___________________________________________________________________
406 |
407 | ## Contributors
408 | 
409 |
410 |
--------------------------------------------------------------------------------
/bin/websearch:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | from http.client import error
5 | from websearch import WebSearch
6 | from gevent.pywsgi import WSGIServer
7 | from flask import Flask, redirect, request, jsonify
8 |
9 |
10 | webserver = Flask(__name__)
11 |
12 |
13 | @webserver.errorhandler(404)
14 | def page_not_found(e):
15 | return """
16 | Can't find what you want.
17 | Please change the query or the extensions
18 | """, 404
19 |
20 |
21 | @webserver.route('/v1/<ext>/<query>')
22 | def old_route(ext, query):
23 | return redirect(f'/{ext}/{query}', code=301)
24 |
25 |
26 | @webserver.route('/<ext>/<query>')
27 | def websearch(ext, query):
28 | limit = request.args.get('limit', '')
29 | if limit and limit.isdigit():
30 | limit = int(limit)
31 | else:
32 | limit = 100
33 | try:
34 | query = query.replace('+', ' ')
35 | web = WebSearch(query, **request.args.to_dict())
36 | if ext == 'pages':
37 | res = web.pages
38 | elif ext == 'images':
39 | res = web.images
40 | else:
41 | res = web.custom(extension=ext)
42 | except error as e:
43 | print(e)
44 | return "Error 500, Something Wrong", 500
45 |
46 | return jsonify(res[:limit]) \
47 | if res and type(res) == list else redirect('/404')
48 |
49 |
50 | parser = argparse.ArgumentParser(
51 | description='Webserver version for websearch-python'
52 | )
53 | parser.add_argument(
54 | '--host', help='HOST for server, default: 0.0.0.0', default='0.0.0.0'
55 | )
56 | parser.add_argument(
57 | '--port', type=int, help='PORT for server. default 7845', default=7845
58 | )
59 | args = parser.parse_args()
60 |
61 | print(f'''
62 | _ _ _____ _____ _____ _____ ___ _____ ____ _ _
63 | | | | | | ___| | ___ \ / ___| | ___| / _ \ | ___ \ / __ \ | | | |
64 | | | | | | |__ | |_/ / \ `--. | |__ / /_\ \ | |_/ / | / \/ | |_| |
65 | | |/\| | | __| | ___ \ `--. \ | __| | _ | | / | | | _ |
66 | \ /\ / | |___ | |_/ / /\__/ / | |___ | | | | | |\ \ | \__/\ | | | |
67 | \/ \/ \____/ \____/ \____/ \____/ \_| |_/ \_| \_| \____/ \_| |_/
68 |
69 | Server listening on {args.host}:{args.port}
70 | ''') # noqa: W605
71 |
72 | SERVER = WSGIServer((args.host, args.port), webserver)
73 | SERVER.serve_forever()
74 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | flask
4 | gevent
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = WebSearch
3 | version = 1.2.2
4 | author = iTeam-$
5 | author_email = contact@iteam-s.xyz
6 | description = Python module allowing you to do various searches for links on the Web.
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/iTeam-S/WebSearch
10 | project_urls =
11 | Bug Tracker = https://github.com/iTeam-S/WebSearch/issues
12 | classifiers =
13 | Programming Language :: Python :: 3
14 | License :: OSI Approved :: MIT License
15 | Operating System :: OS Independent
16 |
17 | [options]
18 |
19 | packages = find:
20 | python_requires = >=3.6
21 |
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | with open("README.md", "r") as fh:
4 | long_description = fh.read()
5 |
6 | setuptools.setup(
7 | name="websearch-python", # This is the name of the package
8 | version="1.2.2", # The initial release version
9 | author="iTeam-$", # Full name of the author
10 | description="Module allowing you to do various searches for links on the Web",
11 | long_description=long_description, # Long description read from the readme
12 | long_description_content_type="text/markdown",
13 | packages=setuptools.find_packages(), # List of all modules to be installed
14 | classifiers=[
15 | "Programming Language :: Python :: 3",
16 | "License :: OSI Approved :: MIT License",
17 | "Operating System :: OS Independent",
18 | ], # Information to filter the project on PyPi website
19 | python_requires=">=3.6",
20 | py_modules=["websearch"], # Name of the python package
21 | install_requires=["BeautifulSoup4", "requests", "gevent", "flask"], # dependencies
22 | include_package_data=True, # Include all data file with the package
23 | package_data={"": ["*.json"]},
24 | scripts=["bin/websearch"],
25 | )
26 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iTeam-S/WebSearch/1f703455e39df384e303bbb7d93bb53951e3c51c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import unittest
4 |
5 | sys.path.insert(0, os.path.dirname('/'.join(__file__.split('/')[:-1])))
6 | 
7 | import websearch  # noqa: E402
8 |
9 |
10 | class TestCaseModule(unittest.TestCase):
11 |
12 | def test1_pages(self):
13 | pages = websearch.WebSearch('iTeam-$').pages[:5]
14 | # Check the number of results
15 | self.assertTrue(len(pages))
16 | # Check the links
17 | for page in pages:
18 | self.assertTrue(page.startswith('http'))
19 |
20 | def test2_images(self):
21 | images = websearch.WebSearch('Madagascar').images[:5]
22 | # Check the number of results
23 | self.assertTrue(len(images))
24 | # Check the links
25 | for image in images:
26 | self.assertTrue(image.startswith('http'))
27 |
28 | def test3_pdf(self):
29 | pdfs = websearch.WebSearch('Math 220').pdf[:2]
30 | # Check the number of results
31 | self.assertTrue(len(pdfs))
32 | # Check the links
33 | for pdf in pdfs:
34 | self.assertTrue(pdf.startswith('http'))
35 |
36 | def test4_word(self):
37 | words = websearch.WebSearch('python').docx[:3]
38 | # Check the number of results
39 | self.assertTrue(len(words))
40 | # Check the links
41 | for word in words:
42 | self.assertTrue(word.startswith('http'))
43 |
44 | def test5_excel(self):
45 | excels = websearch.WebSearch('datalist').xlsx[:3]
46 | # Check the number of results
47 | self.assertTrue(len(excels))
48 | # Check the links
49 | for excel in excels:
50 | self.assertTrue(excel.startswith('http'))
51 |
52 | def test6_powerpoint(self):
53 | powerpoints = websearch.WebSearch('Communication').pptx[:3]
54 | # Check the number of results
55 | self.assertTrue(len(powerpoints))
56 | # Check the links
57 | for powerpoint in powerpoints:
58 | self.assertTrue(powerpoint.startswith('http'))
59 |
60 | def test7_odt(self):
61 | documents = websearch.WebSearch('Finance').odt[:3]
62 | # Check the number of results
63 | self.assertTrue(len(documents))
64 | # Check the links
65 | for doc in documents:
66 | self.assertTrue(doc.startswith('http'))
67 |
68 | def test8_ods(self):
69 | documents = websearch.WebSearch('Commerce').ods[:1]
70 | # Check the number of results
71 | self.assertTrue(len(documents))
72 | # Check the links
73 | for doc in documents:
74 | self.assertTrue(doc.startswith('http'))
75 |
76 | def test9_kml(self):
77 | maps = websearch.WebSearch('Madagascar').kml[:1]
78 | # Check the number of results
79 | self.assertTrue(len(maps))
80 | # Check the links
81 | for map in maps:
82 | self.assertTrue(map.startswith('http'))
83 |
84 | def test10_custom(self):
85 | web = websearch.WebSearch('Biologie')
86 | documents = web.custom('ps', 'application/postscript')[:1]
87 | # Check the number of results
88 | self.assertTrue(len(documents))
89 | # Check the links
90 | for doc in documents:
91 | self.assertTrue(doc.startswith('http'))
92 |
93 | def test11_odp(self):
94 | documents = websearch.WebSearch('Renaissance').odp[:1]
95 | # Check the number of results
96 | self.assertTrue(len(documents))
97 | # Check the links
98 | for doc in documents:
99 | self.assertTrue(doc.startswith('http'))
100 |
101 |
102 | if __name__ == '__main__':
103 | unittest.main()
105 |
--------------------------------------------------------------------------------
/websearch/__init__.py:
--------------------------------------------------------------------------------
1 | from . import __main__
2 | from .script import WebSearch
3 |
4 | __version__ = "1.2.2"
5 | __author__ = "iTeam-$"
6 | __license__ = "MIT"
7 | __all__ = ["WebSearch", "__main__"]
8 |
--------------------------------------------------------------------------------
/websearch/__main__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | if __name__ == '__main__':
5 | os.system(f"websearch {' '.join(sys.argv[1:])}")
--------------------------------------------------------------------------------
/websearch/extension.json:
--------------------------------------------------------------------------------
1 | {
2 | "swf": "application/x-shockwave-flash",
3 | "pdf": "application/pdf",
4 | "ps": "application/postscript",
5 | "dwf": "application/dwf",
6 | "kml": "application/vnd.google-earth.kml+xml",
7 | "kmz": "application/vnd.google-earth.kmz",
8 | "gpx": "application/gpx+xml",
9 | "hwp": "application/x-hwp",
10 | "htm": "text/html",
11 | "html": "text/html",
12 | "xls": "application/vnd.ms-excel",
13 | "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
14 | "ppt": "application/vnd.ms-powerpoint",
15 | "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
16 | "doc": "application/msword",
17 | "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
18 | "odp": "application/vnd.oasis.opendocument.presentation",
19 | "ods": "application/vnd.oasis.opendocument.spreadsheet",
20 | "odt": "application/vnd.oasis.opendocument.text",
21 | "rtf": "application/rtf",
22 | "svg": "image/svg+xml",
23 | "tex": "application/x-tex",
24 | "txt": "text/plain",
25 | "text": "text/plain",
26 | "bas": "text/plain",
27 | "c": "text/plain",
28 | "cc": "text/plain",
29 | "cpp": "text/plain",
30 | "cxx": "text/plain",
31 | "h": "text/plain",
32 | "hpp": "text/plain",
33 | "cs": "text/plain",
34 | "java": "text/plain",
35 | "pl": "text/plain",
36 | "py": "text/plain",
37 | "wml": "text/vnd.wap.wml",
38 | "wap": "image/wap",
39 | "xml": "application/xml"
40 | }
--------------------------------------------------------------------------------
/websearch/script.py:
--------------------------------------------------------------------------------
1 | import urllib.parse
2 | from bs4 import BeautifulSoup
3 | from requests import get, head
4 |
5 | import os
6 | import json
7 |
8 | __location__ = os.path.dirname(os.path.abspath(__file__))
9 |
10 |
11 | class WebSearch:
12 | """
13 | Module for collecting various kinds of links from the web.
14 | * query: the expression to search for.
15 | * verif: if True, send a request to each URL to validate
16 | that the result has the expected format; True by default.
17 | Can be disabled by passing `verif=False` as an argument.
18 | * site: restrict the search to a specific website as source.
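
Illustrative example (the values below are placeholders taken from the README):
    web = WebSearch('Gaetan Jonathan', site='iteam-s.mg', verif=False)
    links = web.pages[:5]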
19 | """
20 |
21 | _headers = {"User-Agent": "Googlebot/2.1 (http://www.googlebot.com/bot.html)"}
22 |
23 | def __init__(self, query, **kwargs):
24 | # Check whether the search uses multiple keywords.
25 | if isinstance(query, list):
26 | self.query = "'"
27 | self.query += "' OR '".join(query)
28 | self.query += "'"
29 | else:
30 | self.query = query
31 |
32 | # Check whether a source site was specified.
33 | if kwargs.get("site"):
34 | self.query = f"site:{kwargs.get('site')} {self.query}"
35 |
36 | # Used for link verification.
37 | self.verif = kwargs.get("verif", True)
38 | # Used as a cache for optimization.
39 | self.__data = {}
40 |
41 | def __verif_content(self, urls, mimetype):
42 | """
43 | Check that each link returns the expected format.
44 | Possible `mimetype` values can be found here:
45 | https://developer.mozilla.org/fr/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
46 | """
47 | if not self.verif:
48 | # If verif is False, skip verification
49 | # and return the list of URLs as-is.
50 | return urls
51 |
52 | new_urls = []
53 | for url in urls:
54 | # Send a request that only fetches the headers.
55 | try:
56 | rq = head(url).headers
57 | except Exception as err:
58 | print(err)
59 | continue
60 | # Check that the link returns the expected format.
61 | if rq.get("content-type") == mimetype:
62 | new_urls.append(url)
63 | # Return the verified URLs.
64 | return new_urls
65 |
66 | @property
67 | def images(self):
68 | """
69 | Retrieve all image result links
70 | for the given keywords.
71 | """
72 | # Check whether the results are already cached.
73 | if self.__data.get("images"):
74 | if self.__data["images"][0] == self.query:
75 | return self.__data["images"][1]
76 |
77 | result = []
78 | url = (
79 | "https://fr.images.search.yahoo.com/search/images;_ylt=AwrJS5dMFghcBh4AgWpjAQx.;\
80 | _ylu=X3oDMTE0aDRlcHI2BGNvbG8DaXIyBHBvcwMxBHZ0aWQDQjY1NjlfMQRzZWMDcGl2cw--?p="
81 | + urllib.parse.quote(self.query)
82 | + "&fr2=piv-web&fr=yfp-t-905-s"
83 | )
84 |
85 | requete = get(url, headers=self._headers, timeout=10)
86 | soup = BeautifulSoup(requete.text, "html.parser")
87 | container = soup.find("ul", {"id": "sres"})
88 | try:
89 | lis = container.find_all("li")
90 | except Exception as e:
91 | print(e)
92 | return result
93 |
94 | if len(lis) == 0:
95 | return result
96 |
97 | for li in lis:
98 | try:
99 | img = li.find("img")
100 | img = str(img["data-src"]).split("&pid")
101 | result.append(str(img[0]))
102 |
103 | except Exception as e:
104 | print(e)
105 | continue
106 |
107 | # Cache the results to speed up the next identical call.
108 | self.__data["images"] = (self.query, result)
109 | return result
110 |
111 | @property
112 | def pages(self):
113 | """
114 | Retrieve all result page links
115 | for the given keywords.
116 | """
117 | # Check whether the results are already cached.
118 | if self.__data.get("pages"):
119 | if self.__data["pages"][0] == self.query:
120 | return self.__data["pages"][1]
121 | result = []
122 |
123 | url = (
124 | "https://www.google.com/search?client=firefox-b-d&q="
125 | + urllib.parse.quote(self.query)
126 | )
127 |
128 | requete = get(url, headers=self._headers, timeout=10)
129 | soup = BeautifulSoup(requete.text, "html.parser")
130 | a = soup.find_all("a")
131 | for link in a:
132 | tmp = link["href"][7:-1].split("&")
133 | if tmp[0].startswith("http"):
134 | result.append(urllib.parse.unquote(tmp[0]))
135 | # Cache the results to speed up the next identical call.
136 | self.__data["pages"] = (self.query, result)
137 | """ On enleve les deux liens non necessaire à la fin du liste
138 | -> https://support.google.com/websearch?p=...
139 | -> https://accounts.google.com/ServiceLogin?continue=...
140 | """
141 | return result[:-2]
142 |
143 | @property
144 | def pdf(self):
145 | """
146 | Retrieve only PDF results.
147 | """
148 | return self.custom("pdf", "application/pdf")
149 |
150 | @property
151 | def docx(self):
152 | """
153 | Retrieve Word (docx) documents.
154 | """
155 | # Check whether the results are already cached.
156 | if self.__data.get("docx"):
157 | if self.__data["docx"][0] == self.query:
158 | return self.__data["docx"][1]
159 | tmp = self.query
160 | self.query = "filetype:docx " + self.query
161 | result = self.__verif_content(
162 | self.pages,
163 | "application/vnd.openxmlformats-officedocument"
164 | ".wordprocessingml.document",
165 | )
166 |
167 | self.query = tmp
168 | # Cache the results to speed up the next identical call.
169 | self.__data["docx"] = (self.query, result)
170 | return result
171 |
172 | @property
173 | def xlsx(self):
174 | """
175 | Retrieve Excel (xlsx) spreadsheets.
176 | """
177 | # Check whether the results are already cached.
178 | if self.__data.get("xlsx"):
179 | if self.__data["xlsx"][0] == self.query:
180 | return self.__data["xlsx"][1]
181 | tmp = self.query
182 | self.query = "filetype:xlsx " + self.query
183 | result = self.__verif_content(
184 | self.pages,
185 | "application/vnd.openxmlformats-officedocument" ".spreadsheetml.sheet",
186 | )
187 | self.query = tmp
188 |
189 | # Cache the results to speed up the next identical call.
190 | self.__data["xlsx"] = (self.query, result)
191 | return result
192 |
193 | @property
194 | def pptx(self):
195 | """Fonction pour récupérer les excels"""
196 | # Vérifier si les résultat ne sont pas déjà enregistrer
197 | if self.__data.get("pptx"):
198 | if self.__data["pptx"][0] == self.query:
199 | return self.__data["pptx"][1]
200 | tmp = self.query
201 | self.query = "filetype:pptx " + self.query
202 |
203 | result = self.__verif_content(
204 | self.pages,
205 | "application/vnd.openxmlformats-officedocument"
206 | ".presentationml.presentation",
207 | )
208 | self.query = tmp
209 |
210 | # Cache the results to speed up the next identical call.
211 | self.__data["pptx"] = (self.query, result)
212 | return result
213 |
214 | @property
215 | def odt(self):
216 | """
217 | Retrieve only ODT documents.
218 | """
219 | # Check whether the results are already cached.
220 | if self.__data.get("odt"):
221 | if self.__data["odt"][0] == self.query:
222 | return self.__data["odt"][1]
223 | tmp = self.query
224 | self.query = "filetype:odt " + self.query
225 |
226 | result = self.__verif_content(
227 | self.pages, "application/vnd.oasis.opendocument.text"
228 | )
229 | self.query = tmp
230 |
231 | # Cache the results to speed up the next identical call.
232 | self.__data["odt"] = (self.query, result)
233 | return result
234 |
235 | @property
236 | def ods(self):
237 | """
238 | Retrieve only ODS documents.
239 | """
240 | # Check whether the results are already cached.
241 | if self.__data.get("ods"):
242 | if self.__data["ods"][0] == self.query:
243 | return self.__data["ods"][1]
244 | tmp = self.query
245 | self.query = "filetype:ods " + self.query
246 | result = self.__verif_content(
247 | self.pages, "application/vnd.oasis.opendocument.spreadsheet"
248 | )
249 | self.query = tmp
250 | # Cache the results to speed up the next identical call.
251 | self.__data["ods"] = (self.query, result)
252 | return result
253 |
254 | @property
255 | def odp(self):
256 | """
257 | Retrieve only ODP documents.
258 | """
259 | # Check whether the results are already cached.
260 | if self.__data.get("odp"):
261 | if self.__data["odp"][0] == self.query:
262 | return self.__data["odp"][1]
263 | tmp = self.query
264 | self.query = "filetype:odp " + self.query
265 | result = self.__verif_content(
266 | self.pages, "application/vnd.oasis.opendocument.presentation"
267 | )
268 | self.query = tmp
269 | # Cache the results to speed up the next identical call.
270 | self.__data["odp"] = (self.query, result)
271 | return result
272 |
273 | @property
274 | def kml(self):
275 | """
276 | Retrieve geographic project files
277 | for Google Earth in KML format.
278 | """
279 | # Check whether the results are already cached.
280 | if self.__data.get("kml"):
281 | if self.__data["kml"][0] == self.query:
282 | return self.__data["kml"][1]
283 | tmp = self.query
284 | self.query = "filetype:kml " + self.query
285 | result = self.__verif_content(
286 | self.pages, "application/vnd.google-earth.kml+xml"
287 | )
288 | self.query = tmp
289 |
290 | # Cache the results to speed up the next identical call.
291 | self.__data["kml"] = (self.query, result)
292 | return result
293 |
294 | def custom(self, extension="pdf", mimetype=None):
295 | """
296 | Retrieve files matching the requested extension
297 | and the MIME type that goes with it.
298 |
299 | Keyword arguments:
300 | extension -- the file extension (default "pdf")
301 | mimetype -- the MIME type matching the extension (looked up in extension.json if omitted)
302 | """
303 | # Check whether the results are already cached.
304 | if self.__data.get(extension):
305 | if self.__data[extension][0] == self.query:
306 | return self.__data[extension][1]
307 | tmp = self.query
308 | self.query = f"filetype:{extension} {self.query}"
309 |
310 | if not mimetype:
311 | with open(os.path.join(__location__, "extension.json")) as file:
312 | mimetype = json.load(file).get(extension)
313 |
314 | if mimetype:
315 | result = self.__verif_content(self.pages, mimetype)
316 | self.query = tmp
317 | # Cache the results to speed up the next identical call.
318 | self.__data[extension] = (self.query, result)
319 | return result
320 | else:
321 | return """Can't find mimetype that match this extension\n
322 | Please provide the mimetypes as arguments.
323 | """
324 |
325 | def custom_search(self, *args, **kwargs):
326 | raise Exception(
327 | "`custom_search` is deprecated since v1.0.4, use `custom` instead"
328 | )
329 |
--------------------------------------------------------------------------------