├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── googlesearch
│   ├── __init__.py
│   └── user_agents.py
├── requirements.txt
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# PEP 582; used by e.g.
# github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Nv7

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# googlesearch
googlesearch is a Python library for searching Google, easily. googlesearch uses requests and BeautifulSoup4 to scrape Google.

## Installation
To install, run the following command:
```bash
python3 -m pip install googlesearch-python
```

## Usage
To get results for a search term, simply use the search function in googlesearch. For example, to get results for "Google" in Google, just run the following program:
```python
from googlesearch import search
search("Google")
```
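Note that `search` is a generator, so results are only fetched as you iterate over them; a bare call like the one above does not print anything by itself. A minimal sketch of typical usage:
```python
from googlesearch import search

# search() yields result URLs lazily, one page of results at a time
for url in search("Google"):
    print(url)
```
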
## Additional options
googlesearch supports a few additional options. By default, googlesearch returns 10 results. This can be changed. To get 100 results on Google, for example, run the following program:
```python
from googlesearch import search
search("Google", num_results=100)
```
If you want to have unique links in your search results, you can use the `unique` option as in the following program:
```python
from googlesearch import search
search("Google", num_results=100, unique=True)
```
In addition, you can change the language Google searches in. For example, to get results in French, run the following program:
```python
from googlesearch import search
search("Google", lang="fr")
```
You can also specify the region ([Country Codes](https://developers.google.com/custom-search/docs/json_api_reference#countryCodes)) for your search results. For example, to get results specifically from the US, run the following program:
```python
from googlesearch import search
search("Google", region="us")
```
If you want to turn off the safe search function (it is on by default), you can do this:
```python
from googlesearch import search
search("Google", safe=None)
```
To extract more information, such as the description or the result URL, use an advanced search:
```python
from googlesearch import search
search("Google", advanced=True)
# Yields SearchResult objects
# Properties:
# - title
# - url
# - description
```
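Since `advanced=True` yields `SearchResult` objects rather than plain URLs, consuming them might look like this (a minimal sketch):
```python
from googlesearch import search

# Each SearchResult exposes title, url and description attributes
for result in search("Google", advanced=True):
    print(result.title, "->", result.url)
    print(result.description)
```
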
If requesting more than 100 results, googlesearch will send multiple requests to page through the results. To increase the time between these requests, use `sleep_interval`:
```python
from googlesearch import search
search("Google", sleep_interval=5, num_results=200)
```

If you are requesting more than 10 results but want to manage the batching yourself, use `start_num` to specify the position to start fetching results from:
```python
from googlesearch import search
search("Google", sleep_interval=5, num_results=200, start_num=10)
```
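To make the offset concrete, here is a small sketch (the parameter values are illustrative): `start_num` is passed through as Google's `start` offset, so the skipped results are never requested at all.
```python
from googlesearch import search

# Skip the first 10 Google results, then collect up to 100 results from there
for url in search("Google", num_results=100, start_num=10):
    print(url)
```
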
If you are using an HTTP rotating proxy that requires you to install its CA certificate, you can simply add `ssl_verify=False` to the `search()` call to skip SSL verification.
```python
from googlesearch import search


proxy = 'http://username:password@proxy.host.com:8080/'
# or for socks5
# proxy = 'socks5://username:password@proxy.host.com:1080/'

j = search("proxy test", num_results=100, lang="en", proxy=proxy, ssl_verify=False)
for i in j:
    print(i)
```
--------------------------------------------------------------------------------
/googlesearch/__init__.py:
--------------------------------------------------------------------------------
"""googlesearch is a Python library for searching Google, easily."""
from time import sleep
from bs4 import BeautifulSoup
from requests import get
from urllib.parse import unquote  # to decode the URL
from .user_agents import get_useragent


def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    resp = get(
        url="https://www.google.com/search",
        headers={
            "User-Agent": get_useragent(),
            "Accept": "*/*"
        },
        params={
            "q": term,
            "num": results + 2,  # Request a couple of extra results to cut down on follow-up requests
            "hl": lang,
            "start": start,
            "safe": safe,
            "gl": region,
        },
        proxies=proxies,
        timeout=timeout,
        verify=ssl_verify,
        cookies={
            'CONSENT': 'PENDING+987',  # Bypasses the consent page
            'SOCS': 'CAESHAgBEhIaAB',
        }
    )
    resp.raise_for_status()
    return resp


class SearchResult:
    def __init__(self, url, title, description):
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"


def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
    """Search the Google search engine"""

    # Proxy setup
    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks5")) else None

    start = start_num
    fetched_results = 0  # Keep track of the total fetched results
    fetched_links = set()  # Keep track of links that have already been seen

    while fetched_results < num_results:
        # Send request
        resp = _req(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)

        # Uncomment to dump the raw response to a file for debugging:
        # with open('google.html', 'w') as f:
        #     f.write(resp.text)

        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")
        result_block = soup.find_all("div", class_="ezO2md")
        new_results = 0  # Keep track of new results in this iteration

        for result in result_block:
            # Find the link tag within the result block
            link_tag = result.find("a", href=True)
            # Find the title tag within the link tag
            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
            # Find the description tag within the result block
            description_tag = result.find("span", class_="FrIlee")

            # Check if all necessary tags are found
            if link_tag and title_tag and description_tag:
                # Extract and decode the link URL
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
                # Check if the link has already been fetched and if unique results are required
                if link in fetched_links and unique:
                    continue  # Skip this result if the link is not unique
                # Add the link to the set of fetched links
                fetched_links.add(link)
                # Extract the title text
                title = title_tag.text
                # Extract the description text
                description = description_tag.text
                # Increment the count of fetched results
                fetched_results += 1
                # Increment the count of new results in this iteration
                new_results += 1
                # Yield the result based on the advanced flag
                if advanced:
                    yield SearchResult(link, title, description)  # Yield a SearchResult object
                else:
                    yield link  # Yield only the link

                if fetched_results >= num_results:
                    break  # Stop if we have fetched the desired number of results

        if new_results == 0:
            # Uncomment the line below to print a notice when fewer results are found than requested:
            # print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
            break  # Break the loop if no new results were found in this iteration

        start += 10  # Prepare for the next set of results
        sleep(sleep_interval)
--------------------------------------------------------------------------------
/googlesearch/user_agents.py:
--------------------------------------------------------------------------------
import random

def get_useragent():
    """
    Generates a random user agent string mimicking the format of various software versions.

    The user agent string is composed of:
    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9

    Returns:
        str: A randomly generated user agent string.
15 | """ 16 | lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}" 17 | libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}" 18 | ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}" 19 | openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}" 20 | return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.9 2 | requests>=2.20 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.md 3 | license_files=LICENSE -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md", "r", encoding='UTF-8') as fh: 4 | long_description = fh.read() 5 | 6 | with open("requirements.txt", "r", encoding='UTF-8') as fh: 7 | requirements = fh.read().split("\n") 8 | 9 | setup( 10 | name="googlesearch-python", 11 | version="1.3.0", 12 | author="Nishant Vikramaditya", 13 | author_email="junk4Nv7@gmail.com", 14 | description="A Python library for scraping the Google search engine.", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/Nv7-GitHub/googlesearch", 18 | packages=["googlesearch"], 19 | classifiers=[ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: MIT License", 22 | "Operating System :: OS Independent", 23 | ], 24 | python_requires=">=3.6", 25 | install_requires=[requirements], 26 | include_package_data=True, 27 | ) 28 | --------------------------------------------------------------------------------