├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── googlesearch
│   ├── __init__.py
│   └── user_agents.py
├── requirements.txt
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# PEP 582; used by e.g.
# github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Nv7

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# googlesearch
googlesearch is a Python library for searching Google, easily. googlesearch uses requests and BeautifulSoup4 to scrape Google.

## Installation
To install, run the following command:
```bash
python3 -m pip install googlesearch-python
```

## Usage
To get results for a search term, simply use the search function in googlesearch. For example, to get results for "Google" in Google, just run the following program:
```python
from googlesearch import search
search("Google")
```
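Note that `search` is a generator, so results are only fetched as you iterate over them; a bare call like the one above does not print anything by itself. A minimal sketch of typical usage:
```python
from googlesearch import search

# search() yields result URLs lazily, one page of results at a time
for url in search("Google"):
    print(url)
```
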
## Additional options
googlesearch supports a few additional options. By default, googlesearch returns 10 results. This can be changed. To get 100 results on Google, for example, run the following program:
```python
from googlesearch import search
search("Google", num_results=100)
```
If you want to have unique links in your search results, you can use the `unique` option as in the following program:
```python
from googlesearch import search
search("Google", num_results=100, unique=True)
```
In addition, you can change the language Google searches in. For example, to get results in French, run the following program:
```python
from googlesearch import search
search("Google", lang="fr")
```
You can also specify the region ([Country Codes](https://developers.google.com/custom-search/docs/json_api_reference#countryCodes)) for your search results. For example, to get results specifically from the US, run the following program:
```python
from googlesearch import search
search("Google", region="us")
```
If you want to turn off the safe search function (it is on by default), you can do this:
```python
from googlesearch import search
search("Google", safe=None)
```
To extract more information, such as the description or the result URL, use an advanced search:
```python
from googlesearch import search
search("Google", advanced=True)
# Yields SearchResult objects
# Properties:
# - title
# - url
# - description
```
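Since `advanced=True` yields `SearchResult` objects rather than plain URLs, consuming them might look like this (a minimal sketch):
```python
from googlesearch import search

# Each SearchResult exposes title, url and description attributes
for result in search("Google", advanced=True):
    print(result.title, "->", result.url)
    print(result.description)
```
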
If requesting more than 100 results, googlesearch will send multiple requests to page through the results. To increase the time between these requests, use `sleep_interval`:
```python
from googlesearch import search
search("Google", sleep_interval=5, num_results=200)
```

If you are requesting more than 10 results but want to manage the batching yourself, use `start_num` to specify the position to start fetching results from:
```python
from googlesearch import search
search("Google", sleep_interval=5, num_results=200, start_num=10)
```
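To make the offset concrete, here is a small sketch (the parameter values are illustrative): `start_num` is passed through as Google's `start` offset, so the skipped results are never requested at all.
```python
from googlesearch import search

# Skip the first 10 Google results, then collect up to 100 results from there
for url in search("Google", num_results=100, start_num=10):
    print(url)
```
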
If you are using an HTTP rotating proxy that requires you to install its CA certificate, you can simply add `ssl_verify=False` to the `search()` call to skip SSL verification.
```python
from googlesearch import search


proxy = 'http://username:password@proxy.host.com:8080/'
# or for socks5
# proxy = 'socks5://username:password@proxy.host.com:1080/'

j = search("proxy test", num_results=100, lang="en", proxy=proxy, ssl_verify=False)
for i in j:
    print(i)
```
--------------------------------------------------------------------------------
/googlesearch/__init__.py:
--------------------------------------------------------------------------------
"""googlesearch is a Python library for searching Google, easily."""
from time import sleep
from bs4 import BeautifulSoup
from requests import get
from urllib.parse import unquote  # to decode the URL
from .user_agents import get_useragent


def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    resp = get(
        url="https://www.google.com/search",
        headers={
            "User-Agent": get_useragent(),
            "Accept": "*/*"
        },
        params={
            "q": term,
            "num": results + 2,  # Request a couple of extra results to cut down on follow-up requests
            "hl": lang,
            "start": start,
            "safe": safe,
            "gl": region,
        },
        proxies=proxies,
        timeout=timeout,
        verify=ssl_verify,
        cookies={
            'CONSENT': 'PENDING+987',  # Bypasses the consent page
            'SOCS': 'CAESHAgBEhIaAB',
        }
    )
    resp.raise_for_status()
    return resp


class SearchResult:
    def __init__(self, url, title, description):
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"


def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
    """Search the Google search engine"""

    # Proxy setup
    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks5")) else None

    start = start_num
    fetched_results = 0  # Keep track of the total fetched results
    fetched_links = set()  # Keep track of links that have already been seen

    while fetched_results < num_results:
        # Send request
        resp = _req(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)

        # Uncomment to dump the raw response to a file for debugging:
        # with open('google.html', 'w') as f:
        #     f.write(resp.text)

        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")
        result_block = soup.find_all("div", class_="ezO2md")
        new_results = 0  # Keep track of new results in this iteration

        for result in result_block:
            # Find the link tag within the result block
            link_tag = result.find("a", href=True)
            # Find the title tag within the link tag
            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
            # Find the description tag within the result block
            description_tag = result.find("span", class_="FrIlee")

            # Check if all necessary tags are found
            if link_tag and title_tag and description_tag:
                # Extract and decode the link URL
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
                # Check if the link has already been fetched and if unique results are required
                if link in fetched_links and unique:
                    continue  # Skip this result if the link is not unique
                # Add the link to the set of fetched links
                fetched_links.add(link)
                # Extract the title text
                title = title_tag.text
                # Extract the description text
                description = description_tag.text
                # Increment the count of fetched results
                fetched_results += 1
                # Increment the count of new results in this iteration
                new_results += 1
                # Yield the result based on the advanced flag
                if advanced:
                    yield SearchResult(link, title, description)  # Yield a SearchResult object
                else:
                    yield link  # Yield only the link

                if fetched_results >= num_results:
                    break  # Stop if we have fetched the desired number of results

        if new_results == 0:
            # Uncomment the line below to print a notice when fewer results are found than requested:
            # print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
            break  # Break the loop if no new results were found in this iteration

        start += 10  # Prepare for the next set of results
        sleep(sleep_interval)
--------------------------------------------------------------------------------
/googlesearch/user_agents.py:
--------------------------------------------------------------------------------
import random

def get_useragent():
    """
    Generates a random user agent string mimicking the format of various software versions.

    The user agent string is composed of:
    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9

    Returns:
        str: A randomly generated user agent string.
15 | """ 16 | lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}" 17 | libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}" 18 | ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}" 19 | openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}" 20 | return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.9 2 | requests>=2.20 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file=README.md 3 | license_files=LICENSE -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md", "r", encoding='UTF-8') as fh: 4 | long_description = fh.read() 5 | 6 | with open("requirements.txt", "r", encoding='UTF-8') as fh: 7 | requirements = fh.read().split("\n") 8 | 9 | setup( 10 | name="googlesearch-python", 11 | version="1.3.0", 12 | author="Nishant Vikramaditya", 13 | author_email="junk4Nv7@gmail.com", 14 | description="A Python library for scraping the Google search engine.", 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url="https://github.com/Nv7-GitHub/googlesearch", 18 | packages=["googlesearch"], 19 | classifiers=[ 20 | "Programming Language :: Python :: 3", 21 | "License :: OSI Approved :: MIT License", 22 | "Operating System :: OS Independent", 23 | ], 24 | python_requires=">=3.6", 25 | install_requires=[requirements], 26 | include_package_data=True, 27 | ) 28 | --------------------------------------------------------------------------------