├── .gitignore ├── .gitpod.yml ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── advanced.md ├── cache └── search │ ├── 9f1f0f498f7a94591b61513c14d07116.json │ └── f0d63bfeda464e93d5c6dfcd413ea193.json ├── docker-compose.yaml ├── images ├── google-logo.png ├── google-scraper-csv-result.png ├── google-scraper-featured-image.png └── google-scraper-social-featured-image.png ├── local_storage.json ├── main.py ├── profiles.json ├── requirements.txt └── src ├── __init__.py ├── google_scraper.py ├── search.py ├── utils.py ├── write_output.py └── write_output_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Botasaurus stuff: 65 | build/ 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | tasks/ 108 | 109 | profiles/ 110 | 111 | output/ 112 | 113 | 114 | create_cache.py 115 | 116 | error_logs/ 117 | 118 | gpt/ -------------------------------------------------------------------------------- /.gitpod.yml: -------------------------------------------------------------------------------- 1 | image: chetan1111/gitpod-botasaurus:3 2 | 3 | tasks: 4 | - init: pip install -r requirements.txt -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM chetan1111/botasaurus:latest 2 | 3 | ENV PYTHONUNBUFFERED=1 4 | 5 | COPY requirements.txt . 6 | 7 | RUN python -m pip install -r requirements.txt 8 | 9 | RUN mkdir app 10 | WORKDIR /app 11 | COPY . 
/app 12 | 13 | CMD ["python", "main.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Chetan Jain 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Google Scraper Featured Image](https://raw.githubusercontent.com/omkarcloud/google-scraper/master/images/google-scraper-featured-image.png) 2 | 3 |
# ✨ Google Scraper 🚀
4 | 
5 | 💦 Google Scraper helps you collect search results from Google. 💦
6 | 
7 | [Badges: Programming Language - Python 3, google-scraper forks, Repo stars, google-scraper License, issues]
8 | 
9 | [Open in Gitpod](https://gitpod.io/#https://github.com/omkarcloud/google-scraper)
10 | 
33 | 
34 | ---
35 | 
36 | ## Disclaimer for Google Scraper Project
37 | 
38 | > By using Google Scraper, you agree to comply with all applicable local and international laws related to data scraping, copyright, and privacy. The developers of Google Scraper will not be held liable for any misuse of this software. It is the user's sole responsibility to ensure adherence to all relevant laws regarding data scraping, copyright, and privacy, and to use Google Scraper in an ethical and legal manner, in line with both local and international regulations.
39 | 
40 | We take concerns related to the Google Scraper Project very seriously. If you have any inquiries or issues, please contact Chetan Jain at [chetan@omkar.cloud](mailto:chetan@omkar.cloud). We will take prompt and necessary action in response to your emails.
41 | 
42 | ## 👉 Explore Our Other Awesome Products
43 | 
44 | - ✅ [Botasaurus](https://github.com/omkarcloud/botasaurus): The All-in-One Web Scraping Framework with Anti-Detection, Parallelization, Asynchronous, and Caching Superpowers.
45 | 
46 | ---
47 | 
48 | Google Scraper helps you collect search results from Google.
49 | 
50 | ## 🚀 Getting Started
51 | 
52 | 1️⃣ **Clone the Magic 🧙‍♀:**
53 | ```shell
54 | git clone https://github.com/omkarcloud/google-scraper
55 | cd google-scraper
56 | ```
57 | 2️⃣ **Install Dependencies 📦:**
58 | ```shell
59 | python -m pip install -r requirements.txt
60 | ```
61 | 3️⃣ **Let the Scraping Begin 😎:**
62 | ```shell
63 | python main.py
64 | ```
65 | 
66 | Find your data in the `output` directory.
67 | 
68 | ![Google Scraper CSV Result](https://raw.githubusercontent.com/omkarcloud/google-scraper/master/images/google-scraper-csv-result.png)
69 | 
70 | *Note: If you don't have Python installed, follow this simple FAQ [here](https://github.com/omkarcloud/google-scraper/blob/master/advanced.md#-i-dont-have-python-installed-how-can-i-run-the-scraper), and you will have your Google data in the next 5 minutes.*
71 | 
72 | ## 🤔 FAQs
73 | 
74 | ### ❓ How to Scrape Google?
75 | 
76 | 1. Open the `main.py` file.
77 | 2. Update the `queries` list with the search terms you are interested in. For example:
78 | 
79 | ```python
80 | queries = [
81 |     "Mango",
82 |     "Watermelon",
83 | ]
84 | 
85 | Google.search(queries, max=10)
86 | ```
87 | 
88 | 3. Run it.
89 | 
90 | ```bash
91 | python main.py
92 | ```
93 | 
94 | Then find your data in the `output` directory.
95 | 
96 | ### ❓ How to Scrape More Google Search Results Using Our Google API?
97 | 
98 | To scrape additional data, follow these steps to use our Google API. You can make 50 requests for free:
99 | 
100 | 1. Sign up on RapidAPI by visiting [this link](https://rapidapi.com/auth/sign-up).
101 | 
102 | ![Sign Up on RapidAPI](https://raw.githubusercontent.com/omkarcloud/assets/master/images/sign-up.png)
103 | 
104 | 2. Then, subscribe to our Free Plan by visiting [this link](https://rapidapi.com/Chetan11dev/api/google-scraper/pricing).
105 | 
106 | ![Subscribe to Free Plan](https://raw.githubusercontent.com/omkarcloud/assets/master/images/free-subscription.png)
107 | 
108 | 3. Now, copy the API key.
109 | 
110 | ![Copy the API Key](https://raw.githubusercontent.com/omkarcloud/assets/master/images/api-key.png)
111 | 
112 | 4. Use it in the scraper as follows:
113 | ```python
114 | Google.search("Orange", max=10, key="YOUR_API_KEY")
115 | ```
116 | 
117 | 5. Run the script, and you'll find your data in the `output` folder.
118 | ```bash
119 | python main.py
120 | ```
121 | 
122 | The first 50 requests are free. After that, you can upgrade to the Pro Plan, which gets you 1,000 requests for just $9.
123 | 
124 | ### ❓ How did you build it?
125 | 
126 | We used Botasaurus, an All-in-One Web Scraping Framework with Anti-Detection, Parallelization, Asynchronous, and Caching Superpowers.
127 | 
128 | Botasaurus cut our development time by 50% and let us focus on the core extraction logic of the scraper.
129 | 
130 | If you do web scraping, you should learn about Botasaurus [here](https://github.com/omkarcloud/botasaurus); it will save you countless hours as a web scraper.
131 | 
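For a concrete feel of what that means in this codebase, here is a trimmed sketch of the pattern used in `src/search.py` (the decorator options come from `src/utils.py`, the import path is the one the repo itself uses, and the function body is elided):

```python
from botasaurus import request
from botasaurus.cache import DontCache

# The @request decorator supplies caching, crash handling, and output
# plumbing, so the decorated function holds only the extraction logic.
# Botasaurus injects a helper as the first argument; `data` is one query
# dict and `metadata` carries the RapidAPI key.
@request(close_on_crash=True, output=None, raise_exception=True)
def search(_, data, metadata):
    if not metadata.get("key"):
        # DontCache marks this failed result so the cache layer skips it.
        return DontCache({"data": None, "error": "FAILED_DUE_TO_NO_KEY"})
    ...  # paginate through the API and collect results
```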

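Each query's rows are written to `output/<query>/csv/<query>.csv` (the layout built by `src/write_output.py`), so the results are easy to post-process. A minimal sketch, assuming a finished run for the "Mango" query and the `title`/`link` columns seen in the cached responses:

```python
import csv

# Path layout comes from src/write_output.py:
# output/<kebab-cased query>/csv/<kebab-cased query>.csv
with open("output/mango/csv/mango.csv", newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        print(row["title"], "->", row["link"])
```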

139 | 140 | 141 | ### ❓ Need More Help or Have Additional Questions? 142 | 143 | For further help, contact us on WhatsApp. We'll be happy to help you out. 144 | 145 | [![Contact Us on WhatsApp about Google Scraper](https://raw.githubusercontent.com/omkarcloud/assets/master/images/whatsapp-us.png)](https://api.whatsapp.com/send?phone=918295042963&text=Hi,%20I%20would%20like%20to%20learn%20more%20about%20your%20products.) 146 | 147 | ## Love It? [Star It! ⭐](https://github.com/omkarcloud/google-scraper/stargazers) 148 | 149 | ## Made with ❤️ using [Botasaurus Web Scraping Framework](https://github.com/omkarcloud/botasaurus) -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | # Security Policy for the Google Scraper Project 3 | 4 | ## Disclaimer 5 | 6 | By using Google Scraper, you agree to comply with all applicable local and international laws related to data scraping, copyright, and privacy. The developers of Google Scraper will not be held liable for any misuse of this software. It is the user's sole responsibility to ensure adherence to all relevant laws regarding data scraping, copyright, and privacy, and to use Google Scraper in an ethical and legal manner, in line with both local and international regulations. 7 | 8 | ## 1. Reporting a Vulnerability 9 | 10 | We take the privacy and security of the Google Scraper Project very seriously. If you have discovered a security vulnerability or have concerns about the project, we appreciate your assistance in responsibly disclosing it to us. 11 | 12 | To report a security issue or express a concern, please email Chetan Jain at [chetan@omkar.cloud](mailto:chetan@omkar.cloud). We will promptly respond to your concerns. 13 | 14 | ## 2. Use at Your Own Risk 15 | 16 | This project is provided for ethical and legal purposes only. It must be used in compliance with all relevant local and international laws and is not intended for unauthorized or illegal use. 17 | 18 | ## 3. Contact 19 | 20 | For questions regarding this security policy or the security of the Google Scraper Project, please contact [chetan@omkar.cloud](mailto:chetan@omkar.cloud). 21 | 22 | The information in this `SECURITY.md` is provided "as is," without any kind of warranty. -------------------------------------------------------------------------------- /advanced.md: -------------------------------------------------------------------------------- 1 | ## 🤔 Advanced Questions 2 | 3 | ### ❓ I don't have Python installed. How can I run the scraper? 4 | 5 | You can easily run the scraper in Gitpod, a browser-based development environment. Set it up in just 5 minutes by following these steps: 6 | 7 | 1. Visit [this link](https://gitpod.io/#https://github.com/omkarcloud/google-scraper) and sign up using your GitHub account. 8 | 9 | ![Gitpod Google Scraper Sign Up](https://raw.githubusercontent.com/omkarcloud/assets/master/images/open-in-gitpod.png) 10 | 11 | 2. Once signed up, open it in Gitpod. 12 | 13 | ![Gitpod Google Scraper Continue](https://raw.githubusercontent.com/omkarcloud/assets/master/images/gitpod-continue.png) 14 | 15 | 3. In the terminal, run the following command to start scraping: 16 | ```bash 17 | python main.py 18 | ``` 19 | 20 | 4. Once the scraper has finished running, download the data from the `output` folder. 
21 | ![Data Google Scraper Download](https://raw.githubusercontent.com/omkarcloud/assets/master/images/download-data.png) 22 | 23 | ### ❓ Need More Help or Have Additional Questions? 24 | 25 | For further help, contact us on WhatsApp. We'll be happy to help you out. 26 | 27 | [![Contact Us on WhatsApp about Google Scraper](https://raw.githubusercontent.com/omkarcloud/assets/master/images/whatsapp-us.png)](https://api.whatsapp.com/send?phone=918295042963&text=Hi,%20I%20would%20like%20to%20learn%20more%20about%20your%20products.) 28 | 29 | ## Love It? [Star It! ⭐](https://github.com/omkarcloud/google-scraper/stargazers) 30 | 31 | ## Made with ❤️ using [Botasaurus Web Scraping Framework](https://github.com/omkarcloud/botasaurus) -------------------------------------------------------------------------------- /cache/search/9f1f0f498f7a94591b61513c14d07116.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "count": 9, 4 | "next": "https://google-scraper.p.rapidapi.com/search/?query=watermelon&page=3", 5 | "previous": "https://google-scraper.p.rapidapi.com/search/?query=watermelon&page=1", 6 | "results": [ 7 | { 8 | "title": "Watermelon", 9 | "link": "https://en.wikipedia.org/wiki/Watermelon", 10 | "description": "It has a thin, light and dark green striped rind, and sweet white flesh with black seeds. It can grow well in cool climates. It was originally brought to ..." 11 | }, 12 | { 13 | "title": "Top 9 Health Benefits of Eating Watermelon", 14 | "link": "https://www.healthline.com/nutrition/watermelon-health-benefits", 15 | "description": "Watermelon contains a variety of nutrients, including potassium, magnesium, and vitamins A and C. It's also relatively low in calories, ..." 16 | }, 17 | { 18 | "title": "Welcome to Watermelon.org", 19 | "link": "https://www.watermelon.org/", 20 | "description": "Learn about the wonderful world of watermelon with recipes, carvings, educational resources, activities and industry promotion materials." 21 | }, 22 | { 23 | "title": "Watermelon: Health benefits, nutrition, and risks", 24 | "link": "https://www.medicalnewstoday.com/articles/266886", 25 | "description": "Watermelon is around 90% water and also provides electrolytes, such as potassium. This makes it a healthful choice of snack during the hot summer months." 26 | }, 27 | { 28 | "title": "Watermelon: Health benefits, risks & nutrition facts", 29 | "link": "https://www.livescience.com/46019-watermelon-nutrition.html", 30 | "description": "Watermelon (Citrullus lanatus) is a delicious fruit which contains many nutrients, is low in calories and is free of fat." 31 | }, 32 | { 33 | "title": "The Health Benefits of Watermelon", 34 | "link": "https://www.webmd.com/diet/ss/slideshow-health-benefits-of-watermelon", 35 | "description": "Just one medium slice of watermelon gives you contains 9-11% of the vitamin A you need each day. This nutrient is one of the keys to keeping your eyes healthy." 36 | }, 37 | { 38 | "title": "Watermelon | Nutrition, Health Benefits, Recipes", 39 | "link": "https://www.britannica.com/plant/watermelon", 40 | "description": "The fruit is a type of berry known botanically as a pepo. The sweet juicy flesh may be reddish, white, or yellow; flesh colour, shape of the ..." 
41 | }, 42 | { 43 | "title": "Watermelon's Benefits", 44 | "link": "https://www.watermelon.org/nutrition/watermelons-benefits/", 45 | "description": "Watermelon contains higher levels of lycopene than any other fresh fruit or vegetable (12.7 mg per 2-cup serving) and is part of a healthy diet." 46 | }, 47 | { 48 | "title": "Watermelon 101: Nutrition Facts and Health Benefits", 49 | "link": "https://www.healthline.com/nutrition/foods/watermelon", 50 | "description": "Nutrition Facts. Watermelon consists mostly of water (91%) and carbs (7.5%). It provides almost no protein or fat and is very low in calories." 51 | }, 52 | { 53 | "title": "Watermelon 101: Nutrition Facts and Health Benefits", 54 | "link": "https://www.healthline.com/nutrition/foods/watermelon", 55 | "description": "Nutrition Facts. Watermelon consists mostly of water (91%) and carbs (7.5%). It provides almost no protein or fat and is very low in calories." 56 | } 57 | ] 58 | }, 59 | "error": null 60 | } -------------------------------------------------------------------------------- /cache/search/f0d63bfeda464e93d5c6dfcd413ea193.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": { 3 | "count": 9, 4 | "next": "https://google-scraper.p.rapidapi.com/search/?query=mango&page=3", 5 | "previous": "https://google-scraper.p.rapidapi.com/search/?query=mango&page=1", 6 | "results": [ 7 | { 8 | "title": "Fashion for Women | Mango USA", 9 | "link": "https://shop.mango.com/us", 10 | "description": "Latest trends in fashion for women. Discover our designs: dresses, tops, jeans, shoes, bags and accessories. Free delivery from US$50 - Returns within 30 ..." 11 | }, 12 | { 13 | "title": "Mango", 14 | "link": "https://en.wikipedia.org/wiki/Mango", 15 | "description": "A mango is an edible stone fruit produced by the tropical tree Mangifera indica. It is believed to have originated in southern Asia, particularly in eastern ..." 16 | }, 17 | { 18 | "title": "Mango Languages: Homepage", 19 | "link": "https://mangolanguages.com/", 20 | "description": "Native speaker and sophisticated AI-generated audio. Real-life Conversations. Cultural Context. The best way to learn a foreign language." 21 | }, 22 | { 23 | "title": "Women's Fashion Outlet 2023 | Mango Outlet USA", 24 | "link": "https://www.mangooutlet.com/us/women", 25 | "description": "Find in Mango Outlet factory exclusive promotions and special prices in women's fashion: dresses, t-shirts, jeans, shoes, bags and accessories." 26 | }, 27 | { 28 | "title": "Mangos - All You Need To Know About Mango - Mango.org", 29 | "link": "https://www.mango.org/", 30 | "description": "A mango tree's leaves are long and leathery. They are a deep shade of green and around five to sixteen inches in length, alternating on a slim woody branch." 31 | }, 32 | { 33 | "title": "Mangoes: Benefits, nutrition, and recipes", 34 | "link": "https://www.medicalnewstoday.com/articles/275921", 35 | "description": "Health experts consider mangoes to contain medium to high amounts of potassium. A 165-gramTrusted Source (g) cup of raw mango provides 277 ..." 36 | }, 37 | { 38 | "title": "Mango | Description, History, Cultivation, & Facts", 39 | "link": "https://www.britannica.com/plant/mango-plant-and-fruit", 40 | "description": "The tree is evergreen, often reaching 15\u201318 metres (50\u201360 feet) in height and attaining great age. The simple leaves are lanceolate, up to 30 cm ..." 
41 | }, 42 | { 43 | "title": "MANGO (@mango) \u2022 Instagram photos and videos", 44 | "link": "https://www.instagram.com/mango/", 45 | "description": "MANGO Official Account Welcome to the home of Mediterranean style and culture. Mention @mango and use #MangoCommunity to be featured. go.mango/shop." 46 | }, 47 | { 48 | "title": "mango - Wiktionary, the free dictionary", 49 | "link": "https://en.wiktionary.org/wiki/mango", 50 | "description": "The Oxford English Dictionary says it ultimately stems from Malayalam \u0d2e\u0d3e\u0d19\u0d4d\u0d19 (m\u0101\u1e45\u1e45a, \u201cunripe mango\u201d) (\u0d2e\u0d3e\u0d35\u0d4d (m\u0101v\u016d, \u201cmango tree\u201d) + \u0d15\u0d3e\u0d2f (k\u0101ya, \u201cunripe fruit\u201d)), ..." 51 | }, 52 | { 53 | "title": "Mango: Nutrition, Health Benefits, and How to Eat It", 54 | "link": "https://www.healthline.com/nutrition/mango", 55 | "description": "One of its most impressive nutrient facts is that just 1 cup (165 grams) of fresh mango provides nearly 67% of the DV for vitamin C. This water- ..." 56 | } 57 | ] 58 | }, 59 | "error": null 60 | } -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | bot-1: 4 | restart: "no" 5 | shm_size: 800m 6 | build: 7 | dockerfile: Dockerfile 8 | context: . 9 | volumes: 10 | - ./output:/app/output 11 | - ./tasks:/app/tasks 12 | - ./profiles:/app/profiles 13 | - ./profiles.json:/app/profiles.json 14 | - ./local_storage.json:/app/local_storage.json -------------------------------------------------------------------------------- /images/google-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omkarcloud/google-scraper/f0063eba18fa976eb3970f157a59cb0d9f04aedc/images/google-logo.png -------------------------------------------------------------------------------- /images/google-scraper-csv-result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omkarcloud/google-scraper/f0063eba18fa976eb3970f157a59cb0d9f04aedc/images/google-scraper-csv-result.png -------------------------------------------------------------------------------- /images/google-scraper-featured-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omkarcloud/google-scraper/f0063eba18fa976eb3970f157a59cb0d9f04aedc/images/google-scraper-featured-image.png -------------------------------------------------------------------------------- /images/google-scraper-social-featured-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/omkarcloud/google-scraper/f0063eba18fa976eb3970f157a59cb0d9f04aedc/images/google-scraper-social-featured-image.png -------------------------------------------------------------------------------- /local_storage.json: -------------------------------------------------------------------------------- 1 | { 2 | "count": 11, 3 | "credits_used": 22 4 | } -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from src import Google 2 | 3 | queries = [ 4 | "Mango", 5 | "Watermelon", 6 | ] 7 | 8 | Google.search(queries, max=10) 9 | 
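For reference, `Google.search` (defined in `src/google_scraper.py`) also accepts a single query string, an optional RapidAPI `key`, and a `use_cache` flag, so `main.py` could equally be written as, for example:

```python
from src import Google

# Single query, with an API key and the local response cache disabled;
# all three parameters come from the Google.search signature.
Google.search("Orange", max=10, key="YOUR_API_KEY", use_cache=False)
```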
--------------------------------------------------------------------------------
/profiles.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | botasaurus
2 | casefy
3 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from .google_scraper import Google
--------------------------------------------------------------------------------
/src/google_scraper.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union, Dict
2 | from botasaurus import bt
3 | from .write_output import write_output
4 | 
5 | from .search import FAILED_DUE_TO_CREDITS_EXHAUSTED, FAILED_DUE_TO_NO_KEY, FAILED_DUE_TO_NOT_SUBSCRIBED, FAILED_DUE_TO_UNKNOWN_ERROR, search
6 | 
7 | 
8 | def clean_data(items):
9 |     success, credits_exhausted, not_subscribed, unknown_error, no_key = [], [], [], [], []
10 | 
11 |     for detail in items:
12 |         if detail.get("error") is None:
13 |             success.append(detail)
14 |         elif detail["error"] == FAILED_DUE_TO_CREDITS_EXHAUSTED:
15 |             credits_exhausted.append(detail)
16 |         elif detail["error"] == FAILED_DUE_TO_NOT_SUBSCRIBED:
17 |             not_subscribed.append(detail)
18 |         elif detail["error"] == FAILED_DUE_TO_UNKNOWN_ERROR:
19 |             unknown_error.append(detail)
20 |         elif detail["error"] == FAILED_DUE_TO_NO_KEY:
21 |             no_key.append(detail)
22 | 
23 |     return success, credits_exhausted, not_subscribed, unknown_error, no_key
24 | 
25 | def print_data_errors(credits_exhausted, not_subscribed, unknown_error, no_key):
26 | 
27 |     if credits_exhausted:
28 |         name = "queries" if len(credits_exhausted) > 1 else "query"
29 |         print(f"Could not get data for {len(credits_exhausted)} {name} due to credit exhaustion. Please consider upgrading your plan by visiting https://rapidapi.com/Chetan11dev/api/google-scraper/pricing to continue scraping data.")
30 | 
31 |     if not_subscribed:
32 |         name = "queries" if len(not_subscribed) > 1 else "query"
33 |         print(f"Could not get data for {len(not_subscribed)} {name} as you are not subscribed to the Google Scraper API. Please subscribe to a free plan by visiting https://rapidapi.com/Chetan11dev/api/google-scraper/pricing")
34 | 
35 |     if unknown_error:
36 |         name = "queries" if len(unknown_error) > 1 else "query"
37 |         print(f"Could not get data for {len(unknown_error)} {name} due to an unknown error.")
38 | 
39 |     if no_key:
40 |         name = "queries" if len(no_key) > 1 else "query"
41 |         print(f"Could not get data for {len(no_key)} {name} because no API key was provided. Please subscribe to a free plan by visiting https://rapidapi.com/Chetan11dev/api/google-scraper/pricing")
42 | 
43 | 
44 | class Google:
45 | 
46 |     @staticmethod
47 |     def search(query: Union[str, List[str]], max: Optional[int] = None, key: Optional[str] = None, use_cache: bool = True) -> List[Dict]:
48 |         """
49 |         Scrape Google search results for one query or a list of queries.
50 |         :param use_cache: Boolean indicating whether to use cached data.
51 |         :return: List of result dictionaries with the scraped data.
52 |         """
53 |         cache = use_cache
54 |         if isinstance(query, str):
55 |             query = [query]
56 | 
57 |         queries = [{"query": q, "max": max} for q in query]
58 |         result = []
59 |         for item in queries:
60 |             # TODO: max fixes
61 |             data = item
62 |             metadata = {"key": key}
63 | 
64 |             result_item = search(data, cache=cache, metadata=metadata)
65 | 
66 |             success, credits_exhausted, not_subscribed, unknown_error, no_key = clean_data([result_item])
67 |             print_data_errors(credits_exhausted, not_subscribed, unknown_error, no_key)
68 | 
69 |             if success:
70 |                 data = result_item.get('data')
71 |                 if not data:
72 |                     data = {}
73 | 
74 |                 result_item = data.get('results', [])
75 |                 result.extend(result_item)
76 |                 write_output(item['query'], result_item, None)
77 | 
78 |         if result:
79 |             # bt.write_json(result, "result")
80 |             write_output('_all', result, None, lambda x: x)
81 | 
82 |         search.close()
83 | 
84 |         return result
--------------------------------------------------------------------------------
/src/search.py:
--------------------------------------------------------------------------------
1 | 
2 | from botasaurus import bt
3 | from botasaurus.cache import DontCache
4 | from botasaurus import cl
5 | from botasaurus import request
6 | from time import sleep
7 | from .utils import default_request_options
8 | import requests
9 | 
10 | FAILED_DUE_TO_CREDITS_EXHAUSTED = "FAILED_DUE_TO_CREDITS_EXHAUSTED"
11 | FAILED_DUE_TO_NOT_SUBSCRIBED = "FAILED_DUE_TO_NOT_SUBSCRIBED"
12 | FAILED_DUE_TO_NO_KEY = "FAILED_DUE_TO_NO_KEY"
13 | FAILED_DUE_TO_UNKNOWN_ERROR = "FAILED_DUE_TO_UNKNOWN_ERROR"
14 | 
15 | def update_credits():
16 |     credits_used = bt.LocalStorage.get_item("credits_used", 0)
17 |     bt.LocalStorage.set_item("credits_used", credits_used + 1)
18 | 
19 | def do_request(data, retry_count=3):
20 |     params = data["params"]
21 |     link = params["link"]
22 |     key = data["key"]
23 | 
24 |     if retry_count == 0:
25 |         print("Failed to get data after 3 retries")
26 |         return {
27 |             "data": None,
28 |             "error": FAILED_DUE_TO_UNKNOWN_ERROR,
29 |         }
30 | 
31 |     headers = {
32 |         "X-RapidAPI-Key": key,
33 |         "X-RapidAPI-Host": "google-scraper.p.rapidapi.com"
34 |     }
35 | 
36 |     response = requests.get(link, headers=headers)
37 |     response_data = response.json()
38 | 
39 |     if response.status_code == 200 or response.status_code == 404:
40 |         message = response_data.get("message", "")
41 |         if "API doesn't exists" in message:
42 |             return {
43 |                 "data": None,
44 |                 "error": FAILED_DUE_TO_UNKNOWN_ERROR
45 |             }
46 | 
47 |         update_credits()
48 |         if response.status_code == 404:
49 |             print("No data found")
50 | 
51 |         return {
52 |             "data": response_data,
53 |             "error": None
54 |         }
55 |     else:
56 |         message = response_data.get("message", "")
57 | 
58 |         if "exceeded the MONTHLY quota" in message:
59 |             return {
60 |                 "data": None,
61 |                 "error": FAILED_DUE_TO_CREDITS_EXHAUSTED
62 |             }
63 |         elif "exceeded the rate limit per second for your plan" in message or "many requests" in message:
64 |             # Rate limited: back off briefly, then retry.
65 |             sleep(2)
66 |             return do_request(data, retry_count - 1)
67 |         elif "You are not subscribed to this API." in message:
68 |             return {
69 |                 "data": None,
70 |                 "error": FAILED_DUE_TO_NOT_SUBSCRIBED
71 |             }
72 | 
73 |         print(f"Error: {response.status_code}", response_data)
74 | 
75 |         return {
76 |             "data": None,
77 |             "error": FAILED_DUE_TO_UNKNOWN_ERROR,
78 |         }
79 | 
80 | @request(**default_request_options)
81 | def search(_, data, metadata):
82 |     if not metadata.get('key'):
83 |         return DontCache({
84 |             "data": None,
85 |             "error": FAILED_DUE_TO_NO_KEY
86 |         })
87 | 
88 |     max_items = data['max']
89 |     url = "https://google-scraper.p.rapidapi.com/search/"
90 |     qp = {"query": data['query']}
91 |     params = {**qp, 'link': cl.join_link(url, query_params=qp)}
92 | 
93 |     request_data = {**metadata, "params": params}
94 |     result = do_request(request_data)
95 |     initial_results = cl.select(result, 'data', 'results', default=[])
96 | 
97 |     if not cl.select(result, 'error'):
98 |         print(f"Got {len(initial_results)} results")
99 | 
100 |     # Follow the "next" links until we have enough results or run out of pages.
101 |     while cl.select(result, 'data', 'next') and (max_items is None or len(initial_results) < max_items):
102 |         next_link = cl.select(result, 'data', 'next')
103 | 
104 |         params = {**qp, 'link': next_link}
105 |         request_data = {**metadata, "params": params}
106 |         result = do_request(request_data)
107 |         if result.get('error'):
108 |             break
109 |         more_results = cl.select(result, 'data', 'results', default=[])
110 |         print(f"Got {len(more_results)} more results")
111 |         initial_results.extend(more_results)
112 | 
113 |     if cl.select(result, 'error'):
114 |         return DontCache(result)
115 |     else:
116 |         if max_items is not None:
117 |             initial_results = initial_results[:max_items]
118 | 
119 |         result['data']['results'] = initial_results
120 |         return result
121 | 
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | 
2 | default_browser_options = {
3 |     "block_images": True,
4 |     "reuse_driver": True,
5 |     "keep_drivers_alive": True,
6 |     "close_on_crash": True,   # Production setting; set to False while debugging
7 |     "headless": True,         # Production setting; set to False to watch the browser
8 |     'output': None,           # Disable the decorator's default output; write_output handles files
9 | }
10 | 
11 | # TODO: add a max of 5 retries
12 | default_request_options = {
13 |     "close_on_crash": True,   # Production setting; set to False while debugging
14 |     'output': None,           # Disable the decorator's default output; write_output handles files
15 |     "raise_exception": True,
16 | }
--------------------------------------------------------------------------------
/src/write_output.py:
--------------------------------------------------------------------------------
1 | from botasaurus import bt
2 | from botasaurus.decorators import print_filenames
3 | from .write_output_utils import kebab_case, make_folders
4 | 
5 | 
6 | def create_json(path, data):
7 |     bt.write_json(data, path, False)
8 | 
9 | 
10 | def create_csv(path, data):
11 |     bt.write_csv(data, path, False)
12 | 
13 | def make_filename(query_kebab, type):
14 |     return f"{query_kebab}.{type}"
15 | 
16 | def create(data, csv_path, json_path, query_kebab):
17 |     written = []
18 | 
19 |     fname = json_path + make_filename(query_kebab, "json")
20 |     create_json(fname, data)
21 |     written.append(fname)
22 | 
23 |     fname = csv_path + make_filename(query_kebab, "csv")
24 |     create_csv(fname, data)
25 |     written.append(fname)
26 | 
27 |     print_filenames(written)
28 | 
29 | def write_output(query, data, entity_type, transformer=kebab_case):
30 |     # entity_type is kept for call-site compatibility but is currently unused.
31 |     query_kebab = transformer(query)
32 |     make_folders(query_kebab)
33 | 
34 |     csv_path = f"output/{query_kebab}/csv/"
35 |     json_path = f"output/{query_kebab}/json/"
36 | 
37 |     create(data, csv_path, json_path, query_kebab)
38 | 
--------------------------------------------------------------------------------
/src/write_output_utils.py:
--------------------------------------------------------------------------------
1 | from botasaurus.decorators_utils import create_directory_if_not_exists
2 | from casefy import kebabcase
3 | 
4 | 
5 | def kebab_case(s):
6 |     return kebabcase(s)
7 | 
8 | def make_folders(query_kebab):
9 |     create_directory_if_not_exists(f"output/{query_kebab}/")
10 |     create_directory_if_not_exists(f"output/{query_kebab}/json/")
11 |     create_directory_if_not_exists(f"output/{query_kebab}/csv/")
12 | 
13 | # Currently unused; kept from an earlier output-naming scheme.
14 | def format(query_kebab, type, name):
15 |     return f"{name}-of-{query_kebab}.{type}"
16 | 
--------------------------------------------------------------------------------
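As a closing illustration: `write_output` kebab-cases each query with the `casefy`-based helper above before creating its folders, which is how a query maps to its output directory. A small sketch of that mapping:

```python
from casefy import kebabcase

# src/write_output_utils.py wraps casefy's kebabcase, so a query maps
# to its output folder like this:
query = "Ice Cream"
print(kebabcase(query))                   # ice-cream
print(f"output/{kebabcase(query)}/csv/")  # output/ice-cream/csv/
```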