├── Google2Csv.ipynb └── README.md /Google2Csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Google2Csv.ipynb", 7 | "provenance": [], 8 | "toc_visible": true, 9 | "authorship_tag": "ABX9TyN/EIlGRKp/NPVnoqRZtLV/", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "3w-fyktSoKxu", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "##GoogleScraper" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "zCQtYO-Po_kf", 42 | "colab_type": "code", 43 | "colab": { 44 | "base_uri": "https://localhost:8080/", 45 | "height": 34 46 | }, 47 | "outputId": "2d2221b1-b425-4723-adbc-7aa415ecf254" 48 | }, 49 | "source": [ 50 | "!pip install beautifulsoup4\n", 51 | "\n", 52 | "from bs4 import BeautifulSoup\n", 53 | "import requests\n", 54 | "import pandas as pd" 55 | ], 56 | "execution_count": 1, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "text": [ 61 | "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.6/dist-packages (4.6.3)\n" 62 | ], 63 | "name": "stdout" 64 | } 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "metadata": { 70 | "id": "6JZjJa-joXEh", 71 | "colab_type": "code", 72 | "colab": {} 73 | }, 74 | "source": [ 75 | "def simpleGoogleSearch(query, start):\n", 76 | " results = []\n", 77 | "\n", 78 | " query = query.replace(' ', '+')\n", 79 | " URL = f\"https://google.com/search?q={query}&start={start}\"\n", 80 | "\n", 81 | " # desktop user-agent\n", 82 | " USER_AGENT = \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0\"\n", 83 | " \n", 84 | " headers = {\"user-agent\" : USER_AGENT}\n", 85 | " resp = requests.get(URL, headers=headers)\n", 86 | "\n", 87 | " if resp.status_code == 200:\n", 88 | " soup = BeautifulSoup(resp.content, \"html.parser\")\n", 89 | "\n", 90 | " for g in soup.find_all('div', class_='r'):\n", 91 | " anchors = g.find_all('a')\n", 92 | "\n", 93 | " if anchors:\n", 94 | " link = anchors[0]['href']\n", 95 | " title = g.find('h3').text\n", 96 | " item = {\"title\": title, \"link\": link}\n", 97 | " results.append(item)\n", 98 | "\n", 99 | " return results" 100 | ], 101 | "execution_count": 0, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "kBCwCEcGob2M", 108 | "colab_type": "code", 109 | "colab": {} 110 | }, 111 | "source": [ 112 | "def googleToPandas(googleQuery):\n", 113 | " resultsCounter = 0\n", 114 | " resultsList = []\n", 115 | "\n", 116 | " while True:\n", 117 | " pageResults = simpleGoogleSearch(googleQuery, resultsCounter)\n", 118 | " \n", 119 | " if not pageResults: break\n", 120 | " else: \n", 121 | " resultsList.extend(pageResults)\n", 122 | " resultsCounter = resultsCounter + 10\n", 123 | "\n", 124 | " return pd.DataFrame(resultsList)" 125 | ], 126 | "execution_count": 0, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "SoUutC4DojV7", 133 | "colab_type": "code", 134 | "colab": {} 135 | }, 136 | "source": [ 137 | "googleSearchQuery = \"Covid 19\"" 138 | ], 139 | "execution_count": 0, 140 | "outputs": [] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "metadata": { 145 | "id": "r44AZDaBpjGO", 146 | "colab_type": "code", 147 | "colab": { 148 | "base_uri": "https://localhost:8080/", 149 | "height": 419 150 | }, 151 | "outputId": "d7a82dce-8463-45fa-c088-8cac5dba3fa5" 152 | }, 153 | "source": [ 154 | "results = googleToPandas(googleSearchQuery)\n", 155 | "results" 156 | ], 157 | "execution_count": 5, 158 | "outputs": [ 159 | { 160 | "output_type": "execute_result", 161 | "data": { 162 | "text/html": [ 163 | "
\n", 164 | "\n", 177 | "\n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | "
titlelink
0Coronavirus Disease 2019 (COVID-19) | CDChttps://www.cdc.gov/coronavirus/2019-ncov/inde...
1Cases in the U.S. | CDChttps://www.cdc.gov/coronavirus/2019-ncov/case...
2Coronavirus (COVID-19) frequently asked questi...https://www.cdc.gov/coronavirus/2019-ncov/faq....
3Coronavirus disease 2019 - World Health Organi...https://www.who.int/emergencies/diseases/novel...
4Coronavirus (COVID-19) events as they happenhttps://www.who.int/emergencies/diseases/novel...
.........
231This web page is parked FREE, courtesy of GoDa...http://covid-2019.me/
232COVID-19 - Doctors eBookhttp://doctorsebook.com/covid-19/
233COVID-19 Updatehttps://www.oepreview.com/covid19sample/covid-...
234Page 2 – Corona Virus (Covid-19) - Covid-19 Ou...https://covid-19outbreak.co/page/2/
235This web page is parked FREE, courtesy of GoDa...http://covid-2019.me/
\n", 243 | "

236 rows × 2 columns

\n", 244 | "
" 245 | ], 246 | "text/plain": [ 247 | " title link\n", 248 | "0 Coronavirus Disease 2019 (COVID-19) | CDC https://www.cdc.gov/coronavirus/2019-ncov/inde...\n", 249 | "1 Cases in the U.S. | CDC https://www.cdc.gov/coronavirus/2019-ncov/case...\n", 250 | "2 Coronavirus (COVID-19) frequently asked questi... https://www.cdc.gov/coronavirus/2019-ncov/faq....\n", 251 | "3 Coronavirus disease 2019 - World Health Organi... https://www.who.int/emergencies/diseases/novel...\n", 252 | "4 Coronavirus (COVID-19) events as they happen https://www.who.int/emergencies/diseases/novel...\n", 253 | ".. ... ...\n", 254 | "231 This web page is parked FREE, courtesy of GoDa... http://covid-2019.me/\n", 255 | "232 COVID-19 - Doctors eBook http://doctorsebook.com/covid-19/\n", 256 | "233 COVID-19 Update https://www.oepreview.com/covid19sample/covid-...\n", 257 | "234 Page 2 – Corona Virus (Covid-19) - Covid-19 Ou... https://covid-19outbreak.co/page/2/\n", 258 | "235 This web page is parked FREE, courtesy of GoDa... http://covid-2019.me/\n", 259 | "\n", 260 | "[236 rows x 2 columns]" 261 | ] 262 | }, 263 | "metadata": { 264 | "tags": [] 265 | }, 266 | "execution_count": 5 267 | } 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "metadata": { 273 | "id": "y-wbjJuYqnOT", 274 | "colab_type": "code", 275 | "colab": {} 276 | }, 277 | "source": [ 278 | "results.to_csv('GoogleResults.csv', index=False)\n", 279 | "results.to_excel('GoogleResults.xlsx', index=False)\n", 280 | "results.to_json('GoogleResults.jsonl', orient='records', lines=True)" 281 | ], 282 | "execution_count": 0, 283 | "outputs": [] 284 | } 285 | ] 286 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google2Csv 2 | [![Colab badge](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/psalias2006/Google2Csv/blob/master/Google2Csv.ipynb) 3 | 4 | This is a dead simple tutorial on how to scrape google using BeautifulSoup and save the results on a csv/xlsx/jsonl file 5 | 6 | 7 | ![alt text](https://i.imgur.com/G8acZQf.png) 8 | 9 | 10 | ## License 11 | [MIT](https://choosealicense.com/licenses/mit/) 12 | 13 | #### Legal & Disclaimer 14 | *This script is just a proof of concept. The author isn't responsible for the actions of the end users.* 15 | --------------------------------------------------------------------------------