├── Google2Csv.ipynb
└── README.md
/Google2Csv.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Google2Csv.ipynb",
7 | "provenance": [],
8 | "toc_visible": true,
9 | "authorship_tag": "ABX9TyN/EIlGRKp/NPVnoqRZtLV/",
10 | "include_colab_link": true
11 | },
12 | "kernelspec": {
13 | "name": "python3",
14 | "display_name": "Python 3"
15 | }
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | ""
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "3w-fyktSoKxu",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## GoogleScraper"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "metadata": {
41 | "id": "zCQtYO-Po_kf",
42 | "colab_type": "code",
43 | "colab": {
44 | "base_uri": "https://localhost:8080/",
45 | "height": 34
46 | },
47 | "outputId": "2d2221b1-b425-4723-adbc-7aa415ecf254"
48 | },
49 | "source": [
50 | "!pip install beautifulsoup4\n",
51 | "\n",
52 | "from bs4 import BeautifulSoup\n",
53 | "import requests\n",
54 | "import pandas as pd"
55 | ],
56 | "execution_count": 1,
57 | "outputs": [
58 | {
59 | "output_type": "stream",
60 | "text": [
61 | "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.6/dist-packages (4.6.3)\n"
62 | ],
63 | "name": "stdout"
64 | }
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "metadata": {
70 | "id": "6JZjJa-joXEh",
71 | "colab_type": "code",
72 | "colab": {}
73 | },
74 | "source": [
def simpleGoogleSearch(query, start, timeout=10):
    """Fetch one page of Google search results and return title/link pairs.

    Args:
        query: Search phrase as plain text. It is URL-encoded by requests,
            so spaces and reserved characters ('&', '#', '+', ...) are safe.
        start: Offset of the first result, sent as the ``start`` query
            parameter.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            requests may block indefinitely on a stalled connection.

    Returns:
        A list of ``{"title": ..., "link": ...}`` dicts, one per parsed
        result block. The list is empty when the response status is not 200
        or when no matching blocks are found.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged when the
            request fails or times out.
    """
    # desktop user-agent
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    headers = {"user-agent": USER_AGENT}

    # Let requests assemble the query string: replacing only spaces with '+'
    # leaves characters such as '&' or '#' unescaped and corrupts the URL.
    resp = requests.get(
        "https://google.com/search",
        params={"q": query, "start": start},
        headers=headers,
        timeout=timeout,
    )

    results = []
    if resp.status_code != 200:
        return results

    soup = BeautifulSoup(resp.content, "html.parser")

    # NOTE(review): the parser relies on result blocks being <div class="r">;
    # confirm this selector still matches the markup Google serves.
    for g in soup.find_all('div', class_='r'):
        anchors = g.find_all('a')
        heading = g.find('h3')

        # Skip blocks without an anchor or a heading instead of raising.
        if not anchors or heading is None:
            continue

        link = anchors[0].get('href')
        if link is None:
            # First anchor carries no href, so there is nothing to record.
            continue

        results.append({"title": heading.text, "link": link})

    return results
100 | ],
101 | "execution_count": 0,
102 | "outputs": []
103 | },
104 | {
105 | "cell_type": "code",
106 | "metadata": {
107 | "id": "kBCwCEcGob2M",
108 | "colab_type": "code",
109 | "colab": {}
110 | },
111 | "source": [
def googleToPandas(googleQuery, maxResults=None, searchPage=None):
    """Page through search results for a query and collect them in a DataFrame.

    Pages are requested at offsets 0, 10, 20, ... until a page comes back
    empty or, when ``maxResults`` is given, until that many rows have been
    gathered.

    Args:
        googleQuery: Search phrase forwarded to the page-fetching function.
        maxResults: Optional upper bound on the number of rows returned.
            ``None`` (the default) keeps fetching until an empty page.
        searchPage: Optional callable ``(query, start) -> list of dicts``
            used to fetch one page. Defaults to ``simpleGoogleSearch``.

    Returns:
        A pandas DataFrame with the columns ``title`` and ``link``. The
        columns are present even when no results were found.
    """
    if searchPage is None:
        searchPage = simpleGoogleSearch

    # Step by which the result offset advances between requests.
    PAGE_SIZE = 10

    resultsList = []
    resultsCounter = 0

    while maxResults is None or len(resultsList) < maxResults:
        pageResults = searchPage(googleQuery, resultsCounter)

        # An empty page means there is nothing further to fetch.
        if not pageResults:
            break

        resultsList.extend(pageResults)
        resultsCounter += PAGE_SIZE

    # The last page may overshoot the requested cap; trim the surplus.
    if maxResults is not None:
        resultsList = resultsList[:maxResults]

    # Explicit columns keep the schema stable for an empty result list.
    return pd.DataFrame(resultsList, columns=["title", "link"])
125 | ],
126 | "execution_count": 0,
127 | "outputs": []
128 | },
129 | {
130 | "cell_type": "code",
131 | "metadata": {
132 | "id": "SoUutC4DojV7",
133 | "colab_type": "code",
134 | "colab": {}
135 | },
136 | "source": [
# Search phrase handed to googleToPandas by the cell that runs the search.
googleSearchQuery: str = 'Covid 19'
138 | ],
139 | "execution_count": 0,
140 | "outputs": []
141 | },
142 | {
143 | "cell_type": "code",
144 | "metadata": {
145 | "id": "r44AZDaBpjGO",
146 | "colab_type": "code",
147 | "colab": {
148 | "base_uri": "https://localhost:8080/",
149 | "height": 419
150 | },
151 | "outputId": "d7a82dce-8463-45fa-c088-8cac5dba3fa5"
152 | },
153 | "source": [
# Run the paginated search for the configured query; the bare name on the
# last line makes the notebook render the resulting DataFrame.
results = googleToPandas(googleQuery=googleSearchQuery)
results
156 | ],
157 | "execution_count": 5,
158 | "outputs": [
159 | {
160 | "output_type": "execute_result",
161 | "data": {
162 | "text/html": [
163 | "
\n", 181 | " | title | \n", 182 | "link | \n", 183 | "
---|---|---|
0 | \n", 188 | "Coronavirus Disease 2019 (COVID-19) | CDC | \n", 189 | "https://www.cdc.gov/coronavirus/2019-ncov/inde... | \n", 190 | "
1 | \n", 193 | "Cases in the U.S. | CDC | \n", 194 | "https://www.cdc.gov/coronavirus/2019-ncov/case... | \n", 195 | "
2 | \n", 198 | "Coronavirus (COVID-19) frequently asked questi... | \n", 199 | "https://www.cdc.gov/coronavirus/2019-ncov/faq.... | \n", 200 | "
3 | \n", 203 | "Coronavirus disease 2019 - World Health Organi... | \n", 204 | "https://www.who.int/emergencies/diseases/novel... | \n", 205 | "
4 | \n", 208 | "Coronavirus (COVID-19) events as they happen | \n", 209 | "https://www.who.int/emergencies/diseases/novel... | \n", 210 | "
... | \n", 213 | "... | \n", 214 | "... | \n", 215 | "
231 | \n", 218 | "This web page is parked FREE, courtesy of GoDa... | \n", 219 | "http://covid-2019.me/ | \n", 220 | "
232 | \n", 223 | "COVID-19 - Doctors eBook | \n", 224 | "http://doctorsebook.com/covid-19/ | \n", 225 | "
233 | \n", 228 | "COVID-19 Update | \n", 229 | "https://www.oepreview.com/covid19sample/covid-... | \n", 230 | "
234 | \n", 233 | "Page 2 – Corona Virus (Covid-19) - Covid-19 Ou... | \n", 234 | "https://covid-19outbreak.co/page/2/ | \n", 235 | "
235 | \n", 238 | "This web page is parked FREE, courtesy of GoDa... | \n", 239 | "http://covid-2019.me/ | \n", 240 | "
236 rows × 2 columns
\n", 244 | "