├── README.md
├── knowledge-panel-scraper.py
└── requirements.txt
/README.md:
--------------------------------------------------------------------------------
# knowledge-panel-scraper
knowledge-panel-scraper is a Python 3 command-line tool that scrapes Google's [Knowledge Panels](https://support.google.com/business/answer/6331288?hl=en) to retrieve the phone numbers, addresses, hours, and websites of a list of businesses given as input.

## Installation

Use git to clone the repository, then install the required libraries with the package manager [pip](https://pip.pypa.io/en/stable/).

```bash
git clone https://github.com/jnovak98/knowledge-panel-scraper.git
cd knowledge-panel-scraper
pip install -r requirements.txt
```

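If you want to keep the dependencies isolated, you can optionally install inside a virtual environment first (standard Python tooling, nothing project-specific):

```bash
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
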
## Usage
```bash
python knowledge-panel-scraper.py inputfile.csv
```

inputfile.csv should be a plain-text CSV file in which each row holds the data for one business; the script joins a row's fields with spaces to build the Google search query.
For example:
```csv
"Bobcat of Monroe,Monroe,NC",1711 MORGAN MILL ROAD,MONROE,NC,28110,(704) 289-2200
"Kelly's Garage,Perry,NY",2868 STATE ROUTE 246,PERRY,NY,14530,(585) 237-2504
"Hoxie Implement Co,Hoxie,KS",933 OAK AVENUE,HOXIE,KS,67740-0587,(785) 675-3201
"Duhon Machinery,St. Rose,LA",10460 WEST AIRLINE HIGHWAY,ST. ROSE,LA,70087,(504) 466-5495
```

The results are saved to results.csv in the directory the script is run from.
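
The columns of results.csv match the field names used in knowledge-panel-scraper.py:

```csv
query,exists,name,claimed,phone_number,address,website,Friday_hours,Saturday_hours,Sunday_hours,Monday_hours,Tuesday_hours,Wednesday_hours,Thursday_hours
```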

## Contributing
Pull requests are welcome.

## License
[MIT](https://choosealicense.com/licenses/mit/)
--------------------------------------------------------------------------------
/knowledge-panel-scraper.py:
--------------------------------------------------------------------------------
import csv, re, sys
import requests

# Request headers that mimic a desktop Firefox browser, so Google serves
# the normal HTML results page.
headers_Get = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Markers that locate each Knowledge Panel field in Google's result HTML.
# These are tied to Google's markup and need updating whenever it changes.
html_tags = {
    'knowledge_panel': 'kp-blk knowledge-panel',
    'claimed': "Own this business?",
    'name': "kno-ecr-pt kno-fb-ctx",
    'phone': 'LrzXr zdqRlf kno-fv',
    'days': "kc:/location/location:hours",
    'address': "kc:/location/location:address",
    'website': "IzNS7c duf-h"
}

# Regexes applied to the HTML just after each marker to capture its value.
# Like html_tags, these track Google's markup and break when it changes.
html_regexes = {
    'name': '<span>(.*)</span>',
    'phone': '<span>(.*?)</span>',
    'hours': '<td>(.*)</td>',
    'address': '<span>(.*)</span>',
    'website': 'href="(.*?)"'
}

days = ["Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"]

def google(q):
    """Fetch the Google search results page for query `q` and return its HTML."""
    s = requests.Session()
    q = '+'.join(q.split())
    url = 'https://www.google.com/search?q=' + q + '&ie=utf-8&oe=utf-8'
    r = s.get(url, headers=headers_Get)
    return r.text

def get_string_after_tag(string, tag, regex, distance):
    """Search the `distance` characters after the first occurrence of `tag`
    for `regex`, returning its first capture group (or None on no match)."""
    if tag not in string:
        return None

    index = string.find(tag)
    substr = string[index:index + distance]
    match = re.search(regex, substr)
    return match.group(1) if match else None

def get_details(query):
    """Scrape the Knowledge Panel for `query` and return a dict of results."""
    html_results = google(query)
    results = {'query': query}
    has_knowledge_panel = html_tags['knowledge_panel'] in html_results

    if has_knowledge_panel:
        results['exists'] = True
        results['name'] = get_string_after_tag(html_results, html_tags['name'], html_regexes['name'], 500)

        # Google only shows an "Own this business?" link on unclaimed panels.
        results['claimed'] = html_tags['claimed'] not in html_results

        phone_number = get_string_after_tag(html_results, html_tags['phone'], html_regexes['phone'], 200)
        if phone_number:
            results['phone_number'] = phone_number

        address = get_string_after_tag(html_results, html_tags['address'], html_regexes['address'], 1000)
        if address:
            results['address'] = address

        website = get_string_after_tag(html_results, html_tags['website'], html_regexes['website'], 200)
        if website:
            results['website'] = website

        # Opening hours appear as a day-by-day table after the hours marker;
        # search for each day's name within that block.
        if html_tags['days'] in html_results:
            hours_index = html_results.find(html_tags['days'])
            hours_substr = html_results[hours_index:hours_index + 2000]
            for day in days:
                results['{}_hours'.format(day)] = get_string_after_tag(hours_substr, day, html_regexes['hours'], 50)
    else:
        results['exists'] = False
    return results


if __name__ == "__main__":
    with open(sys.argv[1], newline='') as csvfile:
        with open('results.csv', 'w', newline='') as results:
            reader = csv.reader(csvfile)
            fieldnames = ['query', 'exists', 'name', 'claimed', 'phone_number', 'address', 'website',
                          'Friday_hours', 'Saturday_hours', 'Sunday_hours', 'Monday_hours',
                          'Tuesday_hours', 'Wednesday_hours', 'Thursday_hours']
            writer = csv.DictWriter(results, fieldnames=fieldnames)
            writer.writeheader()
            for row in reader:
                # Print the input line number as a simple progress indicator.
                print(reader.line_num)
                # Join the row's fields with spaces to form the search query.
                writer.writerow(get_details(" ".join(row)))

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests==2.22.0
--------------------------------------------------------------------------------