├── README.md
└── jd_scraper.py

/README.md:
--------------------------------------------------------------------------------
# Justdial-Scrapper
A working Justdial scraper. Enter the URL of a listing page and it will extract the business info from it into a CSV.

1. Enter the URL from which you want to extract information and save it in the `url` variable, e.g.:

       url = "https://www.justdial.com/Agra/Readymade-Garment-Retailers/nct-10401947/page-%s" % (page_number)

2. Change the name of the CSV file to be generated to an appropriate one, e.g.:

       out_file = open('Readymade-Garment-Retailers_agra.csv', 'w', newline='')

3. Run the file:

       python ./jd_scraper.py

You will see the results in the CSV as well as on the terminal.

Thanks!

Happy coding!

--------------------------------------------------------------------------------
/jd_scraper.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import urllib.request
import csv


def innerHTML(element):
    # Helper: return the inner HTML of a tag as a string.
    return element.decode_contents(formatter="html")

def get_name(body):
    return body.find('span', {'class': 'jcn'}).a.string

def get_phone_number(body):
    try:
        return body.find('p', {'class': 'contact-info'}).span.a.string
    except AttributeError:
        return ''

def get_rating(body):
    # Justdial encodes the star rating in CSS class names such as 's45';
    # strip the leading letter and divide by 10 to get e.g. 4.5.
    rating = 0.0
    text = body.find('span', {'class': 'star_m'})
    if text is not None:
        for item in text:
            rating += float(item['class'][0][1:]) / 10

    return rating

def get_rating_count(body):
    try:
        text = body.find('span', {'class': 'rt_count'}).string
    except AttributeError:
        return ''

    # Keep only the digits, e.g. '123 Ratings' -> '123'
    rating_count = ''.join(i for i in text if i.isdigit())
    return rating_count

def get_address(body):
    return body.find('span', {'class': 'mrehover'}).text.strip()

def get_location(body):
    # The map link's onclick handler carries the coordinates as arguments.
    text = body.find('a', {'class': 'rsmap'})
    if text is None:
        return None
    text_list = text['onclick'].split(",")

    latitude = text_list[3].strip().replace("'", "")
    longitude = text_list[4].strip().replace("'", "")

    return latitude + ", " + longitude

page_number = 1
service_count = 1


fields = ['Name', 'Phone', 'Rating', 'Rating Count', 'Address', 'Location']
out_file = open('Readymade-Garment-Retailers_agra.csv', 'w', newline='')
csvwriter = csv.DictWriter(out_file, delimiter=',', fieldnames=fields)

# Write the header row first
csvwriter.writeheader()

while True:

    # Stop after 50 result pages
    if page_number > 50:
        break

    url = "https://www.justdial.com/Agra/Readymade-Garment-Retailers/nct-10401947/page-%s" % (page_number)
    # Justdial rejects the default urllib user agent, so spoof a browser
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    page = urllib.request.urlopen(req)

    soup = BeautifulSoup(page.read(), "html.parser")
    services = soup.find_all('li', {'class': 'cntanr'})

    # Iterate through the results on the page (typically 10)
    for service_html in services:

        # Parse the HTML to fetch the data for one listing
        dict_service = {}
        name = get_name(service_html)
        phone = get_phone_number(service_html)
        rating = get_rating(service_html)
        count = get_rating_count(service_html)
        address = get_address(service_html)
        location = get_location(service_html)
        if name is not None:
            dict_service['Name'] = name
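        # Note: fields missing from dict_service come out as empty cells,
        # since csv.DictWriter fills absent keys with restval (None by default).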
        if phone is not None:
            dict_service['Phone'] = phone
        if rating is not None:
            dict_service['Rating'] = rating
        if count is not None:
            dict_service['Rating Count'] = count
        if address is not None:
            dict_service['Address'] = address
        if location is not None:
            dict_service['Location'] = location

        # Write the row to the CSV
        csvwriter.writerow(dict_service)

        print("#" + str(service_count) + " ", dict_service)
        service_count += 1

    page_number += 1

out_file.close()
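
# A minimal read-back sanity check (an addition, not part of the original
# script); it only assumes the output filename used above. DictReader consumes
# the header row, so this counts data rows only.
with open('Readymade-Garment-Retailers_agra.csv', newline='') as check_file:
    total_rows = len(list(csv.DictReader(check_file)))
print("Wrote %d services to the CSV" % total_rows)
--------------------------------------------------------------------------------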