├── README.md
└── jd_scraper.py

/README.md:
--------------------------------------------------------------------------------
# Justdial-Scrapper
A working Justdial scraper. Enter the URL of a listing page and it will extract the business info from it into a CSV.

1. Enter the URL from which you want to extract information and save it in the `url` variable, e.g.:

       url = "https://www.justdial.com/Agra/Readymade-Garment-Retailers/nct-10401947/page-%s" % (page_number)

2. Change the name of the CSV file to be generated to an appropriate one, e.g.:

       out_file = open('Readymade-Garment-Retailers_agra.csv', 'w', newline='')

3. Run the file:

       python ./jd_scraper.py

You will see the results in the CSV as well as on the terminal.

Thanks!

Happy coding!

--------------------------------------------------------------------------------
/jd_scraper.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import urllib.request
import csv


def innerHTML(element):
    # Helper: return the inner HTML of a tag as a string.
    return element.decode_contents(formatter="html")

def get_name(body):
    return body.find('span', {'class': 'jcn'}).a.string

def get_phone_number(body):
    try:
        return body.find('p', {'class': 'contact-info'}).span.a.string
    except AttributeError:
        return ''

def get_rating(body):
    # Justdial encodes the star rating in CSS class names such as 's45';
    # strip the leading letter and divide by 10 to get e.g. 4.5.
    rating = 0.0
    text = body.find('span', {'class': 'star_m'})
    if text is not None:
        for item in text:
            rating += float(item['class'][0][1:]) / 10

    return rating

def get_rating_count(body):
    try:
        text = body.find('span', {'class': 'rt_count'}).string
    except AttributeError:
        return ''

    # Keep only the digits, e.g. '123 Ratings' -> '123'
    rating_count = ''.join(i for i in text if i.isdigit())
    return rating_count

def get_address(body):
    return body.find('span', {'class': 'mrehover'}).text.strip()

def get_location(body):
    # The map link's onclick handler carries the coordinates as arguments.
    text = body.find('a', {'class': 'rsmap'})
    if text is None:
        return None
    text_list = text['onclick'].split(",")

    latitude = text_list[3].strip().replace("'", "")
    longitude = text_list[4].strip().replace("'", "")

    return latitude + ", " + longitude

page_number = 1
service_count = 1


fields = ['Name', 'Phone', 'Rating', 'Rating Count', 'Address', 'Location']
out_file = open('Readymade-Garment-Retailers_agra.csv', 'w', newline='')
csvwriter = csv.DictWriter(out_file, delimiter=',', fieldnames=fields)

# Write the header row first
csvwriter.writeheader()

while True:

    # Stop after 50 result pages
    if page_number > 50:
        break

    url = "https://www.justdial.com/Agra/Readymade-Garment-Retailers/nct-10401947/page-%s" % (page_number)
    # Justdial rejects the default urllib user agent, so spoof a browser
    req = urllib.request.Request(url, headers={'User-Agent': "Magic Browser"})
    page = urllib.request.urlopen(req)

    soup = BeautifulSoup(page.read(), "html.parser")
    services = soup.find_all('li', {'class': 'cntanr'})

    # Iterate through the results on the page (typically 10)
    for service_html in services:

        # Parse the HTML to fetch the data for one listing
        dict_service = {}
        name = get_name(service_html)
        phone = get_phone_number(service_html)
        rating = get_rating(service_html)
        count = get_rating_count(service_html)
        address = get_address(service_html)
        location = get_location(service_html)
        if name is not None:
            dict_service['Name'] = name
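        # Note: fields missing from dict_service come out as empty cells,
        # since csv.DictWriter fills absent keys with restval (None by default).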
        if phone is not None:
            dict_service['Phone'] = phone
        if rating is not None:
            dict_service['Rating'] = rating
        if count is not None:
            dict_service['Rating Count'] = count
        if address is not None:
            dict_service['Address'] = address
        if location is not None:
            dict_service['Location'] = location

        # Write the row to the CSV
        csvwriter.writerow(dict_service)

        print("#" + str(service_count) + " ", dict_service)
        service_count += 1

    page_number += 1

out_file.close()
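
# A minimal read-back sanity check (an addition, not part of the original
# script); it only assumes the output filename used above. DictReader consumes
# the header row, so this counts data rows only.
with open('Readymade-Garment-Retailers_agra.csv', newline='') as check_file:
    total_rows = len(list(csv.DictReader(check_file)))
print("Wrote %d services to the CSV" % total_rows)
--------------------------------------------------------------------------------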