├── inc ├── __init__.py ├── util.py ├── parser.py ├── wiggle.py └── chainreaction.py ├── calc.py ├── README.md └── makepost.py /inc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /calc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import sys 5 | 6 | PER_POUND = 130 7 | CHARGE_PER_KG = 600 8 | 9 | if len(sys.argv) < 2: 10 | print("missing url to parse") 11 | sys.exit(1) 12 | price = float(sys.argv[1]) 13 | weight = float(sys.argv[2]) 14 | 15 | print((PER_POUND * price) + ((weight / 1000) * CHARGE_PER_KG)) 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # price-parser-py 2 | 3 | A small script that downloads a product page and parse details of the product. Uses `LXML` to parse websites 4 | 5 | Two parsers are already included in `inc` folder for chainreaction and wiggle. New parsers can be added by extending `Parser` class and defining selectors. 6 | 7 | The script also takes the product image and imposes price on that image, using imagemagick. 8 | -------------------------------------------------------------------------------- /inc/util.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import requests 3 | import shutil 4 | 5 | 6 | def get_dom(url): 7 | """get dom from html 8 | 9 | """ 10 | resp = requests.get(url) 11 | return html.fromstring(resp.content) 12 | 13 | 14 | def download_image(url): 15 | """downloads an image 16 | 17 | :url: @todo 18 | :returns: @todo 19 | """ 20 | response = requests.get(url, stream=True) 21 | with open('img.jpg', 'wb') as out_file: 22 | shutil.copyfileobj(response.raw, out_file) 23 | del response 24 | 25 | 26 | def get_elm(dom, xpath, index=None): 27 | """returns element by xpath 28 | 29 | """ 30 | elm = dom.xpath(xpath) 31 | if index != None: 32 | return elm[0] 33 | return elm 34 | 35 | 36 | def get_text(dom, xpath, index): 37 | """returns text of element by index 38 | 39 | """ 40 | elm = get_elm(dom, xpath, index) 41 | return elm.text_content().strip() 42 | -------------------------------------------------------------------------------- /makepost.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import sys 5 | import inc.wiggle as wiggle 6 | import inc.chainreaction as chainreaction 7 | import subprocess 8 | 9 | TEMPLATE = """%(title)s 10 | ======================== 11 | Our Price: BDT %(total_price)s 12 | Original Price: £%(original_price)s 13 | == 14 | Est. Weight 15 | %(weight)s g 16 | ======================== 17 | Rating : %(rating)s 18 | ======================== 19 | Details: %(url)s 20 | Review : %(review_url)s 21 | ======================== 22 | Description: 23 | %(description)s 24 | ======================== 25 | Features: 26 | » %(features)s 27 | """ 28 | 29 | 30 | def main(): 31 | """Main""" 32 | rate = 130 33 | weight_charge = 600 34 | 35 | if len(sys.argv) < 2: 36 | print("missing url to parse") 37 | sys.exit(1) 38 | url = sys.argv[1] 39 | parser = None 40 | if url.find('wiggle.co.uk') >= 0: 41 | parser = wiggle.Wiggle(url) 42 | elif url.find('chainreaction') >= 0: 43 | parser = chainreaction.ChainReaction(url) 44 | else: 45 | print("unknown website") 46 | return 47 | data = parser.parse_page(rate, weight_charge) 48 | if len(sys.argv) >= 3: 49 | weight = int(sys.argv[2]) 50 | data['weight'] = weight 51 | data['weight_charge'] = round((weight / 1000) * weight_charge, 2) 52 | data['total_price'] = round(data['weight_charge'] + data['price']) 53 | else: 54 | data['total_price'] = '%s taka BUT NO WEIGHT PROVIDED' % data['price'] 55 | with open('post_content', 'w') as fp: 56 | txt = TEMPLATE % data 57 | fp.write(txt) 58 | cmd = "convert_img.jpg_-font_Ubuntu_-pointsize_24_-background_Orange_" +\ 59 | "label:| Price: %s taka |_-gravity_Center_-append_imgtxt.jpg" % data['total_price'] 60 | subprocess.check_output(cmd.split('_')) 61 | 62 | 63 | if __name__ == '__main__': 64 | main() 65 | -------------------------------------------------------------------------------- /inc/parser.py: -------------------------------------------------------------------------------- 1 | import inc.util as util 2 | 3 | 4 | class Parser(object): 5 | 6 | """parse wiggle website""" 7 | def __init__(self, url): 8 | super(Parser, self).__init__() 9 | self.url = url 10 | self.dom = util.get_dom(self.url) 11 | self.XTITLE = None 12 | self.XPRICE = None 13 | self.XRATING = None 14 | self.XDESC = None 15 | self.XFEAT = None 16 | self.XIMG = None 17 | self.REVIEW_PREFIX = None 18 | 19 | def get_title(self): 20 | """get product title 21 | 22 | :returns: @todo 23 | """ 24 | return util.get_text(self.dom, self.XTITLE, 0).replace('\n', ' ') 25 | 26 | def get_price(self, rate): 27 | """parse price and calculate with the rate 28 | 29 | :rate: @todo 30 | :returns: @todo 31 | 32 | """ 33 | return self.get_original_price() * rate 34 | 35 | def get_original_price(self): 36 | """get original price 37 | :returns: @todo 38 | 39 | """ 40 | return float(util.get_text(self.dom, self.XPRICE, 0)[1:]) 41 | 42 | def get_review_url(self): 43 | """get review url 44 | 45 | :returns: @todo 46 | 47 | """ 48 | return self.url + self.REVIEW_PREFIX 49 | 50 | def get_rating(self): 51 | """return rating""" 52 | raise NotImplementedError() 53 | 54 | def get_description(self): 55 | """return description""" 56 | raise NotImplementedError() 57 | 58 | def get_features(self): 59 | """get features""" 60 | raise NotImplementedError() 61 | 62 | def get_image(self): 63 | """get_image""" 64 | raise NotImplementedError() 65 | 66 | def parse_page(self, rate, weight_charge): 67 | """parsed the page for product data 68 | 69 | :returns: @todo 70 | """ 71 | data = {} 72 | data['title'] = self.get_title() 73 | data['original_price'] = self.get_original_price() 74 | data['price'] = self.get_price(rate) 75 | data['rating'] = self.get_rating() 76 | data['description'] = self.get_description() 77 | data['features'] = self.get_features() 78 | data['url'] = self.url 79 | data['review_url'] = self.get_review_url() 80 | data['img'] = self.get_image() 81 | return data 82 | -------------------------------------------------------------------------------- /inc/wiggle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import inc.parser as parser 5 | import inc.util as util 6 | 7 | 8 | class Wiggle(parser.Parser): 9 | """Parses wiggle""" 10 | 11 | def __init__(self, url): 12 | super(Wiggle, self).__init__(url) 13 | self.XTITLE = "id('productTitle')" 14 | self.XPRICE = '//div[@class="bem-product-price__unit--pdp"]' 15 | self.XRATING = '//div[@class="bem-review-stars__wrapper"]' 16 | self.XDESC = '//div[@itemprop="description"]' 17 | self.XFEAT = '//div[@class="bem-content"]/dl[1]/dd' 18 | self.XIMG = '//a[@class="zoomable-image"]/img' 19 | self.XIMG2 = '//div[@id="mainImageWrapper"]/img' 20 | self.REVIEW_PREFIX = '#tabCustReviews' 21 | 22 | def get_original_price(self): 23 | """get original price 24 | :returns: @todo 25 | 26 | """ 27 | return float(util.get_text(self.dom, self.XPRICE, 0)[1:]) 28 | 29 | def get_rating(self): 30 | """get product rating 31 | 32 | :returns: @todo 33 | """ 34 | rating = util.get_elm(self.dom, self.XRATING, 0) 35 | return rating.attrib['title'].strip().replace('Star review', '') 36 | 37 | def get_description(self): 38 | """get product description 39 | 40 | :returns: @todo 41 | """ 42 | return util.get_text(self.dom, self.XDESC, 0) 43 | 44 | def get_features(self): 45 | """get product features 46 | 47 | 48 | """ 49 | features = self.dom.xpath(self.XFEAT) 50 | txt = [] 51 | for feature in features: 52 | txt.append(feature.text_content()) 53 | s = "\n» " 54 | return s.join(txt).strip() 55 | 56 | def get_image(self): 57 | """get image url and download 58 | 59 | :returns: @todo 60 | """ 61 | failed = False 62 | try: 63 | img = util.get_elm(self.dom, self.XIMG, 0).attrib['src'] 64 | except Exception: 65 | failed = True 66 | if failed: 67 | try: 68 | img = util.get_elm(self.dom, self.XIMG2, 0).attrib['src'] 69 | except Exception: 70 | print("Failed to download image") 71 | img = '' 72 | if img.startswith('//'): 73 | img = 'http:' + img 74 | if img != '': 75 | util.download_image(img) 76 | return img 77 | -------------------------------------------------------------------------------- /inc/chainreaction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import inc.parser as parser 5 | import inc.util as util 6 | 7 | 8 | class ChainReaction(parser.Parser): 9 | 10 | """parse ChainReaction website""" 11 | def __init__(self, url): 12 | super(ChainReaction, self).__init__(url) 13 | self.XTITLE = '//li[@class="crcPDPTitle"]' 14 | self.XPRICE = '//span[@class="crcPDPPriceHidden"]' 15 | self.XRATING = '//li[@class="crcPDPRatingsReviewsStarsInfo"]' 16 | self.XDESC = 'id("crcPDPComponentDescription")' 17 | self.XFEAT = 'id("crcPDPComponentDescription")/ul/li' 18 | self.XIMG = '//li[@class="crcPDPImage"]//img' 19 | self.REVIEW_PREFIX = '#bazaarvoice_reviews_tab' 20 | 21 | def get_original_price(self): 22 | """parse for original price 23 | :returns: @todo 24 | 25 | """ 26 | return float(util.get_text(self.dom, self.XPRICE, 0)) 27 | 28 | def get_rating(self): 29 | """get product rating 30 | 31 | :returns: @todo 32 | """ 33 | try: 34 | rating = util.get_elm(self.dom, self.XRATING, 0) 35 | except Exception: 36 | return 'N/A' 37 | return rating.text_content().strip() 38 | 39 | def get_description(self): 40 | """get product description 41 | 42 | :returns: @todo 43 | """ 44 | desc = util.get_text(self.dom, self.XDESC, 0) 45 | return desc.split('Features:')[0] 46 | 47 | def get_features(self): 48 | """@todo: Docstring for function. 49 | 50 | :arg1: @todo 51 | :returns: @todo 52 | 53 | """ 54 | feat = util.get_elm(self.dom, self.XFEAT) 55 | line = [] 56 | if feat != None: 57 | for li in feat: 58 | line.append('» ' + li.text_content()) 59 | return '\n'.join(line).strip('» ') 60 | desc = util.get_text(self.dom, self.XDESC, 0) 61 | try: 62 | lines = desc.split('Features:')[1].strip().split('\n') 63 | except Exception: 64 | return '' 65 | newline = [] 66 | for l in lines: 67 | line = l.strip() 68 | if line != '': 69 | newline.append('» ' + line) 70 | else: 71 | newline.append('\n') 72 | return '\n'.join(newline).strip('» ') 73 | 74 | def get_image(self): 75 | """get image url and download 76 | 77 | :returns: @todo 78 | """ 79 | try: 80 | img = util.get_elm(self.dom, self.XIMG, 0).attrib['src'] 81 | except Exception: 82 | print("Failed to download image") 83 | img = '' 84 | if img.startswith('//'): 85 | img = 'http:' + img 86 | if img != '': 87 | util.download_image(img) 88 | return img 89 | 90 | 91 | --------------------------------------------------------------------------------