├── inc
    ├── __init__.py
    ├── util.py
    ├── parser.py
    ├── wiggle.py
    └── chainreaction.py
├── calc.py
├── README.md
└── makepost.py


/inc/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/calc.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | import sys
 5 | 
 6 | PER_POUND = 130
 7 | CHARGE_PER_KG = 600
 8 | 
 9 | if len(sys.argv) < 2:
10 |     print("missing url to parse")
11 |     sys.exit(1)
12 | price = float(sys.argv[1])
13 | weight = float(sys.argv[2])
14 | 
15 | print((PER_POUND * price) + ((weight / 1000) * CHARGE_PER_KG))
16 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # price-parser-py
2 | 
3 | A small script that downloads a product page and parses the product's details. It uses `lxml` to parse websites.
4 | 
5 | Two parsers are already included in the `inc` folder, for chainreaction and wiggle. New parsers can be added by extending the `Parser` class and defining selectors.
6 | 
7 | The script also takes the product image and superimposes the price on that image, using ImageMagick.
8 | 


--------------------------------------------------------------------------------
/inc/util.py:
--------------------------------------------------------------------------------
 1 | from lxml import html
 2 | import requests
 3 | import shutil
 4 | 
 5 | 
 6 | def get_dom(url):
 7 |     """get dom from html
 8 | 
 9 |     """
10 |     resp = requests.get(url)
11 |     return html.fromstring(resp.content)
12 | 
13 | 
def download_image(url, filename='img.jpg'):
    """Download ``url`` and write the response body to ``filename``.

    :url: address of the image to fetch
    :filename: destination path; defaults to ``img.jpg``, the fixed name this
        function has always written to
    :returns: None
    """
    response = requests.get(url, stream=True)
    try:
        # The raw stream is not content-decoded by default; without this a
        # gzip/deflate-encoded response would be saved still compressed.
        response.raw.decode_content = True
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    finally:
        # ``del response`` only dropped the local name; close() actually
        # releases the streamed connection, even when the copy fails.
        response.close()
24 | 
25 | 
def get_elm(dom, xpath, index=None):
    """Evaluate ``xpath`` against ``dom`` and return the result.

    :dom: object exposing an ``xpath()`` method
    :xpath: XPath expression to evaluate
    :index: position of the match to return; when None the whole result of
        ``dom.xpath`` is returned unchanged
    :returns: the match at ``index``, or the full result when no index is given
    :raises IndexError: when the result has no item at ``index``
    """
    matches = dom.xpath(xpath)
    if index is None:
        return matches
    # Honour the requested position; previously any index returned item 0.
    return matches[index]
34 | 
35 | 
def get_text(dom, xpath, index):
    """Return the whitespace-stripped text content of one XPath match.

    :dom: tree passed through to ``get_elm``
    :xpath: XPath expression selecting the element
    :index: position of the match whose text is wanted
    :returns: the element's ``text_content()`` with surrounding whitespace removed
    """
    return get_elm(dom, xpath, index).text_content().strip()
42 | 


--------------------------------------------------------------------------------
/makepost.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | import sys
 5 | import inc.wiggle as wiggle
 6 | import inc.chainreaction as chainreaction
 7 | import subprocess
 8 | 
 9 | TEMPLATE = """%(title)s
10 | ========================
11 | Our Price: BDT %(total_price)s
12 | Original Price: £%(original_price)s
13 | ==
14 | Est. Weight
15 | %(weight)s g
16 | ========================
17 | Rating : %(rating)s
18 | ========================
19 | Details: %(url)s
20 | Review : %(review_url)s
21 | ========================
22 | Description:
23 | %(description)s
24 | ========================
25 | Features:
26 | » %(features)s
27 | """
28 | 
29 | 
30 | def main():
31 |     """Main"""
32 |     rate = 130
33 |     weight_charge = 600
34 | 
35 |     if len(sys.argv) < 2:
36 |         print("missing url to parse")
37 |         sys.exit(1)
38 |     url = sys.argv[1]
39 |     parser = None
40 |     if url.find('wiggle.co.uk') >= 0:
41 |         parser = wiggle.Wiggle(url)
42 |     elif url.find('chainreaction') >= 0:
43 |         parser = chainreaction.ChainReaction(url)
44 |     else:
45 |         print("unknown website")
46 |         return
47 |     data = parser.parse_page(rate, weight_charge)
48 |     if len(sys.argv) >= 3:
49 |         weight = int(sys.argv[2])
50 |         data['weight'] = weight
51 |         data['weight_charge'] = round((weight / 1000) * weight_charge, 2)
52 |         data['total_price'] = round(data['weight_charge'] + data['price'])
53 |     else:
54 |         data['total_price'] = '%s taka BUT NO WEIGHT PROVIDED' % data['price']
55 |     with open('post_content', 'w') as fp:
56 |         txt = TEMPLATE % data
57 |         fp.write(txt)
58 |     cmd = "convert_img.jpg_-font_Ubuntu_-pointsize_24_-background_Orange_" +\
59 |           "label:| Price: %s taka |_-gravity_Center_-append_imgtxt.jpg" % data['total_price']
60 |     subprocess.check_output(cmd.split('_'))
61 | 
62 | 
63 | if __name__ == '__main__':
64 |     main()
65 | 


--------------------------------------------------------------------------------
/inc/parser.py:
--------------------------------------------------------------------------------
 1 | import inc.util as util
 2 | 
 3 | 
 4 | class Parser(object):
 5 | 
 6 |     """parse wiggle website"""
 7 |     def __init__(self, url):
 8 |         super(Parser, self).__init__()
 9 |         self.url = url
10 |         self.dom = util.get_dom(self.url)
11 |         self.XTITLE = None
12 |         self.XPRICE = None
13 |         self.XRATING = None
14 |         self.XDESC = None
15 |         self.XFEAT = None
16 |         self.XIMG = None
17 |         self.REVIEW_PREFIX = None
18 | 
19 |     def get_title(self):
20 |         """get product title
21 | 
22 |         :returns: @todo
23 |         """
24 |         return util.get_text(self.dom, self.XTITLE, 0).replace('\n', ' ')
25 | 
26 |     def get_price(self, rate):
27 |         """parse price and calculate with the rate
28 | 
29 |         :rate: @todo
30 |         :returns: @todo
31 | 
32 |         """
33 |         return self.get_original_price() * rate
34 | 
35 |     def get_original_price(self):
36 |         """get original price
37 |         :returns: @todo
38 | 
39 |         """
40 |         return float(util.get_text(self.dom, self.XPRICE, 0)[1:])
41 | 
42 |     def get_review_url(self):
43 |         """get review url
44 | 
45 |         :returns: @todo
46 | 
47 |         """
48 |         return self.url + self.REVIEW_PREFIX
49 | 
50 |     def get_rating(self):
51 |         """return rating"""
52 |         raise NotImplementedError()
53 | 
54 |     def get_description(self):
55 |         """return description"""
56 |         raise NotImplementedError()
57 | 
58 |     def get_features(self):
59 |         """get features"""
60 |         raise NotImplementedError()
61 | 
62 |     def get_image(self):
63 |         """get_image"""
64 |         raise NotImplementedError()
65 | 
66 |     def parse_page(self, rate, weight_charge):
67 |         """parsed the page for product data
68 | 
69 |         :returns: @todo
70 |         """
71 |         data = {}
72 |         data['title'] = self.get_title()
73 |         data['original_price'] = self.get_original_price()
74 |         data['price'] = self.get_price(rate)
75 |         data['rating'] = self.get_rating()
76 |         data['description'] = self.get_description()
77 |         data['features'] = self.get_features()
78 |         data['url'] = self.url
79 |         data['review_url'] = self.get_review_url()
80 |         data['img'] = self.get_image()
81 |         return data
82 | 


--------------------------------------------------------------------------------
/inc/wiggle.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | import inc.parser as parser
 5 | import inc.util as util
 6 | 
 7 | 
 8 | class Wiggle(parser.Parser):
 9 |     """Parses wiggle"""
10 | 
11 |     def __init__(self, url):
12 |         super(Wiggle, self).__init__(url)
13 |         self.XTITLE = "id('productTitle')"
14 |         self.XPRICE = '//div[@class="bem-product-price__unit--pdp"]'
15 |         self.XRATING = '//div[@class="bem-review-stars__wrapper"]'
16 |         self.XDESC = '//div[@itemprop="description"]'
17 |         self.XFEAT = '//div[@class="bem-content"]/dl[1]/dd'
18 |         self.XIMG = '//a[@class="zoomable-image"]/img'
19 |         self.XIMG2 = '//div[@id="mainImageWrapper"]/img'
20 |         self.REVIEW_PREFIX = '#tabCustReviews'
21 | 
22 |     def get_original_price(self):
23 |         """get original price
24 |         :returns: @todo
25 | 
26 |         """
27 |         return float(util.get_text(self.dom, self.XPRICE, 0)[1:])
28 | 
29 |     def get_rating(self):
30 |         """get product rating
31 | 
32 |         :returns: @todo
33 |         """
34 |         rating = util.get_elm(self.dom, self.XRATING, 0)
35 |         return rating.attrib['title'].strip().replace('Star review', '')
36 | 
37 |     def get_description(self):
38 |         """get product description
39 | 
40 |         :returns: @todo
41 |         """
42 |         return util.get_text(self.dom, self.XDESC, 0)
43 | 
44 |     def get_features(self):
45 |         """get product features
46 | 
47 | 
48 |         """
49 |         features = self.dom.xpath(self.XFEAT)
50 |         txt = []
51 |         for feature in features:
52 |             txt.append(feature.text_content())
53 |         s = "\n»    "
54 |         return s.join(txt).strip()
55 | 
56 |     def get_image(self):
57 |         """get image url and download
58 | 
59 |         :returns: @todo
60 |         """
61 |         failed = False
62 |         try:
63 |             img = util.get_elm(self.dom, self.XIMG, 0).attrib['src']
64 |         except Exception:
65 |             failed = True
66 |         if failed:
67 |             try:
68 |                 img = util.get_elm(self.dom, self.XIMG2, 0).attrib['src']
69 |             except Exception:
70 |                 print("Failed to download image")
71 |                 img = ''
72 |         if img.startswith('//'):
73 |             img = 'http:' + img
74 |         if img != '':
75 |             util.download_image(img)
76 |         return img
77 | 


--------------------------------------------------------------------------------
/inc/chainreaction.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding: utf-8
 3 | 
 4 | import inc.parser as parser
 5 | import inc.util as util
 6 | 
 7 | 
 8 | class ChainReaction(parser.Parser):
 9 | 
10 |     """parse ChainReaction website"""
11 |     def __init__(self, url):
12 |         super(ChainReaction, self).__init__(url)
13 |         self.XTITLE = '//li[@class="crcPDPTitle"]'
14 |         self.XPRICE = '//span[@class="crcPDPPriceHidden"]'
15 |         self.XRATING = '//li[@class="crcPDPRatingsReviewsStarsInfo"]'
16 |         self.XDESC = 'id("crcPDPComponentDescription")'
17 |         self.XFEAT = 'id("crcPDPComponentDescription")/ul/li'
18 |         self.XIMG = '//li[@class="crcPDPImage"]//img'
19 |         self.REVIEW_PREFIX = '#bazaarvoice_reviews_tab'
20 | 
21 |     def get_original_price(self):
22 |         """parse for original price
23 |         :returns: @todo
24 | 
25 |         """
26 |         return float(util.get_text(self.dom, self.XPRICE, 0))
27 | 
28 |     def get_rating(self):
29 |         """get product rating
30 | 
31 |         :returns: @todo
32 |         """
33 |         try:
34 |             rating = util.get_elm(self.dom, self.XRATING, 0)
35 |         except Exception:
36 |             return 'N/A'
37 |         return rating.text_content().strip()
38 | 
39 |     def get_description(self):
40 |         """get product description
41 | 
42 |         :returns: @todo
43 |         """
44 |         desc = util.get_text(self.dom, self.XDESC, 0)
45 |         return desc.split('Features:')[0]
46 | 
47 |     def get_features(self):
48 |         """@todo: Docstring for function.
49 | 
50 |         :arg1: @todo
51 |         :returns: @todo
52 | 
53 |         """
54 |         feat = util.get_elm(self.dom, self.XFEAT)
55 |         line = []
56 |         if feat != None:
57 |             for li in feat:
58 |                 line.append('» ' + li.text_content())
59 |             return '\n'.join(line).strip('» ')
60 |         desc = util.get_text(self.dom, self.XDESC, 0)
61 |         try:
62 |             lines = desc.split('Features:')[1].strip().split('\n')
63 |         except Exception:
64 |             return ''
65 |         newline = []
66 |         for l in lines:
67 |             line = l.strip()
68 |             if line != '':
69 |                 newline.append('» ' + line)
70 |             else:
71 |                 newline.append('\n')
72 |         return '\n'.join(newline).strip('»  ')
73 | 
74 |     def get_image(self):
75 |         """get image url and download
76 | 
77 |         :returns: @todo
78 |         """
79 |         try:
80 |             img = util.get_elm(self.dom, self.XIMG, 0).attrib['src']
81 |         except Exception:
82 |             print("Failed to download image")
83 |             img = ''
84 |         if img.startswith('//'):
85 |             img = 'http:' + img
86 |         if img != '':
87 |             util.download_image(img)
88 |         return img
89 | 
90 | 
91 | 


--------------------------------------------------------------------------------