├── README.md
├── LICENSE
└── geeksforgeeks-pdf.py

/README.md:
--------------------------------------------------------------------------------
geeksforgeeks-pdf
=================

This Python script downloads all Amazon Interview Experience articles from the
GeeksforGeeks website and saves each one as a PDF via the pdfcrowd API. You can
modify the script to download other content as needed.

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Arun Prakash

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/geeksforgeeks-pdf.py:
--------------------------------------------------------------------------------
"""Crawl geeksforgeeks.org and save every Amazon interview-experience page as a PDF.

Requires Python 3 with httplib2, pdfcrowd and beautifulsoup4 installed.
"""
import urllib.request

import httplib2
import pdfcrowd
from bs4 import BeautifulSoup, SoupStrainer

BASE_URL = 'http://www.geeksforgeeks.org'
START_URL = 'http://www.geeksforgeeks.org/'

http = httplib2.Http()
pdf_index = 0                 # counter used to name the output PDF files

to_crawl = [START_URL]        # URLs still to be visited
crawled = [START_URL]         # URLs already visited

# Seed the crawl queue with every internal, non-forum link on the start page.
status, response = http.request(START_URL)
for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        href = link['href']
        if href.startswith(BASE_URL) and href not in crawled and 'forums' not in href:
            to_crawl.append(href)

print(len(to_crawl))


def get_page(url):
    """Return the HTML of a page as text."""
    with urllib.request.urlopen(url) as source:
        return source.read().decode('utf-8', errors='replace')


def save_as_pdf(url):
    """Convert one page to PDF with the pdfcrowd API and save it as amazonN.pdf."""
    global pdf_index
    try:
        client = pdfcrowd.Client("mkap1234", "fc5ada9fbd1c55f46822d6e9e985a9bb")
        with open('amazon' + str(pdf_index) + '.pdf', 'wb') as output_file:
            pdf_index += 1
            client.convertHtml(get_page(url), output_file)
    except pdfcrowd.Error as why:
        print('Failed:', why)


# Crawl the site: visit every internal, non-forum link once and collect the
# links found on each visited page.
count = 0
while to_crawl:
    url = to_crawl.pop()
    if url.startswith(BASE_URL) and url not in crawled and 'forums' not in url:
        count += 1
        print(count)
        crawled.append(url)
        status, response = http.request(url)
        for link in BeautifulSoup(response, 'html.parser', parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                href = link['href']
                if href.startswith(BASE_URL) and href not in crawled:
                    to_crawl.append(href)


# Keep only the Amazon interview-experience article URLs.
amazon = []
for url in crawled:
    if 'amazon' in url and '#' not in url and 'tag' not in url and 'forum' not in url:
        print(url)
        amazon.append(url)

print("Finished")
print(len(amazon))

for page in amazon:
    save_as_pdf(page)

--------------------------------------------------------------------------------