├── .gitignore
├── G4GExtractor.py
├── LICENSE
├── README.md
└── g4g-test.py

/.gitignore:
--------------------------------------------------------------------------------
cache
cache-downloads
inspector
api
source/inspector.html
--------------------------------------------------------------------------------
/G4GExtractor.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from urllib2 import urlopen
from xhtml2pdf import pisa
import os, httplib2, re


class G4GExtractor:
    __BASE_WEB_URL = 'http://www.geeksforgeeks.org/category/'
    __FILE_SAVE_PATH = ''
    __CURR_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) + '/'

    def __init__(self, path=''):
        """
        Main class constructor; holds the crawler and content-extraction methods.

        :param path: path where the downloaded files should be saved
        :raise Exception: when the path is invalid or not writable.
        """
        if len(path.strip()) == 0:
            self.__FILE_SAVE_PATH = self.__CURR_DIR_PATH
        elif os.path.exists(path) and os.access(path, os.W_OK):
            self.__FILE_SAVE_PATH = path
        else:
            raise Exception("Either the supplied path doesn't exist or you don't "
                            "have write permissions. \n Check the directory write "
                            "permissions and try again later. Thank you.")

    def set_filesave_path(self, path):
        """
        Sets the path where downloaded content will be saved.

        :param path: path to set
        :raise Exception: when the path is invalid or not writable.
        """
        if os.path.exists(path) and os.access(path, os.W_OK):
            self.__FILE_SAVE_PATH = path
        else:
            raise Exception("Either the supplied path doesn't exist or you don't "
                            "have write permissions. \n Check the directory write "
                            "permissions and try again later. Thank you.")

    def set_baseweburl_path(self, url):
        """
        Sets the base web URL, which lets you download either by category
        or by tag.

        :param url: base URL to set
        """
        self.__BASE_WEB_URL = url

    def __valid_webpage(self, urllink):
        """
        Checks whether a link is valid: returns True if the HTTP status
        is 200 and False if it is 404.

        :param urllink: link of the page whose validity is to be checked
        :return: True if the connection status is 200, False otherwise
        """
        h = httplib2.Http()
        resp = h.request(urllink, 'HEAD')
        return int(resp[0]['status']) == 200

    def __remove_non_ascii(self, text):
        """
        Removes non-ASCII characters from the HTML source.

        :param text: HTML source
        :return: string after cleaning the text
        """
        return ''.join([i if ord(i) < 128 else '' for i in text])

    def extract_content_and_save(self, cat_list, pdf=False):
        """
        Gathers all the links whose content is to be crawled and saves their
        content. This method takes care of pagination and collects every link
        for the given tags or categories.

        :param cat_list: list of the categories whose links are to be crawled
        :param pdf: whether to save the extracted content as PDF
        :return: list of all gathered links
        """

        #List to store all the links.
        totallinks = []

        #String to store html code
        pagedata = ''

        #Iterate for each category
        for cat in cat_list:
            #Create Directory path.
            newpath = self.__FILE_SAVE_PATH + cat

            #Create Directory for each category.
            os.mkdir(newpath)

            #Prepare URL to extract number of pagination pages
            url = self.__BASE_WEB_URL + cat + "/"

            #Check if webpage exists and is valid
            if self.__valid_webpage(url):
                pagedata = urlopen(url).read()
                soup = BeautifulSoup(pagedata)

                #Get number of Pagination pages for each category
                pages = soup.find('span', {"class": "pages"})
                if pages:
                    cat_content_pages = int(str(pages.text).split()[3])
                else:
                    cat_content_pages = 1

                for i in range(1, cat_content_pages + 1):

                    listofLinks = []

                    #Prepare URL to extract links
                    if i == 1:
                        url = self.__BASE_WEB_URL + cat + "/"
                    else:
                        url = self.__BASE_WEB_URL + cat + "/page/" + str(i) + "/"

                    print("Working with %s" % url)

                    #Check if the webpages have Status 200 or 404
                    if self.__valid_webpage(url):
                        pagedata = urlopen(url).read()
                        soup = BeautifulSoup(pagedata)

                        #Find all the title links in the page
                        content_links = soup.findAll("h2", class_="post-title")

                        #Iterate every page and save the content links in a list
                        for link in content_links:
                            mainLink = \
                                str(link.findAll("a")[0]).split("
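
The test driver g4g-test.py is listed in the tree but not reproduced above. Based only on the method signatures visible in G4GExtractor.py, a minimal usage sketch could look like the following; the category slug 'arrays', the output directory, and the tag-listing URL are illustrative placeholders, not values taken from the repository.

# Hypothetical usage sketch of G4GExtractor (not the bundled g4g-test.py).
from G4GExtractor import G4GExtractor

# An empty path makes the extractor save next to the script itself.
extractor = G4GExtractor(path='')

# Optionally point the crawler at a different listing; the exact
# tag-based URL pattern below is an assumption.
extractor.set_baseweburl_path('http://www.geeksforgeeks.org/tag/')

# Crawl every paginated listing for the given categories/tags and
# save each post, optionally rendering it to PDF.
extractor.extract_content_and_save(['arrays'], pdf=True)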