├── .gitignore
├── G4GExtractor.py
├── LICENSE
├── README.md
└── g4g-test.py

/.gitignore:
--------------------------------------------------------------------------------
cache
cache-downloads
inspector
api
source/inspector.html
--------------------------------------------------------------------------------
/G4GExtractor.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
from urllib2 import urlopen
from xhtml2pdf import pisa
import os, httplib2, re


class G4GExtractor:
    __BASE_WEB_URL = 'http://www.geeksforgeeks.org/category/'
    __FILE_SAVE_PATH = ''
    __CURR_DIR_PATH = os.path.dirname(os.path.realpath(__file__)) + '/'

    def __init__(self, path=''):
        """
        Main class constructor; holds the crawler and content-extraction methods.

        :param path: path where the downloaded files should be saved
        :raise Exception: when the path is invalid or not writable.
        """
        if len(path.strip()) == 0:
            self.__FILE_SAVE_PATH = self.__CURR_DIR_PATH
        elif os.path.exists(path) and os.access(path, os.W_OK):
            self.__FILE_SAVE_PATH = path
        else:
            raise Exception("Either the supplied path doesn't exist or you don't "
                            "have write permissions. \n Check the directory write "
                            "permissions and try again later. Thank you.")

    def set_filesave_path(self, path):
        """
        Sets the path where downloaded content will be saved.

        :param path: path to set
        :raise Exception: when the path is invalid or not writable.
        """
        if os.path.exists(path) and os.access(path, os.W_OK):
            self.__FILE_SAVE_PATH = path
        else:
            raise Exception("Either the supplied path doesn't exist or you don't "
                            "have write permissions. \n Check the directory write "
                            "permissions and try again later. Thank you.")

    def set_baseweburl_path(self, url):
        """
        Sets the base web URL, which lets you download either by category
        or by tag.

        :param url: base URL to set
        """
        self.__BASE_WEB_URL = url

    def __valid_webpage(self, urllink):
        """
        Checks whether a link is valid: returns True if the HTTP status
        is 200 and False if it is 404.

        :param urllink: link of the page whose validity is to be checked
        :return: True if the connection status is 200, False otherwise
        """
        h = httplib2.Http()
        resp = h.request(urllink, 'HEAD')
        return int(resp[0]['status']) == 200

    def __remove_non_ascii(self, text):
        """
        Removes non-ASCII characters from the HTML source.

        :param text: HTML source
        :return: string after cleaning the text
        """
        return ''.join([i if ord(i) < 128 else '' for i in text])

    def extract_content_and_save(self, cat_list, pdf=False):
        """
        Gathers all the links whose content is to be crawled and saves their
        content. This method takes care of pagination and collects every link
        for the given tags or categories.

        :param cat_list: list of the categories whose links are to be crawled
        :param pdf: whether to save the extracted content as PDF
        :return: list of all gathered links
        """

        #List to store all the links.
        totallinks = []

        #String to store html code
        pagedata = ''

        #Iterate for each category
        for cat in cat_list:
            #Create Directory path.
            newpath = self.__FILE_SAVE_PATH + cat

            #Create Directory for each category.
            os.mkdir(newpath)

            #Prepare URL to extract number of pagination pages
            url = self.__BASE_WEB_URL + cat + "/"

            #Check if webpage exists and is valid
            if self.__valid_webpage(url):
                pagedata = urlopen(url).read()
                soup = BeautifulSoup(pagedata)

                #Get number of Pagination pages for each category
                pages = soup.find('span', {"class": "pages"})
                if pages:
                    cat_content_pages = int(str(pages.text).split()[3])
                else:
                    cat_content_pages = 1

                for i in range(1, cat_content_pages + 1):

                    listofLinks = []

                    #Prepare URL to extract links
                    if i == 1:
                        url = self.__BASE_WEB_URL + cat + "/"
                    else:
                        url = self.__BASE_WEB_URL + cat + "/page/" + str(i) + "/"

                    print("Working with %s" % url)

                    #Check if the webpages have Status 200 or 404
                    if self.__valid_webpage(url):
                        pagedata = urlopen(url).read()
                        soup = BeautifulSoup(pagedata)

                        #Find all the title links in the page
                        content_links = soup.findAll("h2", class_="post-title")

                        #Iterate every page and save the content links in a list
                        for link in content_links:
                            mainLink = \
                                str(link.findAll("a")[0]).split("
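
The test driver g4g-test.py is listed in the tree but not reproduced above. Based only on the method signatures visible in G4GExtractor.py, a minimal usage sketch could look like the following; the category slug 'arrays', the output directory, and the tag-listing URL are illustrative placeholders, not values taken from the repository.

# Hypothetical usage sketch of G4GExtractor (not the bundled g4g-test.py).
from G4GExtractor import G4GExtractor

# An empty path makes the extractor save next to the script itself.
extractor = G4GExtractor(path='')

# Optionally point the crawler at a different listing; the exact
# tag-based URL pattern below is an assumption.
extractor.set_baseweburl_path('http://www.geeksforgeeks.org/tag/')

# Crawl every paginated listing for the given categories/tags and
# save each post, optionally rendering it to PDF.
extractor.extract_content_and_save(['arrays'], pdf=True)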