"""Download all comics from theoatmeal.com.

Author: Manoj | Edited by Parin Vachhani

Each comic is saved to a separate folder "OatmealComics/ComicName", created
next to this script, where ComicName is the title of the comic's page.
Comics whose folder already contains files are NOT downloaded again, in case
you run the script a second time.

Usage: just run the script. No input is taken from the user.
Requires BeautifulSoup (bs4); pages are parsed with its "lxml" backend.

For any queries, please drop a line to manojmj92@gmail.com. Thanks!
"""

import contextlib
import os
import re
import sys

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2, which the script originally targeted

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Reported by main(); keeps the pure helpers importable without bs4.
    BeautifulSoup = None

SITE_URL = "http://theoatmeal.com"
LISTING_URL = SITE_URL + "/comics_pg/page:"
OUTPUT_DIRNAME = "OatmealComics"

# Characters that are either path separators or illegal in Windows file names.
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')


def fetch(url):
    """Return the raw bytes served at *url*, always closing the connection."""
    # closing() is used because Python 2's urlopen result is not a context
    # manager.
    with contextlib.closing(urlopen(url)) as response:
        return response.read()


def comic_links(hrefs):
    """Return the unique site-relative comic links among *hrefs*, in order.

    A link qualifies when it has the form "/comics/<name>..." with a
    non-empty <name>. Missing hrefs (None or "") and every other link are
    ignored instead of raising.
    """
    seen = set()
    links = []
    for href in hrefs:
        if not href:
            continue
        parts = href.split("/")
        if len(parts) < 3 or parts[0] or parts[1] != "comics" or not parts[2]:
            continue
        if href not in seen:
            seen.add(href)
            links.append(href)
    return links


def safe_dirname(title):
    """Return *title* with characters that are unsafe in a folder name removed."""
    return _UNSAFE_CHARS.sub("", title)


def image_filename(src):
    """Return the file name to save a comic image under, or None to skip it.

    Only URLs shaped like "scheme://host/<bucket>/comics/<comic>/<file>" are
    treated as comic panels: the fifth "/"-separated piece must be "comics"
    and the seventh is the file name. Any query string (such as "?reload")
    is dropped. Images with a missing or shorter src are skipped rather than
    raising IndexError.
    """
    if not src:
        return None
    parts = src.split("/")
    if len(parts) < 7 or parts[4] != "comics":
        return None
    return parts[6].split("?", 1)[0] or None


def download_comic(comic_url, output_dir):
    """Download every panel of the comic at *comic_url* into *output_dir*.

    The comic gets its own sub-folder named after the page title (or the URL
    slug when the page has no usable title). Returns True when the comic was
    downloaded and False when it was skipped because its folder already
    contains files.
    """
    soup = BeautifulSoup(fetch(comic_url), "lxml")

    title = soup.title.string if soup.title is not None else None
    slug = comic_url.rstrip("/").rsplit("/", 1)[-1]
    comicname = safe_dirname(title or "") or safe_dirname(slug)
    comicdir = os.path.join(output_dir, comicname)

    if os.path.isdir(comicdir) and os.listdir(comicdir):
        print("Neglected " + comicname
              + " because it already exists in your directory.")
        return False
    if not os.path.exists(comicdir):
        os.makedirs(comicdir)
    print(" Downloading " + comicname)

    for img in soup.find_all("img"):
        src = img.get("src")
        filename = image_filename(src)
        if filename is None:
            continue
        # Fetch before opening the file so a failed request leaves no empty
        # file behind.
        img_data = fetch(src)
        with open(os.path.join(comicdir, filename), "wb") as out:
            out.write(img_data)

    print("Completed Download of Comic :" + comicname)
    return True


def main(first_page=1, last_page=14):
    """Walk listing pages *first_page*..*last_page* and download each comic.

    The defaults reproduce the page range the script has always crawled.
    Network or file errors on one page or comic are reported and skipped so
    the remaining comics are still downloaded.
    """
    if BeautifulSoup is None:
        sys.exit("BeautifulSoup is required: install the 'beautifulsoup4' "
                 "and 'lxml' packages.")

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, OUTPUT_DIRNAME)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for page in range(first_page, last_page + 1):
        print("Entered Page " + str(page))
        try:
            listing = BeautifulSoup(fetch(LISTING_URL + str(page)), "lxml")
        except IOError as err:
            print("Skipped Page " + str(page) + ": " + str(err))
            continue

        hrefs = (anchor.get("href") for anchor in listing.find_all("a"))
        for href in comic_links(hrefs):
            try:
                download_comic(SITE_URL + href, output_dir)
            except IOError as err:
                print("Failed to download " + href + ": " + str(err))


if __name__ == "__main__":
    main()