├── Readme.txt
└── theoatmeal.py


/Readme.txt:
--------------------------------------------------------------------------------
 1 | Python Script to download all comics from theoatmeal.com
 2 | 
 3 | It downloads each comic to a separate folder "OatmealComics/ComicName", where ComicName is the name of the comic.
 4 | It DOES NOT re-download comics which are already downloaded, in case you run the script again.
 5 | 
 6 | Usage is as follows:
 7 | 
 8 | Just run the python script. No input is taken from the user.
 9 | Requires Python 2, BeautifulSoup (bs4) and lxml to run.
10 | 
11 | 
12 | For any queries, please drop a line to manojmj92@gmail.com
13 | Thanks!
14 | 
15 | 


--------------------------------------------------------------------------------
/theoatmeal.py:
--------------------------------------------------------------------------------
 1 | #-------------------------------------------------------------------------------
 2 | # Name:        theoatmeal downloader
 3 | # Purpose:  Download all comics from theoatmeal.com
 4 | #
 5 | # Author:      Manoj | Edited by Parin Vachhani
 6 | #
 7 | #-------------------------------------------------------------------------------
 8 | 
 9 | from bs4 import BeautifulSoup
10 | import urllib
11 | import os
12 | import sys
13 | 
14 | dir = os.path.dirname(os.path.abspath(__file__))
15 | oatmealdir = dir +"/OatmealComics"
16 | 
17 | if not os.path.exists(oatmealdir):
18 |         os.makedirs(oatmealdir)
19 | 
20 | for url_range in range(1,15):
21 | 
22 |     main_url = "http://theoatmeal.com/comics_pg/page:" + str(url_range)
23 |     print "Entered Page " + str(url_range)
24 | 
25 |     main_url_opener = urllib.urlopen(main_url)
26 |     main_url_response = main_url_opener.read()
27 | 
28 |     main_url_soup = BeautifulSoup(main_url_response,"lxml")
29 |     mylist = []
30 |     for comiclink in main_url_soup.find_all('a'):
31 |         all_links = comiclink.get('href')
32 |         split_links = all_links.split('/')
33 |         try:
34 |            if split_links[1]=="comics" and split_links[2]!="":
35 |                 if all_links not in mylist:
36 |                     mylist.append(all_links)
37 | 
38 |         except:
39 |             pass
40 | 
41 |     for element in mylist:
42 |         old_source = element
43 |         new_source = old_source.replace('/comics/','http://theoatmeal.com/comics/')
44 | 
45 |         #do download stuff here
46 |         url = new_source
47 | 
48 |         opener = urllib.urlopen(url)
49 |         response = opener.read()
50 | 
51 |         soupedversion = BeautifulSoup(response,"lxml")
52 | 
53 |         comicname = soupedversion.title.string
54 |         comicname = comicname.replace('?','')
55 |         comicname = comicname.replace(':','')
56 |         comicname = comicname.replace('*','')
57 |         comicname = comicname.replace('"','')
58 | 
59 |         comicdir = dir +"/OatmealComics/"+ comicname
60 | 
61 |         if not os.path.exists(comicdir):
62 |             print " Downloading "+comicname
63 |             os.makedirs(comicdir)
64 |         else:
65 |             if not len(os.listdir(comicdir)) == 0:
66 |                  print "Neglected "+comicname+" because it already exists in your directory."
67 |                  continue
68 |             else:
69 |                 print " Downloading "+comicname
70 | 
71 |         for imglink in soupedversion.find_all('img'):
72 |             mylink =  imglink.get('src')
73 |             current_comic_src = mylink.split('/')
74 |             if current_comic_src[4] == "comics":
75 |                 open_img = urllib.urlopen(mylink)
76 |                 img_data = open_img.read()
77 |                 filename = current_comic_src[6]
78 |                 filename = filename.replace('?reload','')
79 |                 path = os.path.join(comicdir,filename)
80 |                 with open (path,"wb") as data:
81 |                     data.write(img_data)
82 | print "Completed Download of Comic :"+comicname


--------------------------------------------------------------------------------