├── Readme.txt └── someecards.com-downloader.py /Readme.txt: -------------------------------------------------------------------------------- 1 | Multi threaded python program to download all pictures from someecards.com,category wise. 2 | 3 | Download pictures in each category to a seperate folder. 4 | Incase the script is run again, it does not download duplicates. 5 | 6 | If the program finds 40 duplicate files in a row, it assumes that all the comics are downloaded 7 | and stops downloading that specific category. 8 | 9 | Each thread is used to parse and download data from each page of a single category. 10 | 11 | Usage: 12 | 13 | Just run the python script and input the number of the category you want to download. 14 | Enter 0 to download all categories. 15 | 16 | Dependencies: 17 | 18 | BeautifulSoup 4 19 | 20 | Note: 21 | 22 | I had to include time.sleep(2) so that the CDN does not block the IP when continous 23 | requests are given. -------------------------------------------------------------------------------- /someecards.com-downloader.py: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------- 2 | # Name: Someecards Downloader 3 | # Purpose: 4 | # 5 | # Author: Manoj M J 6 | # 7 | # Created: 8 | # Copyright: www.manojmj.com 9 | # Licence: 10 | #------------------------------------------------------------------------------- 11 | 12 | from bs4 import BeautifulSoup 13 | import urllib 14 | import os 15 | import sys 16 | import thread 17 | import threading 18 | import time 19 | 20 | 21 | dir = os.path.dirname(os.path.abspath(__file__)) 22 | someecardsdir = dir +"\\Someecards" 23 | 24 | if not os.path.exists(someecardsdir): 25 | os.makedirs(someecardsdir) 26 | class DonwloadThread(threading.Thread): 27 | def __init__(self,category,low,high): 28 | threading.Thread.__init__(self) 29 | self.category = category 30 | self.low=low 31 | self.high=high 32 | 33 | def run(self): 34 | print "Downloading Category: "+ self.category 35 | category_dir = someecardsdir+"\\"+self.category 36 | try: 37 | 38 | if not os.path.exists(category_dir): 39 | print "Making New Directory for Category "+self.category+"\n" 40 | os.makedirs(category_dir) 41 | except: 42 | pass 43 | 44 | for pageno in range(self.low,self.high): 45 | time.sleep(2) 46 | print "Entered page number"+str(pageno) 47 | main_url = "http://www.someecards.com/"+self.category+"/newest/"+str(pageno)+"/40" 48 | url_parts = main_url.split("/") 49 | category_name = url_parts[3] 50 | 51 | 52 | main_url_response = urllib.urlopen(main_url).read() 53 | main_url_soup = BeautifulSoup(main_url_response) 54 | 55 | 56 | if not main_url_soup.find_all('a',{'class':'cardthumb'}): 57 | print " Page no " + str(pageno) +" of " + category_name + "is blank \n" 58 | break 59 | 60 | duplicate_counter = 0 61 | breaker=False 62 | mainbreaker=False 63 | 64 | for comiclink in main_url_soup.find_all('a',{'class':'cardthumb'}): 65 | time.sleep(2) 66 | full_url ="http://www.someecards.com/" +comiclink['href'] 67 | full_url_response = urllib.urlopen(full_url).read() 68 | full_url_soup = BeautifulSoup(full_url_response) 69 | 70 | for imagelink in full_url_soup.find_all('link',{'rel':'image_src'}): 71 | 72 | imageurl = imagelink['href'] 73 | filename = imageurl.split('/')[5] 74 | filename = filename.replace('?','') 75 | filename = filename.replace(':','') 76 | filename = filename.replace('*','') 77 | filename = filename.replace('"','') 78 | path = os.path.join(category_dir,filename) 79 | if not os.path.exists(path): 80 | image_response = urllib.urlopen(imageurl).read() 81 | duplicate_counter=0 82 | with open (path,"wb") as data: 83 | time.sleep(2) 84 | data.write(image_response) 85 | print "Downloaded file "+ filename 86 | else: 87 | duplicate_counter +=1 88 | print " Duplicate found: "+ filename 89 | if duplicate_counter == 40: 90 | print "40 Duplicate Files Found One after the other. Looks like you have all files downloaded in this page. Exiting." 91 | breaker = True 92 | print " Break 1" 93 | break 94 | if breaker == True: 95 | mainbreaker=True 96 | print "Break 2" 97 | break 98 | if mainbreaker==True: 99 | print "Break 3" 100 | break 101 | 102 | def thread_initialize(choice): 103 | thread1=DonwloadThread(category_list[int(choice)],1,2) 104 | thread2=DonwloadThread(category_list[int(choice)],2,3) 105 | thread3=DonwloadThread(category_list[int(choice)],3,4) 106 | thread4=DonwloadThread(category_list[int(choice)],4,5) 107 | thread5=DonwloadThread(category_list[int(choice)],5,6) 108 | thread6=DonwloadThread(category_list[int(choice)],6,7) 109 | thread7=DonwloadThread(category_list[int(choice)],7,8) 110 | thread8=DonwloadThread(category_list[int(choice)],8,9) 111 | thread9=DonwloadThread(category_list[int(choice)],9,10) 112 | thread10=DonwloadThread(category_list[int(choice)],10,11) 113 | thread11=DonwloadThread(category_list[int(choice)],11,12) 114 | thread12=DonwloadThread(category_list[int(choice)],12,13) 115 | thread13=DonwloadThread(category_list[int(choice)],13,14) 116 | thread14=DonwloadThread(category_list[int(choice)],14,15) 117 | thread15=DonwloadThread(category_list[int(choice)],15,16) 118 | thread16=DonwloadThread(category_list[int(choice)],16,17) 119 | thread17=DonwloadThread(category_list[int(choice)],17,18) 120 | thread18=DonwloadThread(category_list[int(choice)],18,19) 121 | thread19=DonwloadThread(category_list[int(choice)],19,20) 122 | thread20=DonwloadThread(category_list[int(choice)],20,21) 123 | 124 | thread1.start() 125 | thread2.start() 126 | thread3.start() 127 | thread4.start() 128 | thread5.start() 129 | thread6.start() 130 | thread7.start() 131 | thread8.start() 132 | thread9.start() 133 | thread10.start() 134 | thread11.start() 135 | thread12.start() 136 | thread13.start() 137 | thread14.start() 138 | thread15.start() 139 | thread16.start() 140 | thread17.start() 141 | thread18.start() 142 | thread19.start() 143 | thread20.start() 144 | 145 | 146 | thread1.join() 147 | thread2.join() 148 | thread3.join() 149 | thread4.join() 150 | thread5.join() 151 | thread6.join() 152 | thread7.join() 153 | thread8.join() 154 | thread9.join() 155 | thread10.join() 156 | thread11.join() 157 | thread12.join() 158 | thread13.join() 159 | thread14.join() 160 | thread15.join() 161 | thread16.join() 162 | thread17.join() 163 | thread18.join() 164 | thread19.join() 165 | thread20.join() 166 | 167 | 168 | category_list = ['All','anniversary-cards','apology-collection-cards','baby-cards', 169 | 'better-like-buttons-cards','birthday-cards','breakup-cards','censored-cards', 170 | 'college-cards','confession-cards','congratulations-cards','courtesy-hello-cards', 171 | 'cry-for-help-cards','divorce-cards','drinking-cards','ecard-museum-cards','encouragement-cards', 172 | 'family-cards','fantasy-sports-cards','farewell-cards','flirting-cards','friendship-cards','get-well-cards', 173 | 'graduation-cards','honest-autocorrects-cards','honest-popups-cards','lgbt-cards','miss-you-cards','movies-cards', 174 | 'pets-cards','pregnancy-cards','psas-cards','ransom-cards-cards','reminders-cards','seasonal-cards','sports-cards', 175 | 'sympathy-cards','thanks-cards','thinking-of-you-cards','tv-cards','wedding-cards','weekend-cards','workplace-cards', 176 | 'christmas-cards','kwanzaa-cards','new-years-cards','mlk-day-cards','chinese-new-year-cards','black-history-month-cards', 177 | 'rake-cards','cougar-town-cards','dance-moms-cards','clorox-bleach-it-away-cards'] 178 | 179 | for i in range(0,53): 180 | print str(i)+"."+category_list[i] 181 | 182 | choice = raw_input("Enter a choice") 183 | if choice=="0": 184 | for x in range(5,53): 185 | thread_initialize(x) 186 | elif int(choice)>=1 and int(choice)<=52: 187 | thread_initialize(choice) 188 | 189 | else: 190 | print "Invalid choice" 191 | --------------------------------------------------------------------------------