"""imgur-hosted-reddit-posted-downloader

A Python script that checks Reddit for Imgur posts and downloads the
corresponding images.
"""
import re, praw, requests, os, glob, sys
from bs4 import BeautifulSoup

MIN_SCORE = 100  # the default minimum score before it is downloaded

if len(sys.argv) < 2:
    # no command line options sent: show usage and quit
    print('Usage:')
    print('  python %s subreddit [minimum score]' % (sys.argv[0]))
    sys.exit()
else:
    # the subreddit was specified:
    targetSubreddit = sys.argv[1]
    if len(sys.argv) >= 3:
        # the desired minimum score was also specified:
        MIN_SCORE = int(sys.argv[2])


# Matches direct-image links. Group 2 is the filename part (it is greedy, so a
# trailing "?query" ends up inside group 2 and is stripped later by callers).
# Dots are escaped so "." only matches a literal dot in the hostname.
imgurUrlPattern = re.compile(r'(http://i\.imgur\.com/(.*))(\?.*)?')


def downloadImage(imageUrl, localFileName):
    """Download imageUrl and save it as localFileName in the current directory.

    Does nothing (silently) when the server responds with a non-200 status.
    """
    # stream=True so the image is written to disk in 4 KB chunks instead of
    # being buffered entirely in memory first.
    response = requests.get(imageUrl, stream=True)
    if response.status_code == 200:
        print('Downloading %s...' % (localFileName))
        with open(localFileName, 'wb') as fo:
            for chunk in response.iter_content(4096):
                fo.write(chunk)

# Connect to reddit and download the subreddit front page
r = praw.Reddit(user_agent='CHANGE THIS TO A UNIQUE VALUE')  # Note: Be sure to change the user-agent to something unique.
# Fetch the subreddit's current "hot" listing (praw 3.x API).
submissions = r.get_subreddit(targetSubreddit).get_hot(limit=25)
# Or use one of these functions:
# .get_top_from_year(limit=25)
# .get_top_from_month(limit=25)
# .get_top_from_week(limit=25)
# .get_top_from_day(limit=25)
# .get_top_from_hour(limit=25)
# .get_top_from_all(limit=25)

# Process all the submissions from the front page
for submission in submissions:
    # Check for all the cases where we will skip a submission:
    if "imgur.com/" not in submission.url:
        continue  # skip non-imgur submissions
    if submission.score < MIN_SCORE:
        continue  # skip submissions below the minimum score (should be rare for "hot" submissions)
    if len(glob.glob('reddit_%s_%s_*' % (targetSubreddit, submission.id))) > 0:
        continue  # we've already downloaded files for this reddit submission

    if 'http://imgur.com/a/' in submission.url:
        # This is an album submission: scrape the album page for every image link.
        albumId = submission.url[len('http://imgur.com/a/'):]
        htmlSource = requests.get(submission.url).text

        # Explicit parser so bs4 doesn't pick an environment-dependent default.
        soup = BeautifulSoup(htmlSource, 'html.parser')
        matches = soup.select('.album-view-image-link a')
        for match in matches:
            imageUrl = match['href']
            if imageUrl.startswith('//'):
                # protocol-relative href: prepend a scheme (same handling as
                # the single-image branch below) instead of doing it blindly,
                # which would corrupt hrefs that already carry "http://".
                imageUrl = 'http:' + imageUrl
            if '?' in imageUrl:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
            else:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:]
            localFileName = 'reddit_%s_%s_album_%s_imgur_%s' % (targetSubreddit, submission.id, albumId, imageFile)
            downloadImage(imageUrl, localFileName)

    elif 'http://i.imgur.com/' in submission.url:
        # The URL is a direct link to the image.
        mo = imgurUrlPattern.search(submission.url)  # using regex here instead of BeautifulSoup because we are parsing a url, not html

        imgurFilename = mo.group(2)
        if '?' in imgurFilename:
            # The regex doesn't catch a "?" at the end of the filename, so we remove it here.
            imgurFilename = imgurFilename[:imgurFilename.find('?')]

        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imgurFilename)
        downloadImage(submission.url, localFileName)

    elif 'http://imgur.com/' in submission.url:
        # This is an Imgur page with a single image: scrape the page for the direct link.
        htmlSource = requests.get(submission.url).text  # download the image's page
        soup = BeautifulSoup(htmlSource, 'html.parser')
        imageUrl = soup.select('.image a')[0]['href']
        if imageUrl.startswith('//'):
            # if no schema is supplied in the url, prepend 'http:' to it
            imageUrl = 'http:' + imageUrl

        if '?' in imageUrl:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
        else:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:]

        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imageFile)
        downloadImage(imageUrl, localFileName)