"""imgur-hosted-reddit-posted-downloader

A Python script that checks Reddit for Imgur posts and downloads the
corresponding images.
"""
import re, praw, requests, os, glob, sys
from bs4 import BeautifulSoup

MIN_SCORE = 100  # the default minimum score before it is downloaded

if len(sys.argv) < 2:
    # no command line options sent: show usage and quit
    print('Usage:')
    print('  python %s subreddit [minimum score]' % (sys.argv[0]))
    sys.exit()
else:
    # the subreddit was specified:
    targetSubreddit = sys.argv[1]
    if len(sys.argv) >= 3:
        # the desired minimum score was also specified:
        MIN_SCORE = int(sys.argv[2])


# Matches direct-image links. Group 2 is the filename part (it is greedy, so a
# trailing "?query" ends up inside group 2 and is stripped later by callers).
# Dots are escaped so "." only matches a literal dot in the hostname.
imgurUrlPattern = re.compile(r'(http://i\.imgur\.com/(.*))(\?.*)?')


def downloadImage(imageUrl, localFileName):
    """Download imageUrl and save it as localFileName in the current directory.

    Does nothing (silently) when the server responds with a non-200 status.
    """
    # stream=True so the image is written to disk in 4 KB chunks instead of
    # being buffered entirely in memory first.
    response = requests.get(imageUrl, stream=True)
    if response.status_code == 200:
        print('Downloading %s...' % (localFileName))
        with open(localFileName, 'wb') as fo:
            for chunk in response.iter_content(4096):
                fo.write(chunk)

# Connect to reddit and download the subreddit front page
r = praw.Reddit(user_agent='CHANGE THIS TO A UNIQUE VALUE')  # Note: Be sure to change the user-agent to something unique.
# Fetch the subreddit's current "hot" listing (praw 3.x API).
submissions = r.get_subreddit(targetSubreddit).get_hot(limit=25)
# Or use one of these functions:
# .get_top_from_year(limit=25)
# .get_top_from_month(limit=25)
# .get_top_from_week(limit=25)
# .get_top_from_day(limit=25)
# .get_top_from_hour(limit=25)
# .get_top_from_all(limit=25)

# Process all the submissions from the front page
for submission in submissions:
    # Check for all the cases where we will skip a submission:
    if "imgur.com/" not in submission.url:
        continue  # skip non-imgur submissions
    if submission.score < MIN_SCORE:
        continue  # skip submissions below the minimum score (should be rare for "hot" submissions)
    if len(glob.glob('reddit_%s_%s_*' % (targetSubreddit, submission.id))) > 0:
        continue  # we've already downloaded files for this reddit submission

    if 'http://imgur.com/a/' in submission.url:
        # This is an album submission: scrape the album page for every image link.
        albumId = submission.url[len('http://imgur.com/a/'):]
        htmlSource = requests.get(submission.url).text

        # Explicit parser so bs4 doesn't pick an environment-dependent default.
        soup = BeautifulSoup(htmlSource, 'html.parser')
        matches = soup.select('.album-view-image-link a')
        for match in matches:
            imageUrl = match['href']
            if imageUrl.startswith('//'):
                # protocol-relative href: prepend a scheme (same handling as
                # the single-image branch below) instead of doing it blindly,
                # which would corrupt hrefs that already carry "http://".
                imageUrl = 'http:' + imageUrl
            if '?' in imageUrl:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
            else:
                imageFile = imageUrl[imageUrl.rfind('/') + 1:]
            localFileName = 'reddit_%s_%s_album_%s_imgur_%s' % (targetSubreddit, submission.id, albumId, imageFile)
            downloadImage(imageUrl, localFileName)

    elif 'http://i.imgur.com/' in submission.url:
        # The URL is a direct link to the image.
        mo = imgurUrlPattern.search(submission.url)  # using regex here instead of BeautifulSoup because we are parsing a url, not html

        imgurFilename = mo.group(2)
        if '?' in imgurFilename:
            # The regex doesn't catch a "?" at the end of the filename, so we remove it here.
            imgurFilename = imgurFilename[:imgurFilename.find('?')]

        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imgurFilename)
        downloadImage(submission.url, localFileName)

    elif 'http://imgur.com/' in submission.url:
        # This is an Imgur page with a single image: scrape the page for the direct link.
        htmlSource = requests.get(submission.url).text  # download the image's page
        soup = BeautifulSoup(htmlSource, 'html.parser')
        imageUrl = soup.select('.image a')[0]['href']
        if imageUrl.startswith('//'):
            # if no schema is supplied in the url, prepend 'http:' to it
            imageUrl = 'http:' + imageUrl

        if '?' in imageUrl:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
        else:
            imageFile = imageUrl[imageUrl.rfind('/') + 1:]

        localFileName = 'reddit_%s_%s_album_None_imgur_%s' % (targetSubreddit, submission.id, imageFile)
        downloadImage(imageUrl, localFileName)