├── picdescbot ├── __init__.py ├── logger.py ├── twitter.py ├── tumblr.py └── common.py ├── requirements.txt ├── README.md ├── .gitignore ├── LICENSE └── bot.py /picdescbot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tweepy 2 | requests 3 | wordfilter 4 | python-tumblpy 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # picdescbot 2 | 3 | "picdescbot" is a twitter/tumblr bot that downloads random images from Wikimedia Commons, and tweets them with a description from Microsoft Cognitive Services Computer vision API. 4 | 5 | You can see it in action at https://twitter.com/picdescbot or here https://picdescbot.tumblr.com 6 | 7 | Made by Elad Alfassa. 
8 | 9 | Dependencies 10 | ============ 11 | * tweepy 12 | * wordfilter 13 | * requests 14 | -------------------------------------------------------------------------------- /picdescbot/logger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file contains logging related functionality 4 | # Copyright (C) 2017 Elad Alfassa 5 | import logging 6 | setup_done = False 7 | 8 | fomatstr = '%(asctime)s : %(name)s: %(levelname)s: %(message)s' 9 | datefmt = "%Y-%m-%d %H:%M:%S" 10 | 11 | logging.basicConfig(level=logging.INFO, 12 | format=fomatstr, 13 | datefmt=datefmt, 14 | filename="all.log") 15 | 16 | formatter = logging.Formatter(fomatstr, datefmt=datefmt) 17 | 18 | console = logging.StreamHandler() 19 | console.setLevel(logging.INFO) 20 | console.setFormatter(formatter) 21 | logging.getLogger('').addHandler(console) 22 | 23 | filtered = logging.FileHandler("filtered.log") 24 | filtered.setLevel(logging.WARNING) 25 | filtered.setFormatter(formatter) 26 | logging.getLogger('').addHandler(filtered) 27 | 28 | 29 | def get(name): 30 | return logging.getLogger(name) 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config.ini 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | 63 | #Ipython Notebook 64 | .ipynb_checkpoints 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Elad Alfassa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /picdescbot/twitter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file implements twitter-related functionality 4 | # Copyright (C) 2016 Elad Alfassa 5 | 6 | import time 7 | import tweepy 8 | from . import logger 9 | 10 | 11 | class Client(object): 12 | name = "twitter" 13 | 14 | def __init__(self, config): 15 | auth = tweepy.OAuthHandler(config['consumer_key'], 16 | config['consumer_secret']) 17 | auth.set_access_token(config['token'], config['token_secret']) 18 | self.api = tweepy.API(auth) 19 | self.log = logger.get(__name__) 20 | 21 | def send(self, picture): 22 | "Send a tweet. `picture` is a `Result` object from `picdescbot.common`" 23 | retries = 0 24 | status = None 25 | filename = picture.url.split('/')[-1] 26 | data = picture.download_picture() 27 | try: 28 | while retries < 3 and not status: 29 | if retries > 0: 30 | self.log.info('retrying...') 31 | data.seek(0) 32 | try: 33 | text = f"{picture.caption}\n\n{picture.source_url}" 34 | status = self.api.update_with_media(filename=filename, 35 | status=text, 36 | file=data) 37 | except tweepy.TweepError as e: 38 | self.log.error("Error when sending tweet: %s" % e) 39 | retries += 1 40 | if retries >= 3: 41 | raise 42 | else: 43 | time.sleep(5) 44 | finally: 45 | data.close(really=True) 46 | return status.id 47 | -------------------------------------------------------------------------------- /picdescbot/tumblr.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file implements tumblr-related functionality 4 | # Copyright (C) 2016 Elad Alfassa 5 | 6 | from tumblpy import Tumblpy 
7 | import tumblpy.exceptions 8 | import time 9 | from . import common 10 | from . import logger 11 | 12 | DEFAULT_PARAMS = {'type': 'photo', 'state': 'queue', 13 | 'native_inline_images': True} 14 | 15 | # using the goo.gl link in the template instead of the actual link, because 16 | # tumblr on mobile doesn't show pages when linked with the actual link - 17 | # it just sends the user to the blog's front page instead. 18 | # The goo.gl redirect makes the tumblr app think it's an external website, 19 | # and open it in the browser instead. 20 | 21 | TEMPLATE = "

{description}

" + \ 22 | "

@picdescbot" + \ 23 | " | " + \ 24 | "about this bot | " + \ 25 | "picture source

" + \ 26 | "

all text in this post is 100% computer-generated, including tags

" 27 | DEFAULT_TAGS = ['picdescbot', 'bot'] 28 | 29 | # All kinds of tags that should be filtered from the bot's post 30 | tag_blacklist = {'woman', 'black', 'white', 'man', 'body', 'large', 'tall', 31 | 'small', 'young', 'old', 'top', 'boy', 'girl'} 32 | 33 | 34 | def filter_tags(tags): 35 | filtered = [] 36 | for tag in tags: 37 | if tag not in tag_blacklist and not common.word_filter.blacklisted(tag): 38 | filtered.append(tag) 39 | return filtered 40 | 41 | 42 | class Client(object): 43 | name = "tumblr" 44 | 45 | def __init__(self, config): 46 | self.client = Tumblpy(config['consumer_key'], config['consumer_secret'], 47 | config['token'], config['token_secret']) 48 | self.blog_id = config['blog_id'] 49 | self.log = logger.get(__name__) 50 | 51 | def send(self, picture): 52 | "Post a post. `picture` is a `Result` object from `picdescbot.common`" 53 | 54 | post_text = TEMPLATE.format(description=picture.caption, 55 | source=picture.source_url) 56 | 57 | tags = DEFAULT_TAGS + filter_tags(picture.tags) 58 | 59 | params = {'caption': post_text, 60 | 'source': picture.url, 61 | 'tags': ','.join(tags)} 62 | params.update(DEFAULT_PARAMS) 63 | 64 | retries = 0 65 | post = None 66 | while retries < 3 and post is None: 67 | if retries > 0: 68 | self.log.info('retrying...') 69 | try: 70 | post = self.client.post("post", blog_url=self.blog_id, 71 | params=params) 72 | except tumblpy.exceptions.TumblpyError as e: 73 | self.log.error("Error when sending tumblr post: %s" % e) 74 | retries += 1 75 | if retries >= 3: 76 | raise 77 | else: 78 | time.sleep(5) 79 | return post['id'] 80 | -------------------------------------------------------------------------------- /bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding=utf-8 3 | # picdescbot: a tiny twitter bot that tweets random pictures from wikipedia and their descriptions 4 | # Copyright (C) 2016 Elad Alfassa 5 | 6 | from __future__ import 
unicode_literals, absolute_import, print_function 7 | 8 | import argparse 9 | import configparser 10 | import os.path 11 | import picdescbot.common 12 | import picdescbot.logger 13 | import picdescbot.tumblr 14 | import picdescbot.twitter 15 | import sys 16 | import tweepy 17 | 18 | 19 | def main(): 20 | if sys.version_info.major < 3: 21 | print("This program does not support python2", file=sys.stderr) 22 | return 23 | 24 | # Start boring setup stuff (config file handling, etc) 25 | 26 | parser = argparse.ArgumentParser(description='Tweets pictures from wikipedia') 27 | parser.add_argument('config', metavar='config', nargs='?', type=str, 28 | default=None, help='Path to config file') 29 | parser.add_argument('--manual', action="store_true") 30 | parser.add_argument('--tumblr-only', action="store_true") 31 | parser.add_argument('--disable-tag-blacklist', action="store_true") 32 | parser.add_argument('--wikimedia-filename', nargs='?', type=str, 33 | default=None, help='Describe the specified picture from wikimedia, instead of a random one') 34 | args = parser.parse_args() 35 | config_file = "config.ini" 36 | if args.config is not None: 37 | config_file = os.path.expanduser(args.config) 38 | 39 | config = configparser.ConfigParser() 40 | config.read(config_file) 41 | 42 | if not args.tumblr_only: 43 | if (not config.has_section('twitter') or not 44 | config.has_option('twitter', 'consumer_key') or not 45 | config.has_option('twitter', 'consumer_secret')): 46 | 47 | print("You'll need to get a consumer key and a consumer secret" + 48 | "from https://dev.twitter.com/apps") 49 | key = input('Enter twitter consumer key: ') 50 | secret = input('Enter twitter consumer secret: ') 51 | if not config.has_section('twitter'): 52 | config.add_section('twitter') 53 | config.set('twitter', 'consumer_key', key) 54 | config.set('twitter', 'consumer_secret', secret) 55 | with open(config_file, 'w') as f: 56 | config.write(f) 57 | consumer_key = config['twitter']['consumer_key'] 58 | 
consumer_secret = config['twitter']['consumer_secret'] 59 | 60 | # twitter auth stuff 61 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 62 | if (config.has_option('twitter', 'token') and 63 | config.has_option('twitter', 'token_secret')): 64 | 65 | auth.set_access_token(config['twitter']['token'], 66 | config['twitter']['token_secret']) 67 | else: 68 | print("Please authenticate with twitter:") 69 | print(auth.get_authorization_url()) 70 | code = input('Enter authentication code from twitter: ').strip() 71 | auth.get_access_token(verifier=code) 72 | config.set('twitter', 'token', auth.access_token) 73 | config.set('twitter', 'token_secret', auth.access_token_secret) 74 | with open(config_file, 'w') as f: 75 | config.write(f) 76 | 77 | if (not config.has_section('mscognitive') or not 78 | config.has_option('mscognitive', 'api_key') or not 79 | config.has_option('mscognitive', 'endpoint')): 80 | apikey = input("Please enter your Microsoft Computer Vision API Key:") 81 | if not config.has_section('mscognitive'): 82 | config.add_section('mscognitive') 83 | config.set('mscognitive', 'api_key', apikey) 84 | endpoint = input("Please provide the endpoint for the Computer Vision API:") 85 | config.set('mscognitive', 'endpoint', endpoint) 86 | with open(config_file, 'w') as f: 87 | config.write(f) 88 | else: 89 | apikey = config['mscognitive']['api_key'] 90 | endpoint = config['mscognitive']['endpoint'] 91 | 92 | # end boring setup stuff 93 | 94 | if args.disable_tag_blacklist: 95 | picdescbot.common.tags_blacklist = {} 96 | args.manual = True # less filtering means manual mode is mandatory 97 | 98 | cvapi = picdescbot.common.CVAPIClient(apikey, endpoint) 99 | if args.tumblr_only and not config.has_section('tumblr'): 100 | print('tumblr is not configured') 101 | print("You'll neeed the following fields: ") 102 | print("consumer_key, consumer_secret, token, token_secret, blog_id") 103 | return 104 | 105 | log = picdescbot.logger.get('main') 106 | 107 | providers 
= [] 108 | if config.has_section('tumblr'): 109 | providers.append(picdescbot.tumblr.Client(config['tumblr'])) 110 | 111 | if not args.tumblr_only: 112 | providers.append(picdescbot.twitter.Client(config['twitter'])) 113 | 114 | post = False 115 | while not post: 116 | result = cvapi.get_picture_and_description(args.wikimedia_filename) 117 | if args.manual: 118 | action = None 119 | print(result.url) 120 | print(result.caption) 121 | while action not in ['y', 'n']: 122 | action = input("Post this? [y/n]: ") 123 | if action == "y": 124 | post = True 125 | else: 126 | post = True 127 | 128 | for provider in providers: 129 | status_id = provider.send(result) 130 | log.info("Sent {0}: {1} ({2})".format(provider.name, status_id, 131 | result.caption)) 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /picdescbot/common.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file contains common basic functionality of the bot, such as getting the picture and description 4 | # Copyright (C) 2016-2017 Elad Alfassa 5 | 6 | from __future__ import unicode_literals, absolute_import, print_function 7 | 8 | from wordfilter import Wordfilter 9 | import json 10 | import re 11 | import requests 12 | import time 13 | import lxml.html 14 | from . import logger 15 | from io import BytesIO 16 | 17 | log = logger.get("common") 18 | 19 | MEDIAWIKI_API = "https://commons.wikimedia.org/w/api.php" 20 | 21 | HEADERS = {"User-Agent": "picdescbot, http://github.com/elad661/picdescbot"} 22 | 23 | supported_formats = re.compile('\.(png|jpe?g|gif)$', re.I) 24 | word_filter = Wordfilter() 25 | 26 | # I really don't want the bot to show this kind of imagery! 
27 | word_filter.add_words(['nazi', 'hitler', 'reich']) 28 | 29 | # I can't trust Microsoft's algorithm to not be racist, so I should probably 30 | # make the bot avoid posting images with the following words in them. 31 | # I'm not using wordfilter here because it would over-filter in some cases. 32 | # also filter "gun" because this is not the kind of content I want the bot to post 33 | # This is matched only against the caption generated by CVAPI. 34 | extra_filter = {'ape', 'apes', 'monkey', 'monkeys', 'gun'} 35 | 36 | # Blacklisted phrases (instead of words) to blacklist certain phrases 37 | # in the wikimedia description 38 | blacklisted_phrases = {'comic strip', 'logo', 'biblical illustration', 'church', 39 | 'historical document', 'donald trump'} 40 | 41 | # Blacklist some categories, just in case. These are matched on a substring 42 | # basis, against the page's categories and the titles of the wikipages using 43 | # the picture. 44 | category_blacklist = ['september 11', 'hitler', 'nazi', 'antisemit', 'libel', 45 | 'apartheid', 'racism', 'lynching', 'cartoons', 46 | 'holocaust', 'auschwitz', 'stereotypes', 'flags', 'porn', 47 | 'homophobia', 'transphobia', 'logos', 48 | 'scans from google books', 'little nemo', 49 | 'stolperstein', 'songbird specimens', 'terror', 50 | 'bible illustrations', 'jesuit symbols', 51 | 'christian symbols', 'symbols of religion', 52 | 'symbols of islam', 'jewish symbols', 'pistols', 53 | 'corpse', 'victim', 'ultrasound', 'donald trump', 54 | 'pascual marín'] 55 | 56 | # Gender neutralization helps prevent accidental transphobic juxtapositions 57 | # which can occur when CVAPI uses gendered words in the description, but their 58 | # gender detection is wrong. Computers shouldn't try to detect gender, and 59 | # always be neautral. You can't know someone's gender just by how they look! 
# Mapping of gendered words to their neutral replacements. An empty-string
# value means the word is dropped from the caption entirely.
gendered_words = {'woman': 'person',
                  'man': 'person',
                  'women': 'people',
                  'man\'s': 'person\'s',
                  'woman\'s': 'person\'s',
                  'mans': 'persons',
                  'womans': 'persons',
                  'men': 'people',
                  'guy': 'person',
                  'boy': 'person',
                  'girl': 'person',
                  'boys': 'people',
                  'girls': 'people',
                  'lady': 'person',
                  'ladies': 'people',
                  'gentleman': 'person',
                  'gentlemen': 'people',
                  'female': '',
                  'male': '',
                  'she': 'they',
                  # It's probably more likely to say "woman and her phone" than
                  # "someone gives a phone to her", so "their" is probably a
                  # better fit here. Would need more complex parsing to know
                  # for sure.
                  'her': 'their',
                  'hers': 'theirs',
                  'herself': 'themself',
                  'he': 'they',
                  'him': 'them',
                  # It's more likely to see "man and his phone" than "this
                  # phone is his", so "their" is better here than "theirs"
                  'his': 'their',
                  'himself': 'themself'}


def gender_neutralize(phrase):
    """Replace gendered words in the phrase with neutral ones.

    The phrase is lowercased in the process; returns the neutralized phrase.
    """
    neutralized = []
    for word in phrase.lower().split():
        if word in gendered_words:
            word = gendered_words[word]
        if word != '':
            neutralized.append(word)
    neutralized = ' '.join(neutralized)
    if neutralized != phrase:
        log.info('Gender neutralized: "{0}" => "{1}"'.format(phrase, neutralized))
    return neutralized


# Tags (from the CV API) that disqualify a picture entirely.
tags_blacklist = {'text', 'screenshot', 'military', 'church'}


def tag_blacklisted(tags):
    """Return True if any of the given tags is in the tag blacklist."""
    for tag in tags:
        if tag in tags_blacklist:
            return True
    return False


def is_blacklisted(caption):
    """Check caption for forbidden words; True means it must be discarded."""
    if "a suit and tie" in caption:
        return True
    if word_filter.blacklisted(caption):
        return True
    # extra_filter is matched on whole words only, to avoid over-filtering
    for word in caption.split():
        if word in extra_filter:
            return True
    return False


def remove_html_tags(text):
    """Remove all HTML tags (and their attributes) from a string."""
    return ' '.join(lxml.html.fromstring(text).itertext())


def log_discarded(url, reason, description=None):
    """Log a discarded picture, with an optional description for context."""
    line = "Discarded {0} because of {1}".format(url, reason)
    if description is not None:
        line += ' - "{0}"'.format(description)
    log.warning(line)


def get_picture(filename=None):
    """Get a picture from Wikimedia Commons.

    A random picture is used when `filename` is not specified.
    Returns the picture's `imageinfo` dict, or None when the result is
    unusable (not a bitmap, too small, unsupported format, or caught by one
    of the blacklists).
    """
    params = {"action": "query",
              "prop": "imageinfo|categories|globalusage",
              "iiprop": "url|size|extmetadata|mediatype",
              "iiurlheight": "1080",
              "format": "json"}
    if filename is None:
        params['generator'] = 'random'
        params['grnnamespace'] = '6'  # namespace 6 = "File:"
    else:
        params['titles'] = 'File:%s' % filename

    response = requests.get(MEDIAWIKI_API,
                            params=params,
                            headers=HEADERS).json()
    page = list(response['query']['pages'].values())[0]  # This API is ugly
    if 'imageinfo' not in page:
        # e.g. a missing or deleted file: nothing usable here
        return None
    imageinfo = page['imageinfo'][0]
    url = imageinfo['url']
    extra_metadata = imageinfo['extmetadata']

    # check that the file is actually a picture
    if imageinfo['mediatype'] != "BITMAP":
        return None

    # Make sure the picture is big enough
    if imageinfo['width'] <= 50 or imageinfo['height'] <= 50:
        return None

    # Make sure the format is supported
    if not supported_formats.search(url):
        return None

    # We got a picture, now let's verify we can use it.
    if word_filter.blacklisted(page['title']):  # Check file name for bad words
        log_discarded(url, 'badword in page title: "{0}"'.format(page['title']))
        return None
    # Check picture title for bad words
    if word_filter.blacklisted(extra_metadata['ObjectName']['value']):
        log_discarded(url, 'badword in picture title: "{0}"'.format(extra_metadata['ObjectName']['value']))
        return None
    # Check restrictions for more bad words
    if word_filter.blacklisted(extra_metadata['Restrictions']['value']):
        log_discarded(url, 'badword in restrictions: "{0}"'.format(extra_metadata['Restrictions']['value']))
        return None

    # Check file description for bad words and blacklisted phrases
    if 'ImageDescription' in extra_metadata:
        cleaned_description = remove_html_tags(extra_metadata['ImageDescription']['value'])
        if word_filter.blacklisted(cleaned_description):
            log_discarded(url, 'badword in image description: "{0}"'.format(cleaned_description))
            return None

        for phrase in blacklisted_phrases:
            if phrase in cleaned_description.lower().strip():
                log_discarded(url, 'blacklisted phrase "{0}" found in description "{1}"'.format(phrase, cleaned_description))
                return None

    # The mediawiki API is awful: there's another list of categories in
    # extmetadata which is not the same as the one returned when asking for
    # "categories", so both have to be checked.
    extra_categories = extra_metadata['Categories']['value'].lower()

    for blacklisted_category in category_blacklist:
        for category in page.get('categories', []):
            if blacklisted_category in category['title'].lower():
                log_discarded(url, 'blacklisted category "{0}"'.format(category['title']))
                return None

        if blacklisted_category in extra_categories:
            log_discarded(url, 'blacklisted category "{0}" (in extra)'.format(blacklisted_category))
            return None

    # TODO check parent categories for each category in metadata,
    # and compare them against the blacklist too. This will require
    # extra API calls

    # if the picture is used in any wikipage with unwanted themes, we
    # probably don't want to use it.
    for wikipage in page.get('globalusage', []):
        if word_filter.blacklisted(wikipage['title'].lower()):
            log_discarded(url, 'page usage "{0}"'.format(wikipage['title']))
            return None
        for blacklisted_category in category_blacklist:
            if blacklisted_category in wikipage['title']:  # substring matching
                log_discarded(url, 'page usage "{0}"'.format(wikipage['title']))
                return None
    return imageinfo


class CVAPIClient(object):
    "Microsoft Cognitive Services Client"

    def __init__(self, apikey, endpoint):
        self.apikey = apikey
        self.endpoint = endpoint + '/analyze'

    def describe_picture(self, url):
        """Get a description for a picture using Microsoft Cognitive Services.

        Returns the parsed JSON response, or None when all retries failed.
        """
        params = {'visualFeatures': 'Description,Adult'}
        # named `body` rather than `json` so the stdlib json module isn't shadowed
        body = {'url': url}
        headers = {'Content-Type': 'application/json',
                   'Ocp-Apim-Subscription-Key': self.apikey}

        result = None
        retries = 0

        while retries < 15 and not result:
            response = requests.post(self.endpoint, json=body, params=params,
                                     headers=headers)
            if response.status_code == 429:  # rate-limited: short sleep, retry
                log.error("Error from mscognitive: %s" % (response.json()))
                retries += 1
                if retries >= 15:
                    # bugfix: this branch was unreachable before (the guard
                    # checked `retries < 15` prior to incrementing)
                    log.error('failed after retrying!')
                else:
                    time.sleep(2)
            elif response.status_code in (200, 201):
                result = response.json() if response.content else None
            else:
                log.error("Error code: %d" % (response.status_code))
                log.error("url: %s" % url)
                try:
                    log.error(response.json())
                except Exception:  # response body isn't valid JSON
                    log.error(response.text)
                retries += 1
                sleep = 20 + retries * 4  # progressive back-off
                log.info("attempt: {0}, sleeping for {1}".format(retries, sleep))
                time.sleep(sleep)

        return result

    def get_picture_and_description(self, filename=None, max_retries=20):
        """Get a picture and a description.

        Retries until a usable result is produced or `max_retries` is
        reached, then raises Exception.
        """
        pic = None
        retries = 0
        while retries <= max_retries:
            while pic is None:
                pic = get_picture(filename)
                if pic is None:
                    # We got a bad picture; wait a bit to be polite to the API server
                    time.sleep(1)
            url = pic['url']
            # Use a scaled-down image if the original is too big for CVAPI
            if pic['size'] > 3000000 or pic['width'] > 8192 or pic['height'] > 8192:
                url = pic['thumburl']

            result = self.describe_picture(url)

            if result is not None:
                description = result['description']
                adult = result['adult']
                if not adult['isAdultContent'] and not adult['isRacyContent']:  # no nudity and such
                    if len(description['captions']) > 0:
                        caption = description['captions'][0]['text']
                        caption = gender_neutralize(caption)
                        if not is_blacklisted(caption):
                            if not tag_blacklisted(description['tags']):
                                return Result(caption,
                                              description['tags'], url,
                                              pic['descriptionshorturl'])
                            else:
                                log_discarded(url, "tag blacklist", caption)
                                log.warning('tags: %s' % description['tags'])
                        else:
                            log_discarded(url, "caption blacklist", caption)
                    else:
                        log.warning("No caption for url: {0}".format(url))
                else:
                    log_discarded(url, "adult content", description['captions'])
            retries += 1
            log.warning("Not good, retrying...")
            pic = None
            time.sleep(3)  # sleep to be polite to the API servers

        raise Exception("Maximum retries exceeded, no good picture")


class NonClosingBytesIO(BytesIO):
    """Like BytesIO, but doesn't close so easily.

    To prevent tweepy from closing the picture on error, this class requires
    a boolean parameter to be passed to close() to actually close the buffer.
    """

    def close(self, really=False):
        """Close the BytesIO object, but only if you're really sure."""
        if really:
            return super().close()


class Result(object):
    "Represents a picture and its description"

    def __init__(self, caption, tags, url, source_url):
        self.caption = caption      # gender-neutralized CVAPI caption
        self.tags = tags            # CVAPI tags (already past tag_blacklisted)
        self.url = url              # direct image URL
        self.source_url = source_url  # wikimedia description page short URL

    def download_picture(self):
        """Download the image; returns a NonClosingBytesIO object.

        Retries up to 20 times; raises Exception when every attempt fails.
        """
        retries = 0
        log.info("downloading " + self.url)
        while retries <= 20:
            if retries > 0:
                log.info('Trying again...')

            try:
                response = requests.get(self.url, headers=HEADERS)
            except requests.exceptions.RequestException as e:
                log.exception(e)
                response = None

            if response is not None and response.status_code == 200:
                return NonClosingBytesIO(response.content)
            if response is not None:
                # bugfix: the original concatenated str + int here (TypeError)
                # and dereferenced response.status_code even when response was
                # None after a network error
                log.error("Fetching picture failed: %d" % response.status_code)
            retries += 1
            time.sleep(3)
        log.error("Maximum retries exceeded when downloading a picture")
        raise Exception("Maximum retries exceeded when downloading a picture")
--------------------------------------------------------------------------------