├── picdescbot ├── __init__.py ├── logger.py ├── twitter.py ├── tumblr.py └── common.py ├── requirements.txt ├── README.md ├── .gitignore ├── LICENSE └── bot.py /picdescbot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tweepy 2 | requests 3 | wordfilter 4 | python-tumblpy 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # picdescbot 2 | 3 | "picdescbot" is a twitter/tumblr bot that downloads random images from Wikimedia Commons, and tweets them with a description from Microsoft Cognitive Services Computer vision API. 4 | 5 | You can see it in action at https://twitter.com/picdescbot or here https://picdescbot.tumblr.com 6 | 7 | Made by Elad Alfassa. 
8 | 9 | Dependencies 10 | ============ 11 | * tweepy 12 | * wordfilter 13 | * requests 14 | -------------------------------------------------------------------------------- /picdescbot/logger.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file contains logging related functionality 4 | # Copyright (C) 2017 Elad Alfassa 5 | import logging 6 | setup_done = False 7 | 8 | fomatstr = '%(asctime)s : %(name)s: %(levelname)s: %(message)s' 9 | datefmt = "%Y-%m-%d %H:%M:%S" 10 | 11 | logging.basicConfig(level=logging.INFO, 12 | format=fomatstr, 13 | datefmt=datefmt, 14 | filename="all.log") 15 | 16 | formatter = logging.Formatter(fomatstr, datefmt=datefmt) 17 | 18 | console = logging.StreamHandler() 19 | console.setLevel(logging.INFO) 20 | console.setFormatter(formatter) 21 | logging.getLogger('').addHandler(console) 22 | 23 | filtered = logging.FileHandler("filtered.log") 24 | filtered.setLevel(logging.WARNING) 25 | filtered.setFormatter(formatter) 26 | logging.getLogger('').addHandler(filtered) 27 | 28 | 29 | def get(name): 30 | return logging.getLogger(name) 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config.ini 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | 57 | # Sphinx documentation 58 | docs/_build/ 59 | 60 | # PyBuilder 61 | target/ 62 | 63 | #Ipython Notebook 64 | .ipynb_checkpoints 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Elad Alfassa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /picdescbot/twitter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file implements twitter-related functionality 4 | # Copyright (C) 2016 Elad Alfassa 5 | 6 | import time 7 | import tweepy 8 | from . import logger 9 | 10 | 11 | class Client(object): 12 | name = "twitter" 13 | 14 | def __init__(self, config): 15 | auth = tweepy.OAuthHandler(config['consumer_key'], 16 | config['consumer_secret']) 17 | auth.set_access_token(config['token'], config['token_secret']) 18 | self.api = tweepy.API(auth) 19 | self.log = logger.get(__name__) 20 | 21 | def send(self, picture): 22 | "Send a tweet. `picture` is a `Result` object from `picdescbot.common`" 23 | retries = 0 24 | status = None 25 | filename = picture.url.split('/')[-1] 26 | data = picture.download_picture() 27 | try: 28 | while retries < 3 and not status: 29 | if retries > 0: 30 | self.log.info('retrying...') 31 | data.seek(0) 32 | try: 33 | text = f"{picture.caption}\n\n{picture.source_url}" 34 | status = self.api.update_with_media(filename=filename, 35 | status=text, 36 | file=data) 37 | except tweepy.TweepError as e: 38 | self.log.error("Error when sending tweet: %s" % e) 39 | retries += 1 40 | if retries >= 3: 41 | raise 42 | else: 43 | time.sleep(5) 44 | finally: 45 | data.close(really=True) 46 | return status.id 47 | -------------------------------------------------------------------------------- /picdescbot/tumblr.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file implements tumblr-related functionality 4 | # Copyright (C) 2016 Elad Alfassa 5 | 6 | from tumblpy import Tumblpy 
7 | import tumblpy.exceptions 8 | import time 9 | from . import common 10 | from . import logger 11 | 12 | DEFAULT_PARAMS = {'type': 'photo', 'state': 'queue', 13 | 'native_inline_images': True} 14 | 15 | # using the goo.gl link in the template instead of the actual link, because 16 | # tumblr on mobile doesn't show pages when linked with the actual link - 17 | # it just sends the user to the blog's front page instead. 18 | # The goo.gl redirect makes the tumblr app think it's an external website, 19 | # and open it in the browser instead. 20 | 21 | TEMPLATE = "

{description}

" + \ 22 | "

@picdescbot" + \ 23 | " | " + \ 24 | "about this bot | " + \ 25 | "picture source

" + \ 26 | "

all text in this post is 100% computer-generated, including tags

" 27 | DEFAULT_TAGS = ['picdescbot', 'bot'] 28 | 29 | # All kinds of tags that should be filtered from the bot's post 30 | tag_blacklist = {'woman', 'black', 'white', 'man', 'body', 'large', 'tall', 31 | 'small', 'young', 'old', 'top', 'boy', 'girl'} 32 | 33 | 34 | def filter_tags(tags): 35 | filtered = [] 36 | for tag in tags: 37 | if tag not in tag_blacklist and not common.word_filter.blacklisted(tag): 38 | filtered.append(tag) 39 | return filtered 40 | 41 | 42 | class Client(object): 43 | name = "tumblr" 44 | 45 | def __init__(self, config): 46 | self.client = Tumblpy(config['consumer_key'], config['consumer_secret'], 47 | config['token'], config['token_secret']) 48 | self.blog_id = config['blog_id'] 49 | self.log = logger.get(__name__) 50 | 51 | def send(self, picture): 52 | "Post a post. `picture` is a `Result` object from `picdescbot.common`" 53 | 54 | post_text = TEMPLATE.format(description=picture.caption, 55 | source=picture.source_url) 56 | 57 | tags = DEFAULT_TAGS + filter_tags(picture.tags) 58 | 59 | params = {'caption': post_text, 60 | 'source': picture.url, 61 | 'tags': ','.join(tags)} 62 | params.update(DEFAULT_PARAMS) 63 | 64 | retries = 0 65 | post = None 66 | while retries < 3 and post is None: 67 | if retries > 0: 68 | self.log.info('retrying...') 69 | try: 70 | post = self.client.post("post", blog_url=self.blog_id, 71 | params=params) 72 | except tumblpy.exceptions.TumblpyError as e: 73 | self.log.error("Error when sending tumblr post: %s" % e) 74 | retries += 1 75 | if retries >= 3: 76 | raise 77 | else: 78 | time.sleep(5) 79 | return post['id'] 80 | -------------------------------------------------------------------------------- /bot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # coding=utf-8 3 | # picdescbot: a tiny twitter bot that tweets random pictures from wikipedia and their descriptions 4 | # Copyright (C) 2016 Elad Alfassa 5 | 6 | from __future__ import 
unicode_literals, absolute_import, print_function 7 | 8 | import argparse 9 | import configparser 10 | import os.path 11 | import picdescbot.common 12 | import picdescbot.logger 13 | import picdescbot.tumblr 14 | import picdescbot.twitter 15 | import sys 16 | import tweepy 17 | 18 | 19 | def main(): 20 | if sys.version_info.major < 3: 21 | print("This program does not support python2", file=sys.stderr) 22 | return 23 | 24 | # Start boring setup stuff (config file handling, etc) 25 | 26 | parser = argparse.ArgumentParser(description='Tweets pictures from wikipedia') 27 | parser.add_argument('config', metavar='config', nargs='?', type=str, 28 | default=None, help='Path to config file') 29 | parser.add_argument('--manual', action="store_true") 30 | parser.add_argument('--tumblr-only', action="store_true") 31 | parser.add_argument('--disable-tag-blacklist', action="store_true") 32 | parser.add_argument('--wikimedia-filename', nargs='?', type=str, 33 | default=None, help='Describe the specified picture from wikimedia, instead of a random one') 34 | args = parser.parse_args() 35 | config_file = "config.ini" 36 | if args.config is not None: 37 | config_file = os.path.expanduser(args.config) 38 | 39 | config = configparser.ConfigParser() 40 | config.read(config_file) 41 | 42 | if not args.tumblr_only: 43 | if (not config.has_section('twitter') or not 44 | config.has_option('twitter', 'consumer_key') or not 45 | config.has_option('twitter', 'consumer_secret')): 46 | 47 | print("You'll need to get a consumer key and a consumer secret" + 48 | "from https://dev.twitter.com/apps") 49 | key = input('Enter twitter consumer key: ') 50 | secret = input('Enter twitter consumer secret: ') 51 | if not config.has_section('twitter'): 52 | config.add_section('twitter') 53 | config.set('twitter', 'consumer_key', key) 54 | config.set('twitter', 'consumer_secret', secret) 55 | with open(config_file, 'w') as f: 56 | config.write(f) 57 | consumer_key = config['twitter']['consumer_key'] 58 | 
consumer_secret = config['twitter']['consumer_secret'] 59 | 60 | # twitter auth stuff 61 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 62 | if (config.has_option('twitter', 'token') and 63 | config.has_option('twitter', 'token_secret')): 64 | 65 | auth.set_access_token(config['twitter']['token'], 66 | config['twitter']['token_secret']) 67 | else: 68 | print("Please authenticate with twitter:") 69 | print(auth.get_authorization_url()) 70 | code = input('Enter authentication code from twitter: ').strip() 71 | auth.get_access_token(verifier=code) 72 | config.set('twitter', 'token', auth.access_token) 73 | config.set('twitter', 'token_secret', auth.access_token_secret) 74 | with open(config_file, 'w') as f: 75 | config.write(f) 76 | 77 | if (not config.has_section('mscognitive') or not 78 | config.has_option('mscognitive', 'api_key') or not 79 | config.has_option('mscognitive', 'endpoint')): 80 | apikey = input("Please enter your Microsoft Computer Vision API Key:") 81 | if not config.has_section('mscognitive'): 82 | config.add_section('mscognitive') 83 | config.set('mscognitive', 'api_key', apikey) 84 | endpoint = input("Please provide the endpoint for the Computer Vision API:") 85 | config.set('mscognitive', 'endpoint', endpoint) 86 | with open(config_file, 'w') as f: 87 | config.write(f) 88 | else: 89 | apikey = config['mscognitive']['api_key'] 90 | endpoint = config['mscognitive']['endpoint'] 91 | 92 | # end boring setup stuff 93 | 94 | if args.disable_tag_blacklist: 95 | picdescbot.common.tags_blacklist = {} 96 | args.manual = True # less filtering means manual mode is mandatory 97 | 98 | cvapi = picdescbot.common.CVAPIClient(apikey, endpoint) 99 | if args.tumblr_only and not config.has_section('tumblr'): 100 | print('tumblr is not configured') 101 | print("You'll neeed the following fields: ") 102 | print("consumer_key, consumer_secret, token, token_secret, blog_id") 103 | return 104 | 105 | log = picdescbot.logger.get('main') 106 | 107 | providers 
= [] 108 | if config.has_section('tumblr'): 109 | providers.append(picdescbot.tumblr.Client(config['tumblr'])) 110 | 111 | if not args.tumblr_only: 112 | providers.append(picdescbot.twitter.Client(config['twitter'])) 113 | 114 | post = False 115 | while not post: 116 | result = cvapi.get_picture_and_description(args.wikimedia_filename) 117 | if args.manual: 118 | action = None 119 | print(result.url) 120 | print(result.caption) 121 | while action not in ['y', 'n']: 122 | action = input("Post this? [y/n]: ") 123 | if action == "y": 124 | post = True 125 | else: 126 | post = True 127 | 128 | for provider in providers: 129 | status_id = provider.send(result) 130 | log.info("Sent {0}: {1} ({2})".format(provider.name, status_id, 131 | result.caption)) 132 | 133 | if __name__ == "__main__": 134 | main() 135 | -------------------------------------------------------------------------------- /picdescbot/common.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # picdescbot: a tiny twitter/tumblr bot that tweets random pictures from wikipedia and their descriptions 3 | # this file contains common basic functionality of the bot, such as getting the picture and description 4 | # Copyright (C) 2016-2017 Elad Alfassa 5 | 6 | from __future__ import unicode_literals, absolute_import, print_function 7 | 8 | from wordfilter import Wordfilter 9 | import json 10 | import re 11 | import requests 12 | import time 13 | import lxml.html 14 | from . import logger 15 | from io import BytesIO 16 | 17 | log = logger.get("common") 18 | 19 | MEDIAWIKI_API = "https://commons.wikimedia.org/w/api.php" 20 | 21 | HEADERS = {"User-Agent": "picdescbot, http://github.com/elad661/picdescbot"} 22 | 23 | supported_formats = re.compile('\.(png|jpe?g|gif)$', re.I) 24 | word_filter = Wordfilter() 25 | 26 | # I really don't want the bot to show this kind of imagery! 
27 | word_filter.add_words(['nazi', 'hitler', 'reich']) 28 | 29 | # I can't trust Microsoft's algorithm to not be racist, so I should probably 30 | # make the bot avoid posting images with the following words in them. 31 | # I'm not using wordfilter here because it would over-filter in some cases. 32 | # also filter "gun" because this is not the kind of content I want the bot to post 33 | # This is matched only against the caption generated by CVAPI. 34 | extra_filter = {'ape', 'apes', 'monkey', 'monkeys', 'gun'} 35 | 36 | # Blacklisted phrases (instead of words) to blacklist certain phrases 37 | # in the wikimedia description 38 | blacklisted_phrases = {'comic strip', 'logo', 'biblical illustration', 'church', 39 | 'historical document', 'donald trump'} 40 | 41 | # Blacklist some categories, just in case. These are matched on a substring 42 | # basis, against the page's categories and the titles of the wikipages using 43 | # the picture. 44 | category_blacklist = ['september 11', 'hitler', 'nazi', 'antisemit', 'libel', 45 | 'apartheid', 'racism', 'lynching', 'cartoons', 46 | 'holocaust', 'auschwitz', 'stereotypes', 'flags', 'porn', 47 | 'homophobia', 'transphobia', 'logos', 48 | 'scans from google books', 'little nemo', 49 | 'stolperstein', 'songbird specimens', 'terror', 50 | 'bible illustrations', 'jesuit symbols', 51 | 'christian symbols', 'symbols of religion', 52 | 'symbols of islam', 'jewish symbols', 'pistols', 53 | 'corpse', 'victim', 'ultrasound', 'donald trump', 54 | 'pascual marín'] 55 | 56 | # Gender neutralization helps prevent accidental transphobic juxtapositions 57 | # which can occur when CVAPI uses gendered words in the description, but their 58 | # gender detection is wrong. Computers shouldn't try to detect gender, and 59 | # always be neautral. You can't know someone's gender just by how they look! 
# Mapping of gendered words to their neutral replacements. An empty-string
# value means the word is dropped from the caption entirely.
gendered_words = {'woman': 'person',
                  'man': 'person',
                  'women': 'people',
                  'man\'s': 'person\'s',
                  'woman\'s': 'person\'s',
                  'mans': 'persons',
                  'womans': 'persons',
                  'men': 'people',
                  'guy': 'person',
                  'boy': 'person',
                  'girl': 'person',
                  'boys': 'people',
                  'girls': 'people',
                  'lady': 'person',
                  'ladies': 'people',
                  'gentleman': 'person',
                  'gentlemen': 'people',
                  'female': '',
                  'male': '',
                  'she': 'they',
                  # It's probably more likely to say "woman and her phone" than
                  # "someone gives a phone to her", so "their" is probably a
                  # better fit here. Would need more complex parsing to know
                  # for sure.
                  'her': 'their',
                  'hers': 'theirs',
                  'herself': 'themself',
                  'he': 'they',
                  'him': 'them',
                  # It's more likely to see "man and his phone" than "this
                  # phone is his", so "their" is better here than "theirs"
                  'his': 'their',
                  'himself': 'themself'}


def gender_neutralize(phrase):
    """Replace gendered words in the phrase with neutral ones.

    The phrase is lowercased in the process; returns the neutralized phrase.
    """
    neutralized = []
    for word in phrase.lower().split():
        if word in gendered_words:
            word = gendered_words[word]
        if word != '':
            neutralized.append(word)
    neutralized = ' '.join(neutralized)
    if neutralized != phrase:
        log.info('Gender neutralized: "{0}" => "{1}"'.format(phrase, neutralized))
    return neutralized


# Tags (from the CV API) that disqualify a picture entirely.
tags_blacklist = {'text', 'screenshot', 'military', 'church'}


def tag_blacklisted(tags):
    """Return True if any of the given tags is in the tag blacklist."""
    for tag in tags:
        if tag in tags_blacklist:
            return True
    return False


def is_blacklisted(caption):
    """Check caption for forbidden words; True means it must be discarded."""
    if "a suit and tie" in caption:
        return True
    if word_filter.blacklisted(caption):
        return True
    # extra_filter is matched on whole words only, to avoid over-filtering
    for word in caption.split():
        if word in extra_filter:
            return True
    return False


def remove_html_tags(text):
    """Remove all HTML tags (and their attributes) from a string."""
    return ' '.join(lxml.html.fromstring(text).itertext())


def log_discarded(url, reason, description=None):
    """Log a discarded picture, with an optional description for context."""
    line = "Discarded {0} because of {1}".format(url, reason)
    if description is not None:
        line += ' - "{0}"'.format(description)
    log.warning(line)


def get_picture(filename=None):
    """Get a picture from Wikimedia Commons.

    A random picture is used when `filename` is not specified.
    Returns the picture's `imageinfo` dict, or None when the result is
    unusable (not a bitmap, too small, unsupported format, or caught by one
    of the blacklists).
    """
    params = {"action": "query",
              "prop": "imageinfo|categories|globalusage",
              "iiprop": "url|size|extmetadata|mediatype",
              "iiurlheight": "1080",
              "format": "json"}
    if filename is None:
        params['generator'] = 'random'
        params['grnnamespace'] = '6'  # namespace 6 = "File:"
    else:
        params['titles'] = 'File:%s' % filename

    response = requests.get(MEDIAWIKI_API,
                            params=params,
                            headers=HEADERS).json()
    page = list(response['query']['pages'].values())[0]  # This API is ugly
    if 'imageinfo' not in page:
        # e.g. a missing or deleted file: nothing usable here
        return None
    imageinfo = page['imageinfo'][0]
    url = imageinfo['url']
    extra_metadata = imageinfo['extmetadata']

    # check that the file is actually a picture
    if imageinfo['mediatype'] != "BITMAP":
        return None

    # Make sure the picture is big enough
    if imageinfo['width'] <= 50 or imageinfo['height'] <= 50:
        return None

    # Make sure the format is supported
    if not supported_formats.search(url):
        return None

    # We got a picture, now let's verify we can use it.
    if word_filter.blacklisted(page['title']):  # Check file name for bad words
        log_discarded(url, 'badword in page title: "{0}"'.format(page['title']))
        return None
    # Check picture title for bad words
    if word_filter.blacklisted(extra_metadata['ObjectName']['value']):
        log_discarded(url, 'badword in picture title: "{0}"'.format(extra_metadata['ObjectName']['value']))
        return None
    # Check restrictions for more bad words
    if word_filter.blacklisted(extra_metadata['Restrictions']['value']):
        log_discarded(url, 'badword in restrictions: "{0}"'.format(extra_metadata['Restrictions']['value']))
        return None

    # Check file description for bad words and blacklisted phrases
    if 'ImageDescription' in extra_metadata:
        cleaned_description = remove_html_tags(extra_metadata['ImageDescription']['value'])
        if word_filter.blacklisted(cleaned_description):
            log_discarded(url, 'badword in image description: "{0}"'.format(cleaned_description))
            return None

        for phrase in blacklisted_phrases:
            if phrase in cleaned_description.lower().strip():
                log_discarded(url, 'blacklisted phrase "{0}" found in description "{1}"'.format(phrase, cleaned_description))
                return None

    # The mediawiki API is awful: there's another list of categories in
    # extmetadata which is not the same as the one returned when asking for
    # "categories", so both have to be checked.
    extra_categories = extra_metadata['Categories']['value'].lower()

    for blacklisted_category in category_blacklist:
        for category in page.get('categories', []):
            if blacklisted_category in category['title'].lower():
                log_discarded(url, 'blacklisted category "{0}"'.format(category['title']))
                return None

        if blacklisted_category in extra_categories:
            log_discarded(url, 'blacklisted category "{0}" (in extra)'.format(blacklisted_category))
            return None

    # TODO check parent categories for each category in metadata,
    # and compare them against the blacklist too. This will require
    # extra API calls

    # if the picture is used in any wikipage with unwanted themes, we
    # probably don't want to use it.
    for wikipage in page.get('globalusage', []):
        if word_filter.blacklisted(wikipage['title'].lower()):
            log_discarded(url, 'page usage "{0}"'.format(wikipage['title']))
            return None
        for blacklisted_category in category_blacklist:
            if blacklisted_category in wikipage['title']:  # substring matching
                log_discarded(url, 'page usage "{0}"'.format(wikipage['title']))
                return None
    return imageinfo


class CVAPIClient(object):
    "Microsoft Cognitive Services Client"

    def __init__(self, apikey, endpoint):
        self.apikey = apikey
        self.endpoint = endpoint + '/analyze'

    def describe_picture(self, url):
        """Get a description for a picture using Microsoft Cognitive Services.

        Returns the parsed JSON response, or None when all retries failed.
        """
        params = {'visualFeatures': 'Description,Adult'}
        # named `body` rather than `json` so the stdlib json module isn't shadowed
        body = {'url': url}
        headers = {'Content-Type': 'application/json',
                   'Ocp-Apim-Subscription-Key': self.apikey}

        result = None
        retries = 0

        while retries < 15 and not result:
            response = requests.post(self.endpoint, json=body, params=params,
                                     headers=headers)
            if response.status_code == 429:  # rate-limited: short sleep, retry
                log.error("Error from mscognitive: %s" % (response.json()))
                retries += 1
                if retries >= 15:
                    # bugfix: this branch was unreachable before (the guard
                    # checked `retries < 15` prior to incrementing)
                    log.error('failed after retrying!')
                else:
                    time.sleep(2)
            elif response.status_code in (200, 201):
                result = response.json() if response.content else None
            else:
                log.error("Error code: %d" % (response.status_code))
                log.error("url: %s" % url)
                try:
                    log.error(response.json())
                except Exception:  # response body isn't valid JSON
                    log.error(response.text)
                retries += 1
                sleep = 20 + retries * 4  # progressive back-off
                log.info("attempt: {0}, sleeping for {1}".format(retries, sleep))
                time.sleep(sleep)

        return result

    def get_picture_and_description(self, filename=None, max_retries=20):
        """Get a picture and a description.

        Retries until a usable result is produced or `max_retries` is
        reached, then raises Exception.
        """
        pic = None
        retries = 0
        while retries <= max_retries:
            while pic is None:
                pic = get_picture(filename)
                if pic is None:
                    # We got a bad picture; wait a bit to be polite to the API server
                    time.sleep(1)
            url = pic['url']
            # Use a scaled-down image if the original is too big for CVAPI
            if pic['size'] > 3000000 or pic['width'] > 8192 or pic['height'] > 8192:
                url = pic['thumburl']

            result = self.describe_picture(url)

            if result is not None:
                description = result['description']
                adult = result['adult']
                if not adult['isAdultContent'] and not adult['isRacyContent']:  # no nudity and such
                    if len(description['captions']) > 0:
                        caption = description['captions'][0]['text']
                        caption = gender_neutralize(caption)
                        if not is_blacklisted(caption):
                            if not tag_blacklisted(description['tags']):
                                return Result(caption,
                                              description['tags'], url,
                                              pic['descriptionshorturl'])
                            else:
                                log_discarded(url, "tag blacklist", caption)
                                log.warning('tags: %s' % description['tags'])
                        else:
                            log_discarded(url, "caption blacklist", caption)
                    else:
                        log.warning("No caption for url: {0}".format(url))
                else:
                    log_discarded(url, "adult content", description['captions'])
            retries += 1
            log.warning("Not good, retrying...")
            pic = None
            time.sleep(3)  # sleep to be polite to the API servers

        raise Exception("Maximum retries exceeded, no good picture")


class NonClosingBytesIO(BytesIO):
    """Like BytesIO, but doesn't close so easily.

    To prevent tweepy from closing the picture on error, this class requires
    a boolean parameter to be passed to close() to actually close the buffer.
    """

    def close(self, really=False):
        """Close the BytesIO object, but only if you're really sure."""
        if really:
            return super().close()


class Result(object):
    "Represents a picture and its description"

    def __init__(self, caption, tags, url, source_url):
        self.caption = caption      # gender-neutralized CVAPI caption
        self.tags = tags            # CVAPI tags (already past tag_blacklisted)
        self.url = url              # direct image URL
        self.source_url = source_url  # wikimedia description page short URL

    def download_picture(self):
        """Download the image; returns a NonClosingBytesIO object.

        Retries up to 20 times; raises Exception when every attempt fails.
        """
        retries = 0
        log.info("downloading " + self.url)
        while retries <= 20:
            if retries > 0:
                log.info('Trying again...')

            try:
                response = requests.get(self.url, headers=HEADERS)
            except requests.exceptions.RequestException as e:
                log.exception(e)
                response = None

            if response is not None and response.status_code == 200:
                return NonClosingBytesIO(response.content)
            if response is not None:
                # bugfix: the original concatenated str + int here (TypeError)
                # and dereferenced response.status_code even when response was
                # None after a network error
                log.error("Fetching picture failed: %d" % response.status_code)
            retries += 1
            time.sleep(3)
        log.error("Maximum retries exceeded when downloading a picture")
        raise Exception("Maximum retries exceeded when downloading a picture")
--------------------------------------------------------------------------------