├── .gitignore
├── README.md
├── scan_episodes.py
├── import_shows.py
├── export_data.py
├── schema.sql
└── podcast_lib.py

/.gitignore:
--------------------------------------------------------------------------------
build_lists/sources
config.py
data
checkpoint.log
*.py[cod]
.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Podcasts Dataset
Scripts to compile a large dataset of podcasts and episodes for analysis.

## Podcast List Sources
* http://www.allpodcasts.com/Tools/OPMLViewer.aspx?url=http%3a%2f%2fwww.digitalpodcast.com%2fopml%2fdigitalpodcast.opml
* http://www.publicradiofan.com/podcasts.html
* http://newtimeradio.com/
* https://rss.itunes.apple.com/us/?urlDesc=%2Fgenerator
* https://www.podcastpedia.org/categories
* http://www.godcast1000.com/
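
## Configuration
All scripts import their database connection string from a `config.py` that is deliberately kept out of version control (see `.gitignore`). A minimal sketch, assuming a local MySQL instance loaded with `schema.sql` (host, user, and password are placeholders):

```python
# config.py -- not checked in; values below are illustrative
conn_string = 'mysql://user:password@localhost/podcasts?charset=utf8'
```

## Running
The run order below is inferred from how the scripts feed each other, not prescribed anywhere in the code:
1. Load `schema.sql` into MySQL to create the `podcasts` schema, tables, and the `episodes_flat` view.
2. Run `import_shows.py` to populate `shows` from the feed URL lists under `data/`.
3. Run `scan_episodes.py` to parse each stored feed (served from the on-disk cache where possible) and populate `episodes`.
4. Run `export_data.py` to dump both tables to `data/shows.csv` and `data/episodes.csv`.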
--------------------------------------------------------------------------------
/scan_episodes.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from __future__ import print_function
import sys
from sqlalchemy import create_engine, select, MetaData, Table, Column, String, Integer, DateTime
from config import conn_string
from podcast_lib import PodcastLib

engine = create_engine(conn_string)
db_conn = engine.connect()
metadata = MetaData(engine)

shows_table = Table('shows', metadata,
    Column('id', String(36), primary_key=True),
    Column('feed_url', String(255)),
)

episodes_table = Table('episodes', metadata,
    Column('id', String(36), primary_key=True),
    Column('show_id', String(36)),
    Column('title', String(255)),
    Column('link', String(255)),
    Column('guid', String(255)),
    Column('subtitle', String()),
    Column('description', String()),
    Column('summary', String()),
    Column('author', String(255)),
    Column('audio_url', String(255)),
    Column('audio_file_size', Integer),
    Column('audio_mime_type', String(50)),
    Column('category', String(200)),
    Column('explicit', Integer),
    Column('length', Integer),
    Column('pub_date', DateTime()),
    Column('keywords', String(800))
)

shows = db_conn.execute(select([shows_table]))

# Re-fetch every stored feed and persist its episode items. Feeds already
# downloaded by import_shows.py are served from the on-disk cache inside
# PodcastLib.process_feed, so this pass is mostly parsing.
for show in shows:
    try:
        obj = PodcastLib.process_feed(show['feed_url'].encode('utf-8'), process_episodes=True)
        if obj:
            for episode in obj['episodes']:
                PodcastLib.save_to_db(db_conn, episodes_table, episode)

    except KeyboardInterrupt:
        sys.exit(0)
--------------------------------------------------------------------------------
/import_shows.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from __future__ import print_function
import glob, sys, os
from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, DateTime
from config import conn_string
from podcast_lib import PodcastLib

engine = create_engine(conn_string)
db_conn = engine.connect()
metadata = MetaData(engine)

shows_table = Table('shows', metadata,
    Column('id', String(36), primary_key=True),
    Column('feed_url', String(255)),
    Column('title', String(255)),
    Column('subtitle', String()),
    Column('description', String()),
    Column('summary', String()),
    Column('author', String(250)),
    Column('email', String(255)),
    Column('link', String(255)),
    Column('language', String(10)),
    Column('explicit', Integer),
    Column('image', String(500)),
    Column('category', String(200)),
    Column('subcategory', String(200)),
    Column('created_at', DateTime()),
    Column('last_build_date', DateTime())
)

if __name__ == '__main__':
    # Resume support: checkpoint.log holds the URL a previous run died on.
    # Missing file (e.g. a first run) means start from the top.
    checkpoint = ''
    if os.path.exists('checkpoint.log'):
        with open('checkpoint.log', 'r') as chk:
            checkpoint = chk.read()

    for f in glob.glob('data/*.txt'):
        with open(f) as fh:
            for line in fh.read().split("\n"):

                # Skip ahead until the checkpointed URL is reached, then
                # resume from that line inclusive (it failed last time).
                if checkpoint != '' and checkpoint.strip() != line.strip():
                    continue
                else:
                    checkpoint = ''

                if line == '':
                    continue

                try:
                    obj = PodcastLib.process_feed(line)
                    if obj:
                        PodcastLib.save_to_db(db_conn, shows_table, obj)

                except KeyboardInterrupt:
                    with open('checkpoint.log', 'w') as chk:
                        chk.write(line)
                    sys.exit(0)

                except:
                    # Record where we died before re-raising, so the next
                    # run can resume from this URL.
                    with open('checkpoint.log', 'w') as chk:
                        chk.write(line)

                    raise
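
# Input format note (illustrative): each data/*.txt file is a plain-text list
# of RSS feed URLs, one per line, e.g.
#
#   http://feeds.example.com/show-a/rss
#   http://feeds.example.com/show-b/rss
#
# The data/ directory is git-ignored (see .gitignore), so these lists are
# assembled locally from the sources in the README.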
--------------------------------------------------------------------------------
/export_data.py:
--------------------------------------------------------------------------------
import csv, io, cStringIO, codecs
from datetime import datetime
from sqlalchemy import create_engine, Table, MetaData
from sqlalchemy.orm import sessionmaker
from config import conn_string

def page_query(q):
    """ Page through a query in chunks of 1000 rows so the server never has
    to materialize the full result set at once. """
    offset = 0
    while True:
        r = False
        for elem in q.limit(1000).offset(offset):
            r = True
            yield elem
        offset += 1000
        if not r:
            break

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    (This is the UnicodeWriter recipe from the Python 2 csv documentation.)
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

def write_row(row, writer, key_index):
    """ Serialize one result row to strings, in key_index column order. """
    row_dict = dict(zip(row.keys(), row))

    row_arr = []
    for k in key_index:
        v = row_dict[k]
        if v is None:
            row_arr.append('')
        elif type(v) == datetime:
            row_arr.append(v.strftime('%Y-%m-%dT%H:%M:%S%Z'))
        elif type(v) in (int, float, long):
            row_arr.append(str(v))
        else:
            row_arr.append(v)

    return writer.writerow(row_arr)

engine = create_engine(conn_string)
db_conn = engine.connect()
metadata = MetaData(engine)
Session = sessionmaker(bind=engine)
session = Session()

shows_table = Table('shows', metadata, autoload=True)
shows_columns = ['id', 'feed_url', 'title', 'subtitle', 'description', 'summary', 'author', 'email', 'link', 'language', 'explicit', 'image', 'category', 'subcategory', 'created_at', 'last_build_date']

episodes_table = Table('episodes', metadata, autoload=True)
episodes_columns = ['id', 'show_id', 'title', 'link', 'guid', 'subtitle', 'description', 'summary', 'author', 'audio_url', 'audio_file_size', 'audio_mime_type', 'category', 'explicit', 'length', 'pub_date', 'keywords']

query = session.query(shows_table)
with io.open('data/shows.csv', 'wb') as fh:
    writer = UnicodeWriter(fh, encoding='utf-8')
    writer.writerow(shows_columns)

    for row in page_query(query):
        write_row(row, writer, shows_columns)

query = session.query(episodes_table)
with io.open('data/episodes.csv', 'wb') as fh:
    writer = UnicodeWriter(fh, encoding='utf-8')
    writer.writerow(episodes_columns)

    for row in page_query(query):
        write_row(row, writer, episodes_columns)
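
# Quick sanity check (illustrative, not part of the export itself): read back
# the header row of each file that was just written.
for path in ('data/shows.csv', 'data/episodes.csv'):
    with open(path, 'rb') as fh:
        print('%s -> %s' % (path, fh.readline().strip()))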
--------------------------------------------------------------------------------
/schema.sql:
--------------------------------------------------------------------------------
-- MySQL Script generated by MySQL Workbench
-- Thu Nov 24 21:45:42 2016
-- Model: New Model    Version: 1.0
-- MySQL Workbench Forward Engineering

SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL,ALLOW_INVALID_DATES';

-- -----------------------------------------------------
-- Schema podcasts
-- -----------------------------------------------------
CREATE SCHEMA IF NOT EXISTS `podcasts` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ;
USE `podcasts` ;

-- -----------------------------------------------------
-- Table `podcasts`.`shows`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `podcasts`.`shows` ;

CREATE TABLE IF NOT EXISTS `podcasts`.`shows` (
  `id` VARCHAR(36) NOT NULL COMMENT '',
  `feed_url` VARCHAR(255) NOT NULL COMMENT '',
  `title` VARCHAR(255) NOT NULL COMMENT '',
  `subtitle` TEXT NULL COMMENT '',
  `description` TEXT NULL COMMENT '',
  `summary` TEXT NULL COMMENT '',
  `author` VARCHAR(512) NULL COMMENT '',
  `email` VARCHAR(255) NULL COMMENT '',
  `link` VARCHAR(255) NOT NULL COMMENT '',
  `language` VARCHAR(10) NOT NULL DEFAULT 'en-us' COMMENT '',
  `explicit` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '',
  `image` VARCHAR(255) NULL COMMENT '',
  `category` VARCHAR(200) NULL COMMENT '',
  `subcategory` VARCHAR(200) NULL COMMENT '',
  `created_at` DATETIME NOT NULL COMMENT '',
  `last_build_date` DATETIME NOT NULL COMMENT '',
  PRIMARY KEY (`id`),
  UNIQUE INDEX `feed_url_UNIQUE` (`feed_url` ASC))
ENGINE = InnoDB;


-- -----------------------------------------------------
-- Table `podcasts`.`episodes`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `podcasts`.`episodes` ;

CREATE TABLE IF NOT EXISTS `podcasts`.`episodes` (
  `id` VARCHAR(36) NOT NULL COMMENT '',
  `show_id` VARCHAR(36) NOT NULL COMMENT '',
  `title` VARCHAR(1024) NULL COMMENT '',
  `link` VARCHAR(555) NULL COMMENT '',
  `guid` VARCHAR(512) NULL COMMENT 'Theoretically a GUID, but not enforced by many platforms so not unique',
  `subtitle` TEXT NULL COMMENT '',
  `description` VARCHAR(255) NULL COMMENT '',
  `summary` TEXT NULL COMMENT '',
  `author` VARCHAR(255) NULL COMMENT '',
  `audio_url` VARCHAR(1024) NOT NULL COMMENT '',
  `audio_file_size` BIGINT NOT NULL COMMENT '',
  `audio_mime_type` VARCHAR(255) NOT NULL DEFAULT 'audio/mp3' COMMENT '',
  `category` VARCHAR(200) NULL COMMENT '',
  `explicit` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '',
  `length` INT NOT NULL COMMENT '',
  `pub_date` DATETIME NOT NULL COMMENT '',
  `keywords` VARCHAR(1800) NULL COMMENT '',
  PRIMARY KEY (`id`),
  INDEX `show_fk_idx` (`show_id` ASC),
  UNIQUE INDEX `mp3_url_UNIQUE` (`audio_url` ASC),
  CONSTRAINT `show_fk`
    FOREIGN KEY (`show_id`)
    REFERENCES `podcasts`.`shows` (`id`)
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = InnoDB;

USE `podcasts` ;

-- -----------------------------------------------------
-- View `podcasts`.`episodes_flat`
-- -----------------------------------------------------
DROP VIEW IF EXISTS `podcasts`.`episodes_flat` ;
USE `podcasts`;
CREATE OR REPLACE VIEW `episodes_flat` AS
SELECT
  episodes.*,
  shows.title as show_title,
  shows.subtitle as show_subtitle,
  shows.description as show_description,
  shows.summary as show_summary,
  shows.author as show_author,
  shows.email as show_email,
  shows.link as show_link,
  shows.language,
  shows.explicit as show_explicit,
  shows.image,
  shows.category as show_category,
  shows.subcategory as show_subcategory,
  shows.feed_url as feed_url
FROM episodes
INNER JOIN shows ON episodes.show_id=shows.id;

SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
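
-- Example query (illustrative): episodes_flat joins each episode to its show,
-- so show context can be selected alongside episode fields, e.g.:
--
--   SELECT show_title, title, pub_date
--   FROM episodes_flat
--   ORDER BY pub_date DESC
--   LIMIT 10;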
--------------------------------------------------------------------------------
/podcast_lib.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import requests, os, datetime, uuid, re, hashlib, sqlalchemy
import xml.etree.ElementTree as ET

class PodcastLib:
    # Feed-level tags copied straight into the show object.
    show_fields = [
        'feed_url',
        'title',
        'subtitle',
        'description',
        'summary',
        'author',
        'email',
        'link',
        'language'
    ]

    # Item-level tags copied straight into the episode object.
    episode_fields = [
        'title',
        'link',
        'guid',
        'subtitle',
        'description',
        'summary',
        'author',
        'category',
        'keywords'
    ]

    tmp_dir = '/tmp/podcasts/feeds/'
    # iTunes podcast namespace; tag names are compared lower-cased.
    dtd = '{http://www.itunes.com/DTDs/Podcast-1.0.dtd}'.lower()

    @staticmethod
    def process_episode(node):
        """
        Parse an XML node into an episode object.

        Params:
            node -- An item node from a parsed XML feed.

        Returns:
            Episode object.
        """

        obj = {}
        for child in node:
            # Strip the iTunes (or any other) namespace from the tag name.
            tag = child.tag.lower().replace(PodcastLib.dtd, '').split('}')[-1]
            if tag in PodcastLib.episode_fields:
                obj[tag] = child.text.strip() if child.text else ''

            elif tag == 'enclosure':
                if 'url' in child.attrib:
                    obj['audio_url'] = child.attrib['url']
                if 'length' in child.attrib:
                    obj['audio_file_size'] = child.attrib['length'].replace(',', '')
                if 'type' in child.attrib:
                    obj['audio_mime_type'] = child.attrib['type']

            elif tag == 'explicit':
                obj['explicit'] = (child.text.strip().lower() == 'yes') if child.text is not None else 0

            elif tag == 'pubdate':
                # RFC 822 dates; a trailing numeric timezone such as +0000 is
                # dropped before parsing, anything else (e.g. 'GMT') falls
                # through to None.
                try:
                    if re.search('[+-][0-9]+$', child.text.strip()):
                        dt = datetime.datetime.strptime(child.text.strip()[0:-5].strip(), '%a, %d %b %Y %H:%M:%S')
                    else:
                        dt = datetime.datetime.strptime(child.text.strip(), '%a, %d %b %Y %H:%M:%S')
                except (ValueError, AttributeError):
                    dt = None
                obj['pub_date'] = dt

            elif tag == 'duration':
                if child.text and ':' in child.text:
                    # [HH:]MM:SS -- reverse the parts so index i carries
                    # weight 60**i (seconds, minutes, hours).
                    lengths = child.text.split(':')[::-1]
                    duration = 0

                    for i in range(0, len(lengths)):
                        try:
                            duration += (60 ** i) * int(float(lengths[i]))
                        except (ValueError, TypeError):
                            pass
                else:
                    try:
                        duration = int(child.text)
                    except (ValueError, TypeError):
                        duration = None

                obj['length'] = duration

        if 'length' not in obj:
            obj['length'] = None

        if type(obj['length']) is str and ':' in obj['length']:
            obj['length'] = None

        # Truncate free-text fields to their column sizes.
        if 'description' in obj:
            obj['description'] = obj['description'][:255]

        if 'author' in obj:
            obj['author'] = obj['author'][:255]

        if 'keywords' in obj:
            # Normalize comma spacing and truncate to the keywords column
            # size (VARCHAR(1800) in schema.sql).
            obj['keywords'] = ','.join([r.strip() for r in obj['keywords'].split(',')])[:1800]

        if 'audio_file_size' in obj:
            if type(obj['audio_file_size']) is str:
                try:
                    if obj['audio_file_size'] and int(obj['audio_file_size']) < 0:
                        obj['audio_file_size'] = ''
                except ValueError:
                    obj['audio_file_size'] = ''

        return obj
    @staticmethod
    def parse_xml(xml, process_episodes=False):
        """
        Parse an XML string into a show object.

        Params:
            xml -- XML string to parse.
            process_episodes -- If true, process episode items and add them to the show object.

        Returns:
            A show object.
        """

        parser = ET.XMLParser(encoding="utf-8")
        tree = ET.fromstring(xml, parser=parser)
        obj = {}

        for channel in tree.findall('channel'):
            for child in channel:
                tag = child.tag.lower().replace(PodcastLib.dtd, '').split('}')[-1]

                if tag in PodcastLib.show_fields:
                    obj[tag] = child.text.strip() if child.text else ''

                elif tag == 'lastbuilddate':
                    try:
                        if re.search('[+-][0-9]+$', child.text.strip()):
                            dt = datetime.datetime.strptime(child.text.strip()[0:-5].strip(), '%a, %d %b %Y %H:%M:%S')
                        else:
                            dt = datetime.datetime.strptime(child.text.strip(), '%a, %d %b %Y %H:%M:%S')
                    except (ValueError, AttributeError):
                        dt = datetime.datetime.now()

                    obj['last_build_date'] = dt.strftime('%Y-%m-%d %H:%M:%S')

                elif tag == 'owner':
                    if child.find(PodcastLib.dtd + 'email') is not None:
                        obj['email'] = child.find(PodcastLib.dtd + 'email').text

                elif tag == 'category':
                    # First category tag wins; the second becomes the subcategory.
                    tag = 'category' if 'category' not in obj else 'subcategory'
                    if child.attrib and 'text' in child.attrib:
                        obj[tag] = child.attrib['text']
                    elif type(child.text) == str:
                        obj[tag] = child.text.strip()

                elif tag == 'explicit':
                    obj['explicit'] = (child.text.strip().lower() == 'yes') if child.text is not None else 0

                elif tag == 'image':
                    # Plain RSS <image><url>, an itunes:image href attribute,
                    # or bare text, in that order of preference.
                    if child.find('url') is not None:
                        obj['image'] = child.find('url').text
                    elif type(child.attrib) == dict and 'href' in child.attrib:
                        obj['image'] = child.attrib['href']
                    elif child.text and child.text.strip() != '':
                        obj['image'] = child.text.strip()

        obj['created_at'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if 'last_build_date' not in obj:
            obj['last_build_date'] = obj['created_at']

        if 'language' in obj:
            obj['language'] = obj['language'].lower().split(',')[0][0:10]

        # Normalize attribute dicts to plain strings before truncating, so
        # the slices below never hit a non-string value.
        if 'image' in obj and type(obj['image']) != str:
            if type(obj['image']) == dict and 'href' in obj['image']:
                obj['image'] = obj['image']['href']
            else:
                obj['image'] = ''

        if 'link' in obj and type(obj['link']) != str:
            if type(obj['link']) == dict and 'href' in obj['link']:
                obj['link'] = obj['link']['href']
            else:
                obj['link'] = ''

        # Truncate fields to their column sizes.
        if 'email' in obj and obj['email']:
            obj['email'] = obj['email'][0:255]

        if 'category' in obj and obj['category']:
            obj['category'] = obj['category'][0:200]

        if 'author' in obj and obj['author']:
            obj['author'] = obj['author'][0:250]

        if 'title' in obj and obj['title']:
            obj['title'] = obj['title'][0:255]

        if 'image' in obj and obj['image']:
            obj['image'] = obj['image'][0:255]

        if process_episodes:
            obj['episodes'] = []

            for channel in tree.findall('channel'):
                for child in channel.findall('item'):
                    try:
                        obj['episodes'].append(PodcastLib.process_episode(child))
                    except:
                        print('could not process episode')
                        raise

        return obj

    @staticmethod
    def save_to_file(filename, contents):
        """
        Save file contents to a tmp file.

        Params:
            filename -- Local file to write to.
            contents -- Binary file contents.
        """

        # Create the per-feed cache directory (and any missing parents).
        if not os.path.exists(filename.rsplit('/', 1)[0]):
            os.makedirs(filename.rsplit('/', 1)[0])

        with open(filename, 'wb') as fh:
            fh.write(contents)

        return True

    @staticmethod
    def process_feed(url, process_episodes=False):
        """
        Download and process a feed URL.

        Params:
            url -- RSS feed URL to process.
            process_episodes -- Flag to determine if the parser will parse episode items.
                                Default false.

        Returns:
            A show object, or False on failure.
        """

        try:
            # Deterministic show id derived from the URL, plus a cache path
            # derived from its hash, so repeat runs hit the local copy.
            feed_id = str(uuid.uuid3(uuid.NAMESPACE_URL, url))
            tmp_location = PodcastLib.tmp_dir + feed_id + '/' + hashlib.sha256(url.lower()).hexdigest() + '.xml'

            if os.path.exists(tmp_location):
                print('cache hit')
                with open(tmp_location, 'rb') as fh:
                    contents = fh.read()
            else:
                req = requests.get(url, timeout=30)
                contents = req.content
                PodcastLib.save_to_file(tmp_location, contents)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.TooManyRedirects,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidSchema,
                UnicodeDecodeError):
            print('Connection error')
            return False

        # On a cache hit `req` is undefined, but the short-circuit on the
        # cached file keeps this check safe.
        if os.path.exists(tmp_location) or req.status_code == 200:
            try:
                obj = PodcastLib.parse_xml(contents, process_episodes)
                obj['feed_url'] = url.lower()
                obj['id'] = feed_id

                if 'episodes' in obj:
                    for i in range(0, len(obj['episodes'])):
                        obj['episodes'][i]['show_id'] = obj['id']
                        # Prefer a deterministic episode id derived from the
                        # audio URL and file size; fall back to a random UUID.
                        try:
                            digest = hashlib.sha256(obj['episodes'][i]['audio_url'].decode('utf-8').lower() + obj['episodes'][i]['audio_file_size'].decode('UTF-8')).hexdigest()
                            ep_id = str(uuid.uuid3(uuid.NAMESPACE_URL, digest))
                        except (KeyError, UnicodeEncodeError, UnicodeDecodeError):
                            ep_id = str(uuid.uuid4())
                        obj['episodes'][i]['id'] = ep_id

                return obj

            except ET.ParseError as e:
                print('Bad XML document, parse failed.')
                print(e)

        return False

    @staticmethod
    def save_to_db(db_conn, tab, obj):
        """
        Save an object to the database.

        Params:
            db_conn -- SQLAlchemy connection.
            tab -- Table definition.
            obj -- Data to insert.
        """
        ins = tab.insert(values=obj)
        try:
            db_conn.execute(ins)
            return True
        except sqlalchemy.exc.IntegrityError:
            print('Non-unique feed, skipping')
        except sqlalchemy.exc.CompileError:
            print('Table error')
            print(ins.compile().params)
            raise
        except sqlalchemy.exc.OperationalError:
            print('Missing required field')
        except:
            print('Unexpected exception')
            print(ins.compile().params)
            raise
        return False
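
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; the default feed URL is a placeholder): fetch
# and parse a single feed, then print the show title and its episode titles.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import sys
    url = sys.argv[1] if len(sys.argv) > 1 else 'http://example.com/feed.xml'
    show = PodcastLib.process_feed(url, process_episodes=True)
    if show:
        print(show.get('title'))
        for ep in show.get('episodes', []):
            print(' - ' + (ep.get('title') or ''))
--------------------------------------------------------------------------------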