├── .gitignore
├── README.md
├── scan_episodes.py
├── import_shows.py
├── export_data.py
├── schema.sql
└── podcast_lib.py

/.gitignore:
--------------------------------------------------------------------------------
build_lists/sources
config.py
data
checkpoint.log
*.py[cod]
.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Podcasts Dataset
Scripts to compile a large dataset of podcasts and episodes for analysis.

## Podcast List Sources
* http://www.allpodcasts.com/Tools/OPMLViewer.aspx?url=http%3a%2f%2fwww.digitalpodcast.com%2fopml%2fdigitalpodcast.opml
* http://www.publicradiofan.com/podcasts.html
* http://newtimeradio.com/
* https://rss.itunes.apple.com/us/?urlDesc=%2Fgenerator
* https://www.podcastpedia.org/categories
* http://www.godcast1000.com/
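
## Configuration
All scripts import their database connection string from a `config.py` that is deliberately kept out of version control (see `.gitignore`). A minimal sketch, assuming a local MySQL instance loaded with `schema.sql` (host, user, and password are placeholders):

```python
# config.py -- not checked in; values below are illustrative
conn_string = 'mysql://user:password@localhost/podcasts?charset=utf8'
```

## Running
The run order below is inferred from how the scripts feed each other, not prescribed anywhere in the code:
1. Load `schema.sql` into MySQL to create the `podcasts` schema, tables, and the `episodes_flat` view.
2. Run `import_shows.py` to populate `shows` from the feed URL lists under `data/`.
3. Run `scan_episodes.py` to parse each stored feed (served from the on-disk cache where possible) and populate `episodes`.
4. Run `export_data.py` to dump both tables to `data/shows.csv` and `data/episodes.csv`.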
--------------------------------------------------------------------------------
/scan_episodes.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from __future__ import print_function
import sys
from sqlalchemy import create_engine, select, MetaData, Table, Column, String, Integer, DateTime
from config import conn_string
from podcast_lib import PodcastLib

engine = create_engine(conn_string)
db_conn = engine.connect()
metadata = MetaData(engine)

shows_table = Table('shows', metadata,
    Column('id', String(36), primary_key=True),
    Column('feed_url', String(255)),
)

episodes_table = Table('episodes', metadata,
    Column('id', String(36), primary_key=True),
    Column('show_id', String(36)),
    Column('title', String(255)),
    Column('link', String(255)),
    Column('guid', String(255)),
    Column('subtitle', String()),
    Column('description', String()),
    Column('summary', String()),
    Column('author', String(255)),
    Column('audio_url', String(255)),
    Column('audio_file_size', Integer),
    Column('audio_mime_type', String(50)),
    Column('category', String(200)),
    Column('explicit', Integer),
    Column('length', Integer),
    Column('pub_date', DateTime()),
    Column('keywords', String(800))
)

shows = db_conn.execute(select([shows_table]))

# Re-fetch every stored feed and persist its episode items. Feeds already
# downloaded by import_shows.py are served from the on-disk cache inside
# PodcastLib.process_feed, so this pass is mostly parsing.
for show in shows:
    try:
        obj = PodcastLib.process_feed(show['feed_url'].encode('utf-8'), process_episodes=True)
        if obj:
            for episode in obj['episodes']:
                PodcastLib.save_to_db(db_conn, episodes_table, episode)

    except KeyboardInterrupt:
        sys.exit(0)
--------------------------------------------------------------------------------
/import_shows.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from __future__ import print_function
import glob, sys, os
from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, DateTime
from config import conn_string
from podcast_lib import PodcastLib

engine = create_engine(conn_string)
db_conn = engine.connect()
metadata = MetaData(engine)

shows_table = Table('shows', metadata,
    Column('id', String(36), primary_key=True),
    Column('feed_url', String(255)),
    Column('title', String(255)),
    Column('subtitle', String()),
    Column('description', String()),
    Column('summary', String()),
    Column('author', String(250)),
    Column('email', String(255)),
    Column('link', String(255)),
    Column('language', String(10)),
    Column('explicit', Integer),
    Column('image', String(500)),
    Column('category', String(200)),
    Column('subcategory', String(200)),
    Column('created_at', DateTime()),
    Column('last_build_date', DateTime())
)

if __name__ == '__main__':
    # Resume support: checkpoint.log holds the URL a previous run died on.
    # Missing file (e.g. a first run) means start from the top.
    checkpoint = ''
    if os.path.exists('checkpoint.log'):
        with open('checkpoint.log', 'r') as chk:
            checkpoint = chk.read()

    for f in glob.glob('data/*.txt'):
        with open(f) as fh:
            for line in fh.read().split("\n"):

                # Skip ahead until the checkpointed URL is reached, then
                # resume from that line inclusive (it failed last time).
                if checkpoint != '' and checkpoint.strip() != line.strip():
                    continue
                else:
                    checkpoint = ''

                if line == '':
                    continue

                try:
                    obj = PodcastLib.process_feed(line)
                    if obj:
                        PodcastLib.save_to_db(db_conn, shows_table, obj)

                except KeyboardInterrupt:
                    with open('checkpoint.log', 'w') as chk:
                        chk.write(line)
                    sys.exit(0)

                except:
                    # Record where we died before re-raising, so the next
                    # run can resume from this URL.
                    with open('checkpoint.log', 'w') as chk:
                        chk.write(line)

                    raise
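
# Input format note (illustrative): each data/*.txt file is a plain-text list
# of RSS feed URLs, one per line, e.g.
#
#   http://feeds.example.com/show-a/rss
#   http://feeds.example.com/show-b/rss
#
# The data/ directory is git-ignored (see .gitignore), so these lists are
# assembled locally from the sources in the README.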
--------------------------------------------------------------------------------
/export_data.py:
--------------------------------------------------------------------------------
import csv, io, cStringIO, codecs
from datetime import datetime
from sqlalchemy import create_engine, Table, MetaData
from sqlalchemy.orm import sessionmaker
from config import conn_string

def page_query(q):
    """ Page through a query in chunks of 1000 rows so the server never has
    to materialize the full result set at once. """
    offset = 0
    while True:
        r = False
        for elem in q.limit(1000).offset(offset):
            r = True
            yield elem
        offset += 1000
        if not r:
            break

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    (This is the UnicodeWriter recipe from the Python 2 csv documentation.)
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)

def write_row(row, writer, key_index):
    """ Serialize one result row to strings, in key_index column order. """
    row_dict = dict(zip(row.keys(), row))

    row_arr = []
    for k in key_index:
        v = row_dict[k]
        if v is None:
            row_arr.append('')
        elif type(v) == datetime:
            row_arr.append(v.strftime('%Y-%m-%dT%H:%M:%S%Z'))
        elif type(v) in (int, float, long):
            row_arr.append(str(v))
        else:
            row_arr.append(v)

    return writer.writerow(row_arr)

engine = create_engine(conn_string)
db_conn = engine.connect()
metadata = MetaData(engine)
Session = sessionmaker(bind=engine)
session = Session()

shows_table = Table('shows', metadata, autoload=True)
shows_columns = ['id', 'feed_url', 'title', 'subtitle', 'description', 'summary', 'author', 'email', 'link', 'language', 'explicit', 'image', 'category', 'subcategory', 'created_at', 'last_build_date']

episodes_table = Table('episodes', metadata, autoload=True)
episodes_columns = ['id', 'show_id', 'title', 'link', 'guid', 'subtitle', 'description', 'summary', 'author', 'audio_url', 'audio_file_size', 'audio_mime_type', 'category', 'explicit', 'length', 'pub_date', 'keywords']

query = session.query(shows_table)
with io.open('data/shows.csv', 'wb') as fh:
    writer = UnicodeWriter(fh, encoding='utf-8')
    writer.writerow(shows_columns)

    for row in page_query(query):
        write_row(row, writer, shows_columns)

query = session.query(episodes_table)
with io.open('data/episodes.csv', 'wb') as fh:
    writer = UnicodeWriter(fh, encoding='utf-8')
    writer.writerow(episodes_columns)

    for row in page_query(query):
        write_row(row, writer, episodes_columns)
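
# Quick sanity check (illustrative, not part of the export itself): read back
# the header row of each file that was just written.
for path in ('data/shows.csv', 'data/episodes.csv'):
    with open(path, 'rb') as fh:
        print('%s -> %s' % (path, fh.readline().strip()))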
--------------------------------------------------------------------------------
/schema.sql:
--------------------------------------------------------------------------------
-- MySQL Script generated by MySQL Workbench
-- Thu Nov 24 21:45:42 2016
-- Model: New Model    Version: 1.0
-- MySQL Workbench Forward Engineering

SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0;
SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0;
SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='TRADITIONAL,ALLOW_INVALID_DATES';

-- -----------------------------------------------------
-- Schema podcasts
-- -----------------------------------------------------
CREATE SCHEMA IF NOT EXISTS `podcasts` DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci ;
USE `podcasts` ;

-- -----------------------------------------------------
-- Table `podcasts`.`shows`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `podcasts`.`shows` ;

CREATE TABLE IF NOT EXISTS `podcasts`.`shows` (
  `id` VARCHAR(36) NOT NULL COMMENT '',
  `feed_url` VARCHAR(255) NOT NULL COMMENT '',
  `title` VARCHAR(255) NOT NULL COMMENT '',
  `subtitle` TEXT NULL COMMENT '',
  `description` TEXT NULL COMMENT '',
  `summary` TEXT NULL COMMENT '',
  `author` VARCHAR(512) NULL COMMENT '',
  `email` VARCHAR(255) NULL COMMENT '',
  `link` VARCHAR(255) NOT NULL COMMENT '',
  `language` VARCHAR(10) NOT NULL DEFAULT 'en-us' COMMENT '',
  `explicit` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '',
  `image` VARCHAR(255) NULL COMMENT '',
  `category` VARCHAR(200) NULL COMMENT '',
  `subcategory` VARCHAR(200) NULL COMMENT '',
  `created_at` DATETIME NOT NULL COMMENT '',
  `last_build_date` DATETIME NOT NULL COMMENT '',
  PRIMARY KEY (`id`),
  UNIQUE INDEX `feed_url_UNIQUE` (`feed_url` ASC))
ENGINE = InnoDB;


-- -----------------------------------------------------
-- Table `podcasts`.`episodes`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `podcasts`.`episodes` ;

CREATE TABLE IF NOT EXISTS `podcasts`.`episodes` (
  `id` VARCHAR(36) NOT NULL COMMENT '',
  `show_id` VARCHAR(36) NOT NULL COMMENT '',
  `title` VARCHAR(1024) NULL COMMENT '',
  `link` VARCHAR(555) NULL COMMENT '',
  `guid` VARCHAR(512) NULL COMMENT 'Theoretically a GUID, but not enforced by many platforms so not unique',
  `subtitle` TEXT NULL COMMENT '',
  `description` VARCHAR(255) NULL COMMENT '',
  `summary` TEXT NULL COMMENT '',
  `author` VARCHAR(255) NULL COMMENT '',
  `audio_url` VARCHAR(1024) NOT NULL COMMENT '',
  `audio_file_size` BIGINT NOT NULL COMMENT '',
  `audio_mime_type` VARCHAR(255) NOT NULL DEFAULT 'audio/mp3' COMMENT '',
  `category` VARCHAR(200) NULL COMMENT '',
  `explicit` TINYINT(1) NOT NULL DEFAULT 0 COMMENT '',
  `length` INT NOT NULL COMMENT '',
  `pub_date` DATETIME NOT NULL COMMENT '',
  `keywords` VARCHAR(1800) NULL COMMENT '',
  PRIMARY KEY (`id`),
  INDEX `show_fk_idx` (`show_id` ASC),
  UNIQUE INDEX `mp3_url_UNIQUE` (`audio_url` ASC),
  CONSTRAINT `show_fk`
    FOREIGN KEY (`show_id`)
    REFERENCES `podcasts`.`shows` (`id`)
    ON DELETE NO ACTION
    ON UPDATE NO ACTION)
ENGINE = InnoDB;

USE `podcasts` ;

-- -----------------------------------------------------
-- View `podcasts`.`episodes_flat`
-- -----------------------------------------------------
DROP VIEW IF EXISTS `podcasts`.`episodes_flat` ;
USE `podcasts`;
CREATE OR REPLACE VIEW `episodes_flat` AS
SELECT
  episodes.*,
  shows.title as show_title,
  shows.subtitle as show_subtitle,
  shows.description as show_description,
  shows.summary as show_summary,
  shows.author as show_author,
  shows.email as show_email,
  shows.link as show_link,
  shows.language,
  shows.explicit as show_explicit,
  shows.image,
  shows.category as show_category,
  shows.subcategory as show_subcategory,
  shows.feed_url as feed_url
FROM episodes
INNER JOIN shows ON episodes.show_id=shows.id;

SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;
SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS;
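
-- Example query (illustrative): episodes_flat joins each episode to its show,
-- so show context can be selected alongside episode fields, e.g.:
--
--   SELECT show_title, title, pub_date
--   FROM episodes_flat
--   ORDER BY pub_date DESC
--   LIMIT 10;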
--------------------------------------------------------------------------------
/podcast_lib.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import requests, os, datetime, uuid, re, hashlib, sqlalchemy
import xml.etree.ElementTree as ET

class PodcastLib:
    # Feed-level tags copied straight into the show object.
    show_fields = [
        'feed_url',
        'title',
        'subtitle',
        'description',
        'summary',
        'author',
        'email',
        'link',
        'language'
    ]

    # Item-level tags copied straight into the episode object.
    episode_fields = [
        'title',
        'link',
        'guid',
        'subtitle',
        'description',
        'summary',
        'author',
        'category',
        'keywords'
    ]

    tmp_dir = '/tmp/podcasts/feeds/'
    # iTunes podcast namespace; tag names are compared lower-cased.
    dtd = '{http://www.itunes.com/DTDs/Podcast-1.0.dtd}'.lower()

    @staticmethod
    def process_episode(node):
        """
        Parse an XML node into an episode object.

        Params:
            node -- An item node from a parsed XML feed.

        Returns:
            Episode object.
        """

        obj = {}
        for child in node:
            # Strip the iTunes (or any other) namespace from the tag name.
            tag = child.tag.lower().replace(PodcastLib.dtd, '').split('}')[-1]
            if tag in PodcastLib.episode_fields:
                obj[tag] = child.text.strip() if child.text else ''

            elif tag == 'enclosure':
                if 'url' in child.attrib:
                    obj['audio_url'] = child.attrib['url']
                if 'length' in child.attrib:
                    obj['audio_file_size'] = child.attrib['length'].replace(',', '')
                if 'type' in child.attrib:
                    obj['audio_mime_type'] = child.attrib['type']

            elif tag == 'explicit':
                obj['explicit'] = (child.text.strip().lower() == 'yes') if child.text is not None else 0

            elif tag == 'pubdate':
                # RFC 822 dates; a trailing numeric timezone such as +0000 is
                # dropped before parsing, anything else (e.g. 'GMT') falls
                # through to None.
                try:
                    if re.search('[+-][0-9]+$', child.text.strip()):
                        dt = datetime.datetime.strptime(child.text.strip()[0:-5].strip(), '%a, %d %b %Y %H:%M:%S')
                    else:
                        dt = datetime.datetime.strptime(child.text.strip(), '%a, %d %b %Y %H:%M:%S')
                except (ValueError, AttributeError):
                    dt = None
                obj['pub_date'] = dt

            elif tag == 'duration':
                if child.text and ':' in child.text:
                    # [HH:]MM:SS -- reverse the parts so index i carries
                    # weight 60**i (seconds, minutes, hours).
                    lengths = child.text.split(':')[::-1]
                    duration = 0

                    for i in range(0, len(lengths)):
                        try:
                            duration += (60 ** i) * int(float(lengths[i]))
                        except (ValueError, TypeError):
                            pass
                else:
                    try:
                        duration = int(child.text)
                    except (ValueError, TypeError):
                        duration = None

                obj['length'] = duration

        if 'length' not in obj:
            obj['length'] = None

        if type(obj['length']) is str and ':' in obj['length']:
            obj['length'] = None

        # Truncate free-text fields to their column sizes.
        if 'description' in obj:
            obj['description'] = obj['description'][:255]

        if 'author' in obj:
            obj['author'] = obj['author'][:255]

        if 'keywords' in obj:
            # Normalize comma spacing and truncate to the keywords column
            # size (VARCHAR(1800) in schema.sql).
            obj['keywords'] = ','.join([r.strip() for r in obj['keywords'].split(',')])[:1800]

        if 'audio_file_size' in obj:
            if type(obj['audio_file_size']) is str:
                try:
                    if obj['audio_file_size'] and int(obj['audio_file_size']) < 0:
                        obj['audio_file_size'] = ''
                except ValueError:
                    obj['audio_file_size'] = ''

        return obj
    @staticmethod
    def parse_xml(xml, process_episodes=False):
        """
        Parse an XML string into a show object.

        Params:
            xml -- XML string to parse.
            process_episodes -- If true, process episode items and add them to the show object.

        Returns:
            A show object.
        """

        parser = ET.XMLParser(encoding="utf-8")
        tree = ET.fromstring(xml, parser=parser)
        obj = {}

        for channel in tree.findall('channel'):
            for child in channel:
                tag = child.tag.lower().replace(PodcastLib.dtd, '').split('}')[-1]

                if tag in PodcastLib.show_fields:
                    obj[tag] = child.text.strip() if child.text else ''

                elif tag == 'lastbuilddate':
                    try:
                        if re.search('[+-][0-9]+$', child.text.strip()):
                            dt = datetime.datetime.strptime(child.text.strip()[0:-5].strip(), '%a, %d %b %Y %H:%M:%S')
                        else:
                            dt = datetime.datetime.strptime(child.text.strip(), '%a, %d %b %Y %H:%M:%S')
                    except (ValueError, AttributeError):
                        dt = datetime.datetime.now()

                    obj['last_build_date'] = dt.strftime('%Y-%m-%d %H:%M:%S')

                elif tag == 'owner':
                    if child.find(PodcastLib.dtd + 'email') is not None:
                        obj['email'] = child.find(PodcastLib.dtd + 'email').text

                elif tag == 'category':
                    # First category tag wins; the second becomes the subcategory.
                    tag = 'category' if 'category' not in obj else 'subcategory'
                    if child.attrib and 'text' in child.attrib:
                        obj[tag] = child.attrib['text']
                    elif type(child.text) == str:
                        obj[tag] = child.text.strip()

                elif tag == 'explicit':
                    obj['explicit'] = (child.text.strip().lower() == 'yes') if child.text is not None else 0

                elif tag == 'image':
                    # Plain RSS <image><url>, an itunes:image href attribute,
                    # or bare text, in that order of preference.
                    if child.find('url') is not None:
                        obj['image'] = child.find('url').text
                    elif type(child.attrib) == dict and 'href' in child.attrib:
                        obj['image'] = child.attrib['href']
                    elif child.text and child.text.strip() != '':
                        obj['image'] = child.text.strip()

        obj['created_at'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        if 'last_build_date' not in obj:
            obj['last_build_date'] = obj['created_at']

        if 'language' in obj:
            obj['language'] = obj['language'].lower().split(',')[0][0:10]

        # Normalize attribute dicts to plain strings before truncating, so
        # the slices below never hit a non-string value.
        if 'image' in obj and type(obj['image']) != str:
            if type(obj['image']) == dict and 'href' in obj['image']:
                obj['image'] = obj['image']['href']
            else:
                obj['image'] = ''

        if 'link' in obj and type(obj['link']) != str:
            if type(obj['link']) == dict and 'href' in obj['link']:
                obj['link'] = obj['link']['href']
            else:
                obj['link'] = ''

        # Truncate fields to their column sizes.
        if 'email' in obj and obj['email']:
            obj['email'] = obj['email'][0:255]

        if 'category' in obj and obj['category']:
            obj['category'] = obj['category'][0:200]

        if 'author' in obj and obj['author']:
            obj['author'] = obj['author'][0:250]

        if 'title' in obj and obj['title']:
            obj['title'] = obj['title'][0:255]

        if 'image' in obj and obj['image']:
            obj['image'] = obj['image'][0:255]

        if process_episodes:
            obj['episodes'] = []

            for channel in tree.findall('channel'):
                for child in channel.findall('item'):
                    try:
                        obj['episodes'].append(PodcastLib.process_episode(child))
                    except:
                        print('could not process episode')
                        raise

        return obj

    @staticmethod
    def save_to_file(filename, contents):
        """
        Save file contents to a tmp file.

        Params:
            filename -- Local file to write to.
            contents -- Binary file contents.
        """

        # Create the per-feed cache directory (and any missing parents).
        if not os.path.exists(filename.rsplit('/', 1)[0]):
            os.makedirs(filename.rsplit('/', 1)[0])

        with open(filename, 'wb') as fh:
            fh.write(contents)

        return True

    @staticmethod
    def process_feed(url, process_episodes=False):
        """
        Download and process a feed URL.

        Params:
            url -- RSS feed URL to process.
            process_episodes -- Flag to determine if the parser will parse episode items.
                                Default false.

        Returns:
            A show object, or False on failure.
        """

        try:
            # Deterministic show id derived from the URL, plus a cache path
            # derived from its hash, so repeat runs hit the local copy.
            feed_id = str(uuid.uuid3(uuid.NAMESPACE_URL, url))
            tmp_location = PodcastLib.tmp_dir + feed_id + '/' + hashlib.sha256(url.lower()).hexdigest() + '.xml'

            if os.path.exists(tmp_location):
                print('cache hit')
                with open(tmp_location, 'rb') as fh:
                    contents = fh.read()
            else:
                req = requests.get(url, timeout=30)
                contents = req.content
                PodcastLib.save_to_file(tmp_location, contents)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.TooManyRedirects,
                requests.exceptions.ReadTimeout,
                requests.exceptions.InvalidSchema,
                UnicodeDecodeError):
            print('Connection error')
            return False

        # On a cache hit `req` is undefined, but the short-circuit on the
        # cached file keeps this check safe.
        if os.path.exists(tmp_location) or req.status_code == 200:
            try:
                obj = PodcastLib.parse_xml(contents, process_episodes)
                obj['feed_url'] = url.lower()
                obj['id'] = feed_id

                if 'episodes' in obj:
                    for i in range(0, len(obj['episodes'])):
                        obj['episodes'][i]['show_id'] = obj['id']
                        # Prefer a deterministic episode id derived from the
                        # audio URL and file size; fall back to a random UUID.
                        try:
                            digest = hashlib.sha256(obj['episodes'][i]['audio_url'].decode('utf-8').lower() + obj['episodes'][i]['audio_file_size'].decode('UTF-8')).hexdigest()
                            ep_id = str(uuid.uuid3(uuid.NAMESPACE_URL, digest))
                        except (KeyError, UnicodeEncodeError, UnicodeDecodeError):
                            ep_id = str(uuid.uuid4())
                        obj['episodes'][i]['id'] = ep_id

                return obj

            except ET.ParseError as e:
                print('Bad XML document, parse failed.')
                print(e)

        return False

    @staticmethod
    def save_to_db(db_conn, tab, obj):
        """
        Save an object to the database.

        Params:
            db_conn -- SQLAlchemy connection.
            tab -- Table definition.
            obj -- Data to insert.
        """
        ins = tab.insert(values=obj)
        try:
            db_conn.execute(ins)
            return True
        except sqlalchemy.exc.IntegrityError:
            print('Non-unique feed, skipping')
        except sqlalchemy.exc.CompileError:
            print('Table error')
            print(ins.compile().params)
            raise
        except sqlalchemy.exc.OperationalError:
            print('Missing required field')
        except:
            print('Unexpected exception')
            print(ins.compile().params)
            raise
        return False
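
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; the default feed URL is a placeholder): fetch
# and parse a single feed, then print the show title and its episode titles.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import sys
    url = sys.argv[1] if len(sys.argv) > 1 else 'http://example.com/feed.xml'
    show = PodcastLib.process_feed(url, process_episodes=True)
    if show:
        print(show.get('title'))
        for ep in show.get('episodes', []):
            print(' - ' + (ep.get('title') or ''))
--------------------------------------------------------------------------------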