49 |
50 | {% if languages|length > 1 %}
51 |
52 | Any languages
53 |
54 |
55 | {% else %}
56 |
57 |
58 | {% endif %}
59 |
82 |
83 | {% if show_books %}
84 |
85 |
86 |
87 |
88 |
89 | {% endif %}
90 |
91 |
92 |
93 |
94 |
95 | {% block precontent %}{% endblock %}
96 |
97 | {% block content %}
98 | {% endblock %}
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/gutenbergtozim/database.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim: ai ts=4 sts=4 et sw=4 nu
4 |
5 | from __future__ import (unicode_literals, absolute_import,
6 | division, print_function)
7 |
8 | from peewee import (Model, # SqliteDatabase,
9 | CharField, BooleanField,
10 | IntegerField, ForeignKeyField, TextField)
11 | from playhouse.apsw_ext import APSWDatabase
12 |
13 | from gutenbergtozim import logger
14 |
# Module-level database handle shared by every model below.
# NOTE: importing this module has side effects -- it opens (and connects to)
# 'gutenberg.db', a path relative to the current working directory.
# db = SqliteDatabase('gutenberg.db')
# Evaluates to 4,800,000.  Presumably a busy timeout in milliseconds (80 min);
# the unit depends on the installed playhouse.apsw_ext version -- TODO confirm.
timeout = 1000 * 60 * 5 * 16
db = APSWDatabase('gutenberg.db', pragmas=(
    ('journal_mode', 'WAL'),
    ('cache_size', 10000),
    ('mmap_size', 1024 * 1024 * 32)),
    timeout=timeout)
db.connect()
# journal_mode is also listed in the pragmas above; it is issued again
# explicitly on the freshly opened connection.
db.execute_sql("PRAGMA journal_mode=WAL;")
24 |
25 |
class BaseModel(Model):
    """Common base for all models; adds a non-raising lookup helper."""

    @classmethod
    def get_or_none(cls, *query, **kwargs):
        """Return the row matched by ``cls.get(...)`` or ``None``.

        Arguments are forwarded unchanged to ``cls.get``; only
        ``cls.DoesNotExist`` is translated into ``None``.
        """
        try:
            found = cls.get(*query, **kwargs)
        except cls.DoesNotExist:
            found = None
        return found
33 |
34 |
class License(BaseModel):
    """License attached to a book, identified by a short slug."""

    class Meta:
        database = db
        # Seed rows; load_fixtures() reads this list through model._meta
        # and creates one row per dict.
        fixtures = [
            {'slug': 'PD', 'name': "Public domain in the USA."},
            {'slug': 'None', 'name': "None"},
            {'slug': 'Copyright', 'name': "Copyrighted. Read the copyright "
                                          "notice inside this book "
                                          "for details."},
        ]

    # Primary key: short identifier such as 'PD' or 'Copyright'.
    slug = CharField(max_length=20, primary_key=True)
    # Human-readable license text.
    name = CharField()

    def __unicode__(self):
        """Return the human-readable license name."""
        return self.name
52 |
53 |
class Format(BaseModel):
    """A file format in which a book may be available."""

    class Meta:
        database = db

    # MIME type; get_urls() strips anything after ';' (the encoding part).
    mime = CharField(max_length=100)
    images = BooleanField(default=True)
    # File-name template; get_urls() expands it with ``.format(id=book.id)``.
    pattern = CharField(max_length=100)

    def __unicode__(self):
        """Return the MIME type string."""
        return self.mime
65 |
66 |
class Author(BaseModel):
    """A book author, keyed by an identifier string (``gut_id``)."""

    class Meta:
        database = db
        # Seed rows created by load_fixtures() for the two generic authors.
        fixtures = [
            {
                'gut_id': '116',
                'last_name': "Various",
            },
            {
                'gut_id': '216',
                'last_name': "Anonymous",
            },
        ]

    gut_id = CharField(primary_key=True, max_length=100)
    last_name = CharField(max_length=150)
    first_names = CharField(max_length=300, null=True)
    birth_year = CharField(max_length=10, null=True)
    death_year = CharField(max_length=10, null=True)

    def __unicode__(self):
        """Return the display name (see ``name``)."""
        return self.name()

    def fname(self):
        """Return ``<display name>.<gut_id>``."""
        display = self.name()
        return "{name}.{id}".format(name=display, id=self.gut_id)

    def name(self):
        """Return the sanitized display name.

        Uses "first last" when both parts are set, whichever part is set
        when only one is, and "Anonymous" when neither is.  The result is
        stripped, has '/' replaced by '-', and is cut to 230 characters.
        """
        first, last = self.first_names, self.last_name
        if first and last:
            raw = "{fn} {ln}".format(ln=last, fn=first)
        elif last:
            raw = last
        elif first:
            raw = first
        else:
            raw = "Anonymous"
        return raw.strip().replace('/', '-')[:230]

    def to_dict(self):
        """Return the author as a plain dict, including the display label."""
        label = self.name()
        return {'label': label,
                'id': self.gut_id,
                'last_name': self.last_name,
                'first_names': self.first_names,
                'birth_year': self.birth_year,
                'death_year': self.death_year}

    def to_array(self):
        """Return the compact form ``[display name, gut_id]``."""
        # Name parts and birth/death years are deliberately left out.
        label = self.name()
        return [label, self.gut_id]
125 |
126 |
class Book(BaseModel):
    """A single book with its metadata, author and license."""

    class Meta:
        database = db

    id = IntegerField(primary_key=True)
    title = CharField(max_length=500)
    subtitle = CharField(max_length=500, null=True)
    author = ForeignKeyField(Author, related_name='books')
    license = ForeignKeyField(License, related_name='books')
    language = CharField(max_length=10)
    downloads = IntegerField(default=0)
    bookshelf = CharField(max_length=500, null=True)
    cover_page = IntegerField(default=0)

    # Plain class attribute (not a database column).
    popularity = 0

    def __unicode__(self):
        """Return ``id/title/bookshelf``."""
        return "{}/{}/{}".format(self.id, self.title,self.bookshelf)

    def to_dict(self):
        """Return the book metadata as a plain dict."""
        summary = {'title': self.title,
                   'subtitle': self.subtitle,
                   'author': self.author.name(),
                   'license': self.license,
                   'language': self.language,
                   'downloads': self.downloads,
                   'bookshelf': self.bookshelf,
                   'cover_page': self.cover_page}
        return summary

    def to_array(self):
        """Return ``[title, author name, format flags, id, bookshelf]``.

        The format flags are a three-character string of 0/1 digits telling
        whether 'html', 'epub' and 'pdf' (in that order) are among
        ``self.formats()``.
        """
        available = self.formats()
        title = self.title
        author_name = self.author.name()
        has_html = int('html' in available)
        has_epub = int('epub' in available)
        has_pdf = int('pdf' in available)
        flags = "{html}{epub}{pdf}".format(
            html=has_html, epub=has_epub, pdf=has_pdf)
        # Subtitle, license, language and downloads are deliberately left out.
        return [title, author_name, flags, self.id, self.bookshelf]

    def formats(self):
        """Return ``main_formats_for(self)`` from ``gutenbergtozim.utils``."""
        # Imported at call time rather than at module import.
        from gutenbergtozim.utils import main_formats_for
        return main_formats_for(self)
177 |
178 |
class BookFormat(BaseModel):
    """Association row: a given book is available in a given format."""

    class Meta:
        database = db

    book = ForeignKeyField(Book, related_name='bookformats')
    format = ForeignKeyField(Format, related_name='bookformats')
    # Nullable; presumably the URL the file was fetched from -- TODO confirm.
    downloaded_from = CharField(max_length=300, null=True)

    def __unicode__(self):
        """Return ``[<format>] <book title>``."""
        return "[{}] {}".format(self.format, self.book.title)
190 |
class Url(BaseModel):
    """A file path known to exist on the mirror.

    Rows are bulk-loaded by ``setup_urls`` (via ``load_csv``) and looked up
    by ``build_urls`` using a URL's path without its leading slash.
    """

    class Meta:
        database = db

    # Indexed because build_urls() does one lookup per candidate URL.
    url = TextField(index=True)

    def __unicode__(self):
        """Return the stored path."""
        return self.url
200 |
def load_fixtures(model):
    """Create one row per entry of ``model._meta.fixtures``.

    Models without a ``fixtures`` attribute on ``_meta`` are left empty.
    """
    logger.info("Loading fixtures for {}".format(model._meta.name))

    rows = getattr(model._meta, 'fixtures', [])
    for row in rows:
        created = model.create(**row)
        logger.debug("[fixtures] Created {}".format(created))
207 |
208 |
def setup_database(wipe=False):
    """Create any missing table and load its fixtures.

    When ``wipe`` is true every table is dropped first, so all tables are
    recreated from scratch.  Tables that already exist are left untouched.
    """
    logger.info("Setting up the database")

    # Order matters: referenced tables come before the ones pointing at them.
    models = (License, Format, Author, Book, BookFormat, Url)
    for model in models:
        if wipe:
            model.drop_table(fail_silently=True)

        if model.table_exists():
            logger.debug("{} table already exists.".format(model._meta.name))
            continue

        model.create_table()
        logger.debug("Created table for {}".format(model._meta.name))
        load_fixtures(model)
221 |
--------------------------------------------------------------------------------
/gutenbergtozim/iso639.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim: ai ts=4 sts=4 et sw=4 nu
4 |
5 | from __future__ import (unicode_literals, absolute_import,
6 | division, print_function)
7 |
8 | import babel
9 |
10 |
def language_name(code):
    """Return a display name for the language ``code``.

    Asks babel for the language's name in its own locale, title-cased.  On
    any failure, falls back to ``other_language_names`` and finally to the
    code itself.
    """
    try:
        locale = babel.Locale(code)
        native_name = locale.get_language_name(code)
        return native_name.title()
    except Exception:
        # Deliberately broad: covers unknown locales as well as a missing
        # name (``None.title()``).
        return other_language_names.get(code, code)
16 |
17 |
18 | # Autonyms unknown by babel
19 | # collected from CLDR, MediaWiki languages/Names.php, and English Wikipedia
20 | other_language_names = {'fy': "Frysk",
21 | 'iu': "ᐃᓄᒃᑎᑐᑦ / inuktitut",
22 | 'la': "Latina",
23 | 'mi': "Māori",
24 | 'no': "norsk bokmål",
25 | 'oc': "occitan",
26 | 'sa': "संस्कृतम्",
27 | 'tl': "Tagalog",
28 | 'yi': "ייִדיש",
29 | 'ale': "Unangam Tunuu / Унáҥам Тунý",
30 | 'ang': "Ænglisc",
31 | 'arp': "Hinónoʼeitíít",
32 | 'bgi': "Giangan",
33 | 'ceb': "Cebuano",
34 | 'csb': "kaszëbsczi",
35 | 'enm': "Middle English",
36 | 'fur': "furlan",
37 | 'gla': "Gàidhlig",
38 | 'grc': "Ἀρχαία ἑλληνικὴ",
39 | 'ilo': "Ilokano",
40 | 'kha': "Khasi",
41 | 'kld': "Gamilaraay",
42 | 'myn': "Maya",
43 | 'nah': "Nāhuatl",
44 | 'nai': "Amérindien",
45 | 'nap': "Napulitano",
46 | 'nav': "Diné bizaad",
47 | 'oji': "Ojibwe",
48 | 'rmr': "Caló"}
49 |
50 |
# Two-letter language codes mapped to their three-letter equivalents.
# Imported from http://www-01.sil.org/iso639-3/download.asp
52 | ISO_MATRIX = {'aa': 'aar',
53 | 'ab': 'abk',
54 | 'ae': 'ave',
55 | 'af': 'afr',
56 | 'ak': 'aka',
57 | 'am': 'amh',
58 | 'an': 'arg',
59 | 'ar': 'ara',
60 | 'as': 'asm',
61 | 'av': 'ava',
62 | 'ay': 'aym',
63 | 'az': 'aze',
64 | 'ba': 'bak',
65 | 'be': 'bel',
66 | 'bg': 'bul',
67 | 'bi': 'bis',
68 | 'bm': 'bam',
69 | 'bn': 'ben',
70 | 'bo': 'bod',
71 | 'br': 'bre',
72 | 'bs': 'bos',
73 | 'ca': 'cat',
74 | 'ce': 'che',
75 | 'ch': 'cha',
76 | 'co': 'cos',
77 | 'cr': 'cre',
78 | 'cs': 'ces',
79 | 'cu': 'chu',
80 | 'cv': 'chv',
81 | 'cy': 'cym',
82 | 'da': 'dan',
83 | 'de': 'deu',
84 | 'dv': 'div',
85 | 'dz': 'dzo',
86 | 'ee': 'ewe',
87 | 'el': 'ell',
88 | 'en': 'eng',
89 | 'eo': 'epo',
90 | 'es': 'spa',
91 | 'et': 'est',
92 | 'eu': 'eus',
93 | 'fa': 'fas',
94 | 'ff': 'ful',
95 | 'fi': 'fin',
96 | 'fj': 'fij',
97 | 'fo': 'fao',
98 | 'fr': 'fra',
99 | 'fy': 'fry',
100 | 'ga': 'gle',
101 | 'gd': 'gla',
102 | 'gl': 'glg',
103 | 'gn': 'grn',
104 | 'gu': 'guj',
105 | 'gv': 'glv',
106 | 'ha': 'hau',
107 | 'he': 'heb',
108 | 'hi': 'hin',
109 | 'ho': 'hmo',
110 | 'hr': 'hrv',
111 | 'ht': 'hat',
112 | 'hu': 'hun',
113 | 'hy': 'hye',
114 | 'hz': 'her',
115 | 'ia': 'ina',
116 | 'id': 'ind',
117 | 'ie': 'ile',
118 | 'ig': 'ibo',
119 | 'ii': 'iii',
120 | 'ik': 'ipk',
121 | 'io': 'ido',
122 | 'is': 'isl',
123 | 'it': 'ita',
124 | 'iu': 'iku',
125 | 'ja': 'jpn',
126 | 'jv': 'jav',
127 | 'ka': 'kat',
128 | 'kg': 'kon',
129 | 'ki': 'kik',
130 | 'kj': 'kua',
131 | 'kk': 'kaz',
132 | 'kl': 'kal',
133 | 'km': 'khm',
134 | 'kn': 'kan',
135 | 'ko': 'kor',
136 | 'kr': 'kau',
137 | 'ks': 'kas',
138 | 'ku': 'kur',
139 | 'kv': 'kom',
140 | 'kw': 'cor',
141 | 'ky': 'kir',
142 | 'la': 'lat',
143 | 'lb': 'ltz',
144 | 'lg': 'lug',
145 | 'li': 'lim',
146 | 'ln': 'lin',
147 | 'lo': 'lao',
148 | 'lt': 'lit',
149 | 'lu': 'lub',
150 | 'lv': 'lav',
151 | 'mg': 'mlg',
152 | 'mh': 'mah',
153 | 'mi': 'mri',
154 | 'mk': 'mkd',
155 | 'ml': 'mal',
156 | 'mn': 'mon',
157 | 'mr': 'mar',
158 | 'ms': 'msa',
159 | 'mt': 'mlt',
160 | 'my': 'mya',
161 | 'na': 'nau',
162 | 'nb': 'nob',
163 | 'nd': 'nde',
164 | 'ne': 'nep',
165 | 'ng': 'ndo',
166 | 'nl': 'nld',
167 | 'nn': 'nno',
168 | 'no': 'nor',
169 | 'nr': 'nbl',
170 | 'nv': 'nav',
171 | 'ny': 'nya',
172 | 'oc': 'oci',
173 | 'oj': 'oji',
174 | 'om': 'orm',
175 | 'or': 'ori',
176 | 'os': 'oss',
177 | 'pa': 'pan',
178 | 'pi': 'pli',
179 | 'pl': 'pol',
180 | 'ps': 'pus',
181 | 'pt': 'por',
182 | 'qu': 'que',
183 | 'rm': 'roh',
184 | 'rn': 'run',
185 | 'ro': 'ron',
186 | 'ru': 'rus',
187 | 'rw': 'kin',
188 | 'sa': 'san',
189 | 'sc': 'srd',
190 | 'sd': 'snd',
191 | 'se': 'sme',
192 | 'sg': 'sag',
193 | 'sh': 'hbs',
194 | 'si': 'sin',
195 | 'sk': 'slk',
196 | 'sl': 'slv',
197 | 'sm': 'smo',
198 | 'sn': 'sna',
199 | 'so': 'som',
200 | 'sq': 'sqi',
201 | 'sr': 'srp',
202 | 'ss': 'ssw',
203 | 'st': 'sot',
204 | 'su': 'sun',
205 | 'sv': 'swe',
206 | 'sw': 'swa',
207 | 'ta': 'tam',
208 | 'te': 'tel',
209 | 'tg': 'tgk',
210 | 'th': 'tha',
211 | 'ti': 'tir',
212 | 'tk': 'tuk',
213 | 'tl': 'tgl',
214 | 'tn': 'tsn',
215 | 'to': 'ton',
216 | 'tr': 'tur',
217 | 'ts': 'tso',
218 | 'tt': 'tat',
219 | 'tw': 'twi',
220 | 'ty': 'tah',
221 | 'ug': 'uig',
222 | 'uk': 'ukr',
223 | 'ur': 'urd',
224 | 'uz': 'uzb',
225 | 've': 'ven',
226 | 'vi': 'vie',
227 | 'vo': 'vol',
228 | 'wa': 'wln',
229 | 'wo': 'wol',
230 | 'xh': 'xho',
231 | 'yi': 'yid',
232 | 'yo': 'yor',
233 | 'za': 'zha',
234 | 'zh': 'zho',
235 | 'zu': 'zul'}
236 |
--------------------------------------------------------------------------------
/gutenbergtozim/templates/jquery/jquery.persist.js:
--------------------------------------------------------------------------------
1 | /*
2 | jquery-persist 201203*pike
3 |
4 | persist form values in cookies
5 |
6 | example usage:
7 |
8 | $('input,select,textarea').persist(options);
9 |
10 | $('input,select,textarea').unpersist(options);
11 |
12 | options
13 |
14 | context : 'def', // a context or namespace for each field
15 | replace : true, // replace existing field contents if any
16 | cookie : 'jqpersist', // cookies basename
17 | path : '/', // cookie path
18 | domain : null, // cookie domain
19 | expires : null, // cookie expiry (eg 365)
20 |
21 | */
22 |
23 |
24 |
/**
 * Restore persisted values into the matched form fields and keep them
 * persisted on every subsequent `change` event.
 *
 * @param {Object} [options] overrides for jQuery.persist.defaults
 * @returns {jQuery} the matched set, for chaining
 *
 * Uses `jQuery` instead of the global `$` throughout (so it works under
 * jQuery.noConflict()) and `.length` instead of `.size()`, which was
 * removed in jQuery 3.
 */
jQuery.fn.persist = function(options) {

    options = jQuery.extend({}, jQuery.persist.defaults, options);

    return jQuery(this).each(function() {
        var $field = jQuery(this);
        var name = $field.attr('name');
        var val = jQuery.persistedValue(name, options);
        var selector = 'input[name="' + name + '"]';
        var vals;

        // nothing stored (null), init failure (false) or empty string
        if (!val) {
            return;
        }

        switch (this.tagName.toLowerCase()) {
            case 'input':
                switch ($field.attr('type')) {
                    case 'submit':
                        // Do nothing
                        break;
                    case 'radio':
                        // if we can replace anything or there are no checked radio buttons
                        if (options['replace'] || $field.parents('form').eq(0).find(selector + ':checked').length === 0) {
                            $field.parents('form').eq(0)
                                .find(selector).each(function() {
                                    this.checked = (jQuery(this).val() == val);
                                });
                        }
                        break;
                    case 'checkbox':
                        vals = val.split(jQuery.persist.arrsep);
                        $field.parents('form').eq(0)
                            .find(selector).each(function() {
                                // if we can replace this value or it was checked by itself
                                this.checked = ((jQuery.inArray(jQuery(this).val(), vals) != -1) || (this.checked && !options['replace']));
                            });
                        break;
                    default:
                        // if we can replace it or it is empty or 0
                        if (options['replace'] || !$field.val()) {
                            $field.val(val);
                        }
                }
                break;
            case 'select':
                if ($field.attr('multiple')) {
                    vals = val.split(jQuery.persist.arrsep);
                    $field.children('option').each(function() {
                        // if we can replace this value or it was selected by itself
                        this.selected = ((jQuery.inArray(jQuery(this).val(), vals) != -1) || (this.selected && !options['replace']));
                    });
                } else {
                    // if we can replace it or it is empty or 0
                    if (options['replace'] || !$field.val()) {
                        $field.val(val);
                    }
                }
                break;
            default:
                // if we can replace it or it is empty or 0
                if (options['replace'] || !$field.val()) {
                    $field.val(val);
                }
        }
    }).on('change.persist', function() {
        var $field = jQuery(this);
        var name = $field.attr('name');
        var vals;

        switch (this.tagName.toLowerCase()) {
            case 'input':
                switch ($field.attr('type')) {
                    case "checkbox":
                        // store every checked box of the same name as one joined value
                        vals = [];
                        $field.parents('form').eq(0)
                            .find('input[name="' + name + '"]').each(function() {
                                if (this.checked) vals.push(jQuery(this).val());
                            });
                        jQuery.persistValue(name, vals.join(jQuery.persist.arrsep), options);
                        break;
                    default:
                        jQuery.persistValue(name, $field.val(), options);
                }
                break;
            case "select":
                if ($field.attr('multiple')) {
                    // store every selected option as one joined value
                    vals = [];
                    $field.children('option').each(function() {
                        if (this.selected) vals.push(jQuery(this).val());
                    });
                    jQuery.persistValue(name, vals.join(jQuery.persist.arrsep), options);
                } else {
                    jQuery.persistValue(name, $field.val(), options);
                }
                break;
            default:
                jQuery.persistValue(name, $field.val(), options);
        }
    });
};
116 |
/**
 * Forget the persisted value of every matched field and stop tracking
 * its changes.
 *
 * @param {Object} [options] overrides for jQuery.persist.defaults
 * @returns {jQuery} the matched set, for chaining
 *
 * Uses `jQuery` rather than the global `$` so it also works under
 * jQuery.noConflict().
 */
jQuery.fn.unpersist = function(options) {
    options = jQuery.extend({}, jQuery.persist.defaults, options);
    jQuery(this).each(function() {
        var name = jQuery(this).attr('name');
        // a null value removes the stored entry
        jQuery.persistValue(name, null, options);
    }).off('change.persist');
    return jQuery(this);
};
125 |
/**
 * Store, update or remove one persisted value and write the key/value
 * cookies back.
 *
 * @param {String} key     field name
 * @param {*}      value   value to store; null/undefined removes the entry
 * @param {Object} [options] overrides for jQuery.persist.defaults
 * @returns {Boolean|undefined} false when initialisation from the cookies
 *          failed, otherwise nothing
 */
jQuery.persistValue = function (key, value, options) {

    options = jQuery.extend({}, jQuery.persist.defaults, options);
    var ctx = options['context'];

    if (!jQuery.persist.keys.length) {
        if (!jQuery.persistInit(options)) return false;
    }

    // entries are namespaced as <context><ctxsep><key>
    var fullKey = ctx + jQuery.persist.ctxsep + key;
    var idx = jQuery.inArray(fullKey, jQuery.persist.keys);
    var removing = (value === null || value === undefined);

    if (idx != -1) {
        if (removing) {
            // remove value
            if (jQuery.persist.debug) console.log('unpersist '+key);
            jQuery.persist.keys.splice(idx, 1);
            jQuery.persist.vals.splice(idx, 1);
        } else {
            if (jQuery.persist.debug) console.log('persist '+key+':'+value);
            jQuery.persist.vals[idx] = value;
        }
    } else {
        if (!removing) {
            if (jQuery.persist.debug) console.log('add persist '+key+':'+value);
            jQuery.persist.keys.push(fullKey);
            jQuery.persist.vals.push(value);
        }
    }

    if (jQuery.persist.keys.length) {
        // store keys/vals
        jQuery.cookie(options.cookie+'_keys', jQuery.persist.keys.join(jQuery.persist.elmsep), options);
        jQuery.cookie(options.cookie+'_vals', jQuery.persist.vals.join(jQuery.persist.elmsep), options);
    } else {
        // remove the whole cookie
        // The option is spelled `expires` (see jQuery.persist.defaults);
        // the former `expire` key was never read by anything.
        options['expires'] = null;
        jQuery.cookie(options.cookie+'_keys', '', options);
        jQuery.cookie(options.cookie+'_vals', '', options);
    }
};
163 |
164 |
/**
 * Look up the persisted value of `key` in the configured context.
 *
 * @param {String} key       field name
 * @param {Object} [options] overrides for jQuery.persist.defaults
 * @returns {*} the stored value, null when nothing is stored, or false
 *          when initialisation from the cookies failed
 */
jQuery.persistedValue = function(key, options) {

    options = jQuery.extend({}, jQuery.persist.defaults, options);
    var store = jQuery.persist;

    // lazily load the cookies the first time the store is used
    if (!store.keys.length && !jQuery.persistInit(options)) {
        return false;
    }

    var fullKey = options['context'] + store.ctxsep + key;
    var pos = jQuery.inArray(fullKey, store.keys);

    if (pos == -1) {
        if (store.debug) console.log('persisted '+key+': nop');
        return null; //undefined
    }

    if (store.debug) console.log('persisted '+key+':'+ store.vals[pos]);
    return store.vals[pos];
};
184 |
/**
 * Load the key and value lists from their cookies into jQuery.persist.
 *
 * @param {Object} [options] overrides for jQuery.persist.defaults
 * @returns {Boolean} true on success; false when the two cookies hold a
 *          different number of entries, in which case both are erased and
 *          the in-memory lists are reset
 */
jQuery.persistInit = function(options) {
    if (jQuery.persist.debug) console.log('persist init ');
    options = jQuery.extend({}, jQuery.persist.defaults, options);

    // a missing cookie is treated as an empty string
    var skeys = jQuery.cookie(options.cookie+'_keys') || '';
    var svals = jQuery.cookie(options.cookie+'_vals') || '';
    jQuery.persist.keys = skeys.split(jQuery.persist.elmsep);
    jQuery.persist.vals = svals.split(jQuery.persist.elmsep);

    if (jQuery.persist.keys.length != jQuery.persist.vals.length) {
        // this should never happen
        alert('persist error - erasing');
        // The option is spelled `expires` (see jQuery.persist.defaults);
        // the former `expire` key was never read by anything.
        options['expires'] = null;
        jQuery.cookie(options.cookie+'_keys', null, options);
        jQuery.cookie(options.cookie+'_vals', null, options);
        jQuery.persist.keys = [];
        jQuery.persist.vals = [];
        return false;
    }

    if (jQuery.persist.debug) console.log(jQuery.persist.keys);
    if (jQuery.persist.debug) console.log(jQuery.persist.vals);
    return true;
};
206 |
// Shared plugin state and configuration.
jQuery.persist = {
    // NOTE(review): debug is enabled, so every persist call logs through
    // console.log -- confirm this is wanted outside development.
    debug : true,
    defaults: {
        context : 'def', // a context or namespace for each field
        replace : true, // replace existing field contents if any
        cookie : 'jqpersist', // cookies basename
        path : '/', // cookie path
        domain : null, // cookie domain
        expires : null // cookie expiry (eg 365)
    },
    // separator between entries inside the keys cookie and the vals cookie
    elmsep : '##',
    // separator between context and field name inside a key
    ctxsep : '::',
    // separator between the values of a multi-valued field
    arrsep : '//',
    // parallel arrays: vals[i] is the value stored for keys[i]
    keys : [],
    vals : []
};
223 |
--------------------------------------------------------------------------------
/gutenbergtozim/urls.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim: ai ts=4 sts=4 et sw=4 nu
4 |
5 | from __future__ import (unicode_literals, absolute_import,
6 | division, print_function)
7 |
8 | import os
9 | import platform
10 |
11 | from collections import defaultdict
12 |
13 | from gutenbergtozim.database import Book, BookFormat, Url
14 | from gutenbergtozim.utils import FORMAT_MATRIX, exec_cmd
15 | from gutenbergtozim import logger
16 |
17 | try:
18 | import urlparse
19 | except ImportError:
20 | import urllib.parse as urlparse
21 | from playhouse.csv_loader import load_csv
22 |
class UrlBuilder:

    """
    Url builder for the files of a Gutenberg book.
    Example:
        >>> builder = UrlBuilder()
        >>> builder.with_id(10023)
        >>> builder.with_base(UrlBuilder.BASE_{ONE|TWO|THREE})
        >>> url = builder.build()
    """
    SERVER_NAME = "aleph_gutenberg_org"
    RSYNC = "rsync://aleph.gutenberg.org/gutenberg/"
    BASE_ONE = 'http://aleph.gutenberg.org/'
    BASE_TWO = 'http://aleph.gutenberg.org/cache/epub/'
    BASE_THREE = 'http://aleph.gutenberg.org/etext'

    def __init__(self):
        # BASE_ONE is the default until with_base() is called.
        self.base = self.BASE_ONE

    def build(self):
        """
        Build the base url of the book, depending on the configured base.
        `BASE_ONE` generates urls according to the pattern:
            id: 10023 -> pattern: /1/0/0/2/10023
        `BASE_TWO` generates urls according to the pattern:
            id: 10023 -> pattern: /10023
        `BASE_THREE` returns the base unchanged (callers append a suffix).
        There's no implementation for the book Id's 0-10, because
        these books do not exist.

        returns: the url, or None (after logging a warning) when the
                 book id is 10 or lower
        """
        if int(self.b_id) > 10:
            if self.base == self.BASE_ONE:
                # every digit but the last becomes one directory level
                base_url = os.path.join(
                    os.path.join(*list(str(self.b_id))[:-1]), str(self.b_id))
                url = os.path.join(self.base, base_url)
            elif self.base == self.BASE_TWO:
                url = os.path.join(self.base, str(self.b_id))
            elif self.base == self.BASE_THREE:
                url = self.base

        else:
            # Adjacent literals instead of a backslash continuation, which
            # embedded the source indentation in the logged message.
            logger.warning('Figuring out the url of books '
                           'with an ID of {ID <= 10} is not implemented')
            return None

        return url

    def with_base(self, base):
        """Select the base url (one of the BASE_* constants)."""
        self.base = base

    def with_id(self, b_id):
        """Set the book id; must be set before calling build()."""
        self.b_id = b_id

    def __unicode__(self):
        # Previously called the non-existent build_url().
        return self.build()
79 |
80 |
def get_urls(book):
    """
    Collect the candidate download urls of `book`.
    param: book: The book you want the possible urls from
    returns: a dict mapping each mime type to the list of urls
             computed by `build_urls`
    """
    formats = [book_format.format for book_format in
               BookFormat.select().where(BookFormat.book == book)]

    known_mimes = FORMAT_MATRIX.values()
    candidates = []
    for fmt in formats:
        # Strip out the encoding of the file
        mime = fmt.mime.split(';')[0].strip()
        if mime not in known_mimes:
            continue
        file_name = fmt.pattern.format(id=book.id)
        candidates.append({file_name: {'mime': mime, 'id': book.id}})

    return build_urls(sort_by_mime_type(candidates))
100 |
101 |
def sort_by_mime_type(files):
    """
    Group the passed in `files` by mime type.
    `files` is a list of `{name: {'mime': ..., 'id': ...}}` dicts; the
    result is a plain dict `{mime: [{'name': ..., 'id': ...}, ...]}`.
    """
    grouped = {}
    for entry in files:
        for name, info in entry.items():
            bucket = grouped.setdefault(info['mime'], [])
            bucket.append({'name': name, 'id': info['id']})
    return grouped
113 |
114 |
def build_urls(files):
    """
    Replace, in place, the file list of each supported mime type in
    `files` with candidate urls and return `files`.
    Candidates whose path (without leading slash) is a known `Url` row are
    preferred; when none is known, all candidates are kept.
    """
    builders = {
        'application/epub+zip': build_epub,
        'application/pdf': build_pdf,
        'text/html': build_html
    }

    for mime, builder in builders.items():
        if mime not in files:
            continue

        candidates = builder(files[mime])
        known = []
        for candidate in candidates:
            path = urlparse.urlparse(candidate).path[1:]
            if Url.get_or_none(url=path):
                known.append(candidate)

        if not known and candidates:
            files[mime] = candidates
        else:
            files[mime] = known

    return files
132 |
def index_of_substring(lst, substrings):
    """
    Return the index of the first item of `lst` that contains any of
    `substrings` (using the `in` operator), or -1 when there is none.
    """
    for position, candidate in enumerate(lst):
        if any(needle in candidate for needle in substrings):
            return position
    return -1
139 |
140 |
def build_epub(files):
    """
    Build the possible urls of the epub file.
    Returns a single-element list (`<BASE_TWO url>/pg<id>.epub`), or an
    empty list when no base url can be built for the book id.
    """
    b_id = str(files[0]['id'])

    builder = UrlBuilder()
    builder.with_id(b_id)
    builder.with_base(UrlBuilder.BASE_TWO)

    base = builder.build()
    if not base:
        return []

    return [os.path.join(base, 'pg' + b_id + '.epub')]
158 |
159 |
def build_pdf(files):
    """
    Build the possible urls of the pdf files.
    Returns the de-duplicated candidates (in no particular order), or an
    empty list when no base url can be built for the book id.
    """
    b_id = str(files[0]['id'])

    cache_builder = UrlBuilder()
    cache_builder.with_base(UrlBuilder.BASE_TWO)
    cache_builder.with_id(b_id)

    main_builder = UrlBuilder()
    main_builder.with_base(UrlBuilder.BASE_ONE)
    main_builder.with_id(b_id)

    cache_base = cache_builder.build()
    if not cache_base:
        return []

    # every listed file except the "images" variants
    urls = [os.path.join(cache_base, entry['name'])
            for entry in files
            if 'images' not in entry['name']]

    # plus the usual naming schemes, the last one on the main tree
    urls.append(os.path.join(cache_base, b_id + '-' + 'pdf' + '.pdf'))
    urls.append(os.path.join(cache_base, b_id + '.pdf'))
    urls.append(os.path.join(cache_base, 'pg' + b_id + '.pdf'))
    urls.append(os.path.join(main_builder.build(), b_id + '-' + 'pdf' + '.pdf'))

    return list(set(urls))
189 |
190 |
def build_html(files):
    """
    Build the possible urls of the html files.
    param: files: list of `{'name': ..., 'id': ...}` dicts for one book
    returns: the de-duplicated candidate urls (in no particular order),
             or an empty list when no base url can be built for the id
    """
    urls = []
    b_id = str(files[0]['id'])
    file_names = [i['name'] for i in files]
    u = UrlBuilder()
    u.with_id(b_id)

    if not u.build():
        return []

    # NOTE(review): these are whole-name membership tests, so the branch
    # only runs when a file is literally named '-h.zip'; a suffix match
    # looks intended -- confirm before changing.
    if all(['-h.html' not in file_names, '-h.zip' in file_names]):
        for i in files:
            url = os.path.join(u.build(), i['name'])
            urls.append(url)

    url_zip = os.path.join(u.build(), b_id + '-h' + '.zip')
    # url_utf8 = os.path.join(u.build(), b_id + '-8' + '.zip')
    url_html = os.path.join(u.build(), b_id + '-h' + '.html')
    url_htm = os.path.join(u.build(), b_id + '-h' + '.htm')

    u.with_base(UrlBuilder.BASE_TWO)
    name = ''.join(['pg', b_id])
    html_utf8 = os.path.join(u.build(), name + '.html.utf8')

    u.with_base(UrlBuilder.BASE_THREE)
    # Search the file *names*: searching the dicts themselves tested their
    # keys ('name', 'id') and therefore never matched.  An index of -1
    # (no match) still selects the last file, as before.
    file_index = index_of_substring(file_names, ['html', 'htm'])
    file_name = file_names[file_index]
    # etext directories are suffixed 90..99 then 00..05
    etext_nums = []
    etext_nums.extend(range(90, 100))
    etext_nums.extend(range(0, 6))
    etext_names = ["{0:0=2d}".format(i) for i in etext_nums]
    etext_urls = []
    for i in etext_names:
        etext_urls.append(os.path.join(u.build() + i, file_name))

    urls.extend([url_zip, url_htm, url_html, html_utf8])
    urls.extend(etext_urls)
    return list(set(urls))
232 |
def setup_urls():
    """
    Fill the `Url` table with the file listing of the rsync mirror.
    The listing is written to `tmp/file_on_<SERVER_NAME>`, rewritten in
    place with sed, then loaded with `load_csv`.
    """

    file_with_url = os.path.join("tmp", "file_on_{}".format(UrlBuilder.SERVER_NAME))
    # run through bash so the shell redirection (>) is honoured
    cmd = ["bash", "-c", "rsync -a --list-only {} > {}".format(UrlBuilder.RSYNC, file_with_url)]
    exec_cmd(cmd)
    # on Darwin, sed's -i is given an explicit backup suffix
    in_place_opt = ["-i", ".bak"] if platform.system() == "Darwin" else ["-i"]
    # Presumably meant to keep only the last space-separated field of each
    # line (the path).
    # NOTE(review): in a raw string `\\1` reaches sed as two backslashes
    # followed by 1 -- confirm this is the intended back-reference.
    cmd = ["sed"] + in_place_opt + [r"s#.* \(.*\)$#\\1#", file_with_url]
    exec_cmd(cmd)

    field_names = ['url']
    load_csv(Url, file_with_url, field_names=field_names)
244 |
245 |
# Manual smoke test: print the candidate urls of one book.
# NOTE(review): UrlBuilder.build() returns None for ids of 10 or lower, so
# the url lists for book 9 will be empty -- confirm the id is intentional.
if __name__ == '__main__':
    book = Book.get(id=9)
    print(get_urls(book))
249 |
--------------------------------------------------------------------------------
/gutenbergtozim/templates/css/grids-responsive-min.css:
--------------------------------------------------------------------------------
1 | /*!
2 | Pure v0.5.0
3 | Copyright 2014 Yahoo! Inc. All rights reserved.
4 | Licensed under the BSD License.
5 | https://github.com/yui/pure/blob/master/LICENSE.md
6 | */
7 | @media screen and (min-width:35.5em){.pure-u-sm-1,.pure-u-sm-1-1,.pure-u-sm-1-2,.pure-u-sm-1-3,.pure-u-sm-2-3,.pure-u-sm-1-4,.pure-u-sm-3-4,.pure-u-sm-1-5,.pure-u-sm-2-5,.pure-u-sm-3-5,.pure-u-sm-4-5,.pure-u-sm-5-5,.pure-u-sm-1-6,.pure-u-sm-5-6,.pure-u-sm-1-8,.pure-u-sm-3-8,.pure-u-sm-5-8,.pure-u-sm-7-8,.pure-u-sm-1-12,.pure-u-sm-5-12,.pure-u-sm-7-12,.pure-u-sm-11-12,.pure-u-sm-1-24,.pure-u-sm-2-24,.pure-u-sm-3-24,.pure-u-sm-4-24,.pure-u-sm-5-24,.pure-u-sm-6-24,.pure-u-sm-7-24,.pure-u-sm-8-24,.pure-u-sm-9-24,.pure-u-sm-10-24,.pure-u-sm-11-24,.pure-u-sm-12-24,.pure-u-sm-13-24,.pure-u-sm-14-24,.pure-u-sm-15-24,.pure-u-sm-16-24,.pure-u-sm-17-24,.pure-u-sm-18-24,.pure-u-sm-19-24,.pure-u-sm-20-24,.pure-u-sm-21-24,.pure-u-sm-22-24,.pure-u-sm-23-24,.pure-u-sm-24-24{display:inline-block;*display:inline;zoom:1;letter-spacing:normal;word-spacing:normal;vertical-align:top;text-rendering:auto}.pure-u-sm-1-24{width:4.1667%;*width:4.1357%}.pure-u-sm-1-12,.pure-u-sm-2-24{width:8.3333%;*width:8.3023%}.pure-u-sm-1-8,.pure-u-sm-3-24{width:12.5%;*width:12.469%}.pure-u-sm-1-6,.pure-u-sm-4-24{width:16.6667%;*width:16.6357%}.pure-u-sm-1-5{width:20%;*width:19.969%}.pure-u-sm-5-24{width:20.8333%;*width:20.8023%}.pure-u-sm-1-4,.pure-u-sm-6-24{width:25%;*width:24.969%}.pure-u-sm-7-24{width:29.1667%;*width:29.1357%}.pure-u-sm-1-3,.pure-u-sm-8-24{width:33.3333%;*width:33.3023%}.pure-u-sm-3-8,.pure-u-sm-9-24{width:37.5%;*width:37.469%}.pure-u-sm-2-5{width:40%;*width:39.969%}.pure-u-sm-5-12,.pure-u-sm-10-24{width:41.6667%;*width:41.6357%}.pure-u-sm-11-24{width:45.8333%;*width:45.8023%}.pure-u-sm-1-2,.pure-u-sm-12-24{width:50%;*width:49.969%}.pure-u-sm-13-24{width:54.1667%;*width:54.1357%}.pure-u-sm-7-12,.pure-u-sm-14-24{width:58.3333%;*width:58.3023%}.pure-u-sm-3-5{width:60%;*width:59.969%}.pure-u-sm-5-8,.pure-u-sm-15-24{width:62.5%;*width:62.469%}.pure-u-sm-2-3,.pure-u-sm-16-24{width:66.6667%;*width:66.6357%}.pure-u-sm-17-24{width:70.8333%;*width:70.8023%}.pure-u-sm-3-4,.pure-u-sm-18-24{wid
th:75%;*width:74.969%}.pure-u-sm-19-24{width:79.1667%;*width:79.1357%}.pure-u-sm-4-5{width:80%;*width:79.969%}.pure-u-sm-5-6,.pure-u-sm-20-24{width:83.3333%;*width:83.3023%}.pure-u-sm-7-8,.pure-u-sm-21-24{width:87.5%;*width:87.469%}.pure-u-sm-11-12,.pure-u-sm-22-24{width:91.6667%;*width:91.6357%}.pure-u-sm-23-24{width:95.8333%;*width:95.8023%}.pure-u-sm-1,.pure-u-sm-1-1,.pure-u-sm-5-5,.pure-u-sm-24-24{width:100%}}@media screen and (min-width:48em){.pure-u-md-1,.pure-u-md-1-1,.pure-u-md-1-2,.pure-u-md-1-3,.pure-u-md-2-3,.pure-u-md-1-4,.pure-u-md-3-4,.pure-u-md-1-5,.pure-u-md-2-5,.pure-u-md-3-5,.pure-u-md-4-5,.pure-u-md-5-5,.pure-u-md-1-6,.pure-u-md-5-6,.pure-u-md-1-8,.pure-u-md-3-8,.pure-u-md-5-8,.pure-u-md-7-8,.pure-u-md-1-12,.pure-u-md-5-12,.pure-u-md-7-12,.pure-u-md-11-12,.pure-u-md-1-24,.pure-u-md-2-24,.pure-u-md-3-24,.pure-u-md-4-24,.pure-u-md-5-24,.pure-u-md-6-24,.pure-u-md-7-24,.pure-u-md-8-24,.pure-u-md-9-24,.pure-u-md-10-24,.pure-u-md-11-24,.pure-u-md-12-24,.pure-u-md-13-24,.pure-u-md-14-24,.pure-u-md-15-24,.pure-u-md-16-24,.pure-u-md-17-24,.pure-u-md-18-24,.pure-u-md-19-24,.pure-u-md-20-24,.pure-u-md-21-24,.pure-u-md-22-24,.pure-u-md-23-24,.pure-u-md-24-24{display:inline-block;*display:inline;zoom:1;letter-spacing:normal;word-spacing:normal;vertical-align:top;text-rendering:auto}.pure-u-md-1-24{width:4.1667%;*width:4.1357%}.pure-u-md-1-12,.pure-u-md-2-24{width:8.3333%;*width:8.3023%}.pure-u-md-1-8,.pure-u-md-3-24{width:12.5%;*width:12.469%}.pure-u-md-1-6,.pure-u-md-4-24{width:16.6667%;*width:16.6357%}.pure-u-md-1-5{width:20%;*width:19.969%}.pure-u-md-5-24{width:20.8333%;*width:20.8023%}.pure-u-md-1-4,.pure-u-md-6-24{width:25%;*width:24.969%}.pure-u-md-7-24{width:29.1667%;*width:29.1357%}.pure-u-md-1-3,.pure-u-md-8-24{width:33.3333%;*width:33.3023%}.pure-u-md-3-8,.pure-u-md-9-24{width:37.5%;*width:37.469%}.pure-u-md-2-5{width:40%;*width:39.969%}.pure-u-md-5-12,.pure-u-md-10-24{width:41.6667%;*width:41.6357%}.pure-u-md-11-24{width:45.8333%;*width:45.8023%}.pu
re-u-md-1-2,.pure-u-md-12-24{width:50%;*width:49.969%}.pure-u-md-13-24{width:54.1667%;*width:54.1357%}.pure-u-md-7-12,.pure-u-md-14-24{width:58.3333%;*width:58.3023%}.pure-u-md-3-5{width:60%;*width:59.969%}.pure-u-md-5-8,.pure-u-md-15-24{width:62.5%;*width:62.469%}.pure-u-md-2-3,.pure-u-md-16-24{width:66.6667%;*width:66.6357%}.pure-u-md-17-24{width:70.8333%;*width:70.8023%}.pure-u-md-3-4,.pure-u-md-18-24{width:75%;*width:74.969%}.pure-u-md-19-24{width:79.1667%;*width:79.1357%}.pure-u-md-4-5{width:80%;*width:79.969%}.pure-u-md-5-6,.pure-u-md-20-24{width:83.3333%;*width:83.3023%}.pure-u-md-7-8,.pure-u-md-21-24{width:87.5%;*width:87.469%}.pure-u-md-11-12,.pure-u-md-22-24{width:91.6667%;*width:91.6357%}.pure-u-md-23-24{width:95.8333%;*width:95.8023%}.pure-u-md-1,.pure-u-md-1-1,.pure-u-md-5-5,.pure-u-md-24-24{width:100%}}@media screen and (min-width:64em){.pure-u-lg-1,.pure-u-lg-1-1,.pure-u-lg-1-2,.pure-u-lg-1-3,.pure-u-lg-2-3,.pure-u-lg-1-4,.pure-u-lg-3-4,.pure-u-lg-1-5,.pure-u-lg-2-5,.pure-u-lg-3-5,.pure-u-lg-4-5,.pure-u-lg-5-5,.pure-u-lg-1-6,.pure-u-lg-5-6,.pure-u-lg-1-8,.pure-u-lg-3-8,.pure-u-lg-5-8,.pure-u-lg-7-8,.pure-u-lg-1-12,.pure-u-lg-5-12,.pure-u-lg-7-12,.pure-u-lg-11-12,.pure-u-lg-1-24,.pure-u-lg-2-24,.pure-u-lg-3-24,.pure-u-lg-4-24,.pure-u-lg-5-24,.pure-u-lg-6-24,.pure-u-lg-7-24,.pure-u-lg-8-24,.pure-u-lg-9-24,.pure-u-lg-10-24,.pure-u-lg-11-24,.pure-u-lg-12-24,.pure-u-lg-13-24,.pure-u-lg-14-24,.pure-u-lg-15-24,.pure-u-lg-16-24,.pure-u-lg-17-24,.pure-u-lg-18-24,.pure-u-lg-19-24,.pure-u-lg-20-24,.pure-u-lg-21-24,.pure-u-lg-22-24,.pure-u-lg-23-24,.pure-u-lg-24-24{display:inline-block;*display:inline;zoom:1;letter-spacing:normal;word-spacing:normal;vertical-align:top;text-rendering:auto}.pure-u-lg-1-24{width:4.1667%;*width:4.1357%}.pure-u-lg-1-12,.pure-u-lg-2-24{width:8.3333%;*width:8.3023%}.pure-u-lg-1-8,.pure-u-lg-3-24{width:12.5%;*width:12.469%}.pure-u-lg-1-6,.pure-u-lg-4-24{width:16.6667%;*width:16.6357%}.pure-u-lg-1-5{width:20%;*width:19.969%}.pure-u-lg-5-2
4{width:20.8333%;*width:20.8023%}.pure-u-lg-1-4,.pure-u-lg-6-24{width:25%;*width:24.969%}.pure-u-lg-7-24{width:29.1667%;*width:29.1357%}.pure-u-lg-1-3,.pure-u-lg-8-24{width:33.3333%;*width:33.3023%}.pure-u-lg-3-8,.pure-u-lg-9-24{width:37.5%;*width:37.469%}.pure-u-lg-2-5{width:40%;*width:39.969%}.pure-u-lg-5-12,.pure-u-lg-10-24{width:41.6667%;*width:41.6357%}.pure-u-lg-11-24{width:45.8333%;*width:45.8023%}.pure-u-lg-1-2,.pure-u-lg-12-24{width:50%;*width:49.969%}.pure-u-lg-13-24{width:54.1667%;*width:54.1357%}.pure-u-lg-7-12,.pure-u-lg-14-24{width:58.3333%;*width:58.3023%}.pure-u-lg-3-5{width:60%;*width:59.969%}.pure-u-lg-5-8,.pure-u-lg-15-24{width:62.5%;*width:62.469%}.pure-u-lg-2-3,.pure-u-lg-16-24{width:66.6667%;*width:66.6357%}.pure-u-lg-17-24{width:70.8333%;*width:70.8023%}.pure-u-lg-3-4,.pure-u-lg-18-24{width:75%;*width:74.969%}.pure-u-lg-19-24{width:79.1667%;*width:79.1357%}.pure-u-lg-4-5{width:80%;*width:79.969%}.pure-u-lg-5-6,.pure-u-lg-20-24{width:83.3333%;*width:83.3023%}.pure-u-lg-7-8,.pure-u-lg-21-24{width:87.5%;*width:87.469%}.pure-u-lg-11-12,.pure-u-lg-22-24{width:91.6667%;*width:91.6357%}.pure-u-lg-23-24{width:95.8333%;*width:95.8023%}.pure-u-lg-1,.pure-u-lg-1-1,.pure-u-lg-5-5,.pure-u-lg-24-24{width:100%}}@media screen and 
(min-width:80em){.pure-u-xl-1,.pure-u-xl-1-1,.pure-u-xl-1-2,.pure-u-xl-1-3,.pure-u-xl-2-3,.pure-u-xl-1-4,.pure-u-xl-3-4,.pure-u-xl-1-5,.pure-u-xl-2-5,.pure-u-xl-3-5,.pure-u-xl-4-5,.pure-u-xl-5-5,.pure-u-xl-1-6,.pure-u-xl-5-6,.pure-u-xl-1-8,.pure-u-xl-3-8,.pure-u-xl-5-8,.pure-u-xl-7-8,.pure-u-xl-1-12,.pure-u-xl-5-12,.pure-u-xl-7-12,.pure-u-xl-11-12,.pure-u-xl-1-24,.pure-u-xl-2-24,.pure-u-xl-3-24,.pure-u-xl-4-24,.pure-u-xl-5-24,.pure-u-xl-6-24,.pure-u-xl-7-24,.pure-u-xl-8-24,.pure-u-xl-9-24,.pure-u-xl-10-24,.pure-u-xl-11-24,.pure-u-xl-12-24,.pure-u-xl-13-24,.pure-u-xl-14-24,.pure-u-xl-15-24,.pure-u-xl-16-24,.pure-u-xl-17-24,.pure-u-xl-18-24,.pure-u-xl-19-24,.pure-u-xl-20-24,.pure-u-xl-21-24,.pure-u-xl-22-24,.pure-u-xl-23-24,.pure-u-xl-24-24{display:inline-block;*display:inline;zoom:1;letter-spacing:normal;word-spacing:normal;vertical-align:top;text-rendering:auto}.pure-u-xl-1-24{width:4.1667%;*width:4.1357%}.pure-u-xl-1-12,.pure-u-xl-2-24{width:8.3333%;*width:8.3023%}.pure-u-xl-1-8,.pure-u-xl-3-24{width:12.5%;*width:12.469%}.pure-u-xl-1-6,.pure-u-xl-4-24{width:16.6667%;*width:16.6357%}.pure-u-xl-1-5{width:20%;*width:19.969%}.pure-u-xl-5-24{width:20.8333%;*width:20.8023%}.pure-u-xl-1-4,.pure-u-xl-6-24{width:25%;*width:24.969%}.pure-u-xl-7-24{width:29.1667%;*width:29.1357%}.pure-u-xl-1-3,.pure-u-xl-8-24{width:33.3333%;*width:33.3023%}.pure-u-xl-3-8,.pure-u-xl-9-24{width:37.5%;*width:37.469%}.pure-u-xl-2-5{width:40%;*width:39.969%}.pure-u-xl-5-12,.pure-u-xl-10-24{width:41.6667%;*width:41.6357%}.pure-u-xl-11-24{width:45.8333%;*width:45.8023%}.pure-u-xl-1-2,.pure-u-xl-12-24{width:50%;*width:49.969%}.pure-u-xl-13-24{width:54.1667%;*width:54.1357%}.pure-u-xl-7-12,.pure-u-xl-14-24{width:58.3333%;*width:58.3023%}.pure-u-xl-3-5{width:60%;*width:59.969%}.pure-u-xl-5-8,.pure-u-xl-15-24{width:62.5%;*width:62.469%}.pure-u-xl-2-3,.pure-u-xl-16-24{width:66.6667%;*width:66.6357%}.pure-u-xl-17-24{width:70.8333%;*width:70.8023%}.pure-u-xl-3-4,.pure-u-xl-18-24{width:75%;*width:74.969%}.p
ure-u-xl-19-24{width:79.1667%;*width:79.1357%}.pure-u-xl-4-5{width:80%;*width:79.969%}.pure-u-xl-5-6,.pure-u-xl-20-24{width:83.3333%;*width:83.3023%}.pure-u-xl-7-8,.pure-u-xl-21-24{width:87.5%;*width:87.469%}.pure-u-xl-11-12,.pure-u-xl-22-24{width:91.6667%;*width:91.6357%}.pure-u-xl-23-24{width:95.8333%;*width:95.8023%}.pure-u-xl-1,.pure-u-xl-1-1,.pure-u-xl-5-5,.pure-u-xl-24-24{width:100%}}
--------------------------------------------------------------------------------
/gutenbergtozim/download.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # vim: ai ts=4 sts=4 et sw=4 nu
4 |
5 | from __future__ import (unicode_literals, absolute_import,
6 | division, print_function)
7 | import os
8 | import tempfile
9 | import zipfile
10 | from pprint import pprint as pp
11 | from multiprocessing.dummy import Pool
12 |
13 | import requests
14 | from path import Path as path
15 |
16 | from gutenbergtozim import logger, TMP_FOLDER
17 | from gutenbergtozim.urls import get_urls
18 | from gutenbergtozim.database import BookFormat, Format, Book
19 | from gutenbergtozim.export import get_list_of_filtered_books, fname_for
20 | from gutenbergtozim.utils import download_file, FORMAT_MATRIX, ensure_unicode
21 |
# Base URL (on aleph.gutenberg.org) from which book cover images are fetched;
# download_covers() builds `<IMAGE_BASE><id>/pg<id>.cover.medium.jpg` from it.
IMAGE_BASE = 'http://aleph.gutenberg.org/cache/epub/'
23 |
def resource_exists(url):
    """Tell whether a GET request on `url` answers with HTTP 200.

    The request is issued with ``stream=True`` so the body is not downloaded
    just to read the status code.

    :param url: URL to probe.
    :return: True when the response status equals ``requests.codes.ok``,
        False otherwise.
    :raises requests.RequestException: connection errors and timeouts
        (20 seconds) are not caught here and propagate to the caller.
    """
    r = requests.get(url, stream=True, timeout=20)  # in seconds
    try:
        return r.status_code == requests.codes.ok
    finally:
        # A streamed response keeps its connection checked out until the body
        # is consumed or the response is closed; we never read the body, so
        # close explicitly to hand the connection back.
        r.close()
27 |
28 |
def handle_zipped_epub(zippath,
                       book,
                       download_cache):
    """Extract a zipped book and move its files into `download_cache`.

    The archive is extracted into a temporary folder created inside
    TMP_FOLDER, then every extracted member that has a file extension is
    moved (flattened to its base name) into `download_cache`:

    * an HTML file (``.html``/``.htm``) becomes ``<book.id>.html``, unless the
      archive holds several HTML files, in which case only the one whose name
      starts with ``<book.id>-h.`` gets that name and the others are stored
      as ``<book.id>_<name>``;
    * any other file is stored as ``<book.id>_<name>``.

    The temporary folder is removed on every exit path.

    :param zippath: path of the ZIP file to process.
    :param book: object exposing an ``id`` attribute used to name the files.
    :param download_cache: folder receiving the extracted files.
    :return: False when the archive lists a member name that could resolve
        outside the extraction folder; None otherwise (including when
        `zippath` is not a valid ZIP file, in which case nothing is done).
    :raises Exception: whatever the move of an extracted file raises, after
        the error and its traceback have been printed.
    """

    def is_safe(fname):
        # A member name is unsafe when it is absolute, carries a drive prefix
        # or contains a `..` component: such a name, once joined to the
        # temporary folder below, could designate a file outside of it.
        normalized = fname.replace("\\", "/")
        if normalized.startswith("/") or os.path.splitdrive(normalized)[0]:
            return False
        return ".." not in normalized.split("/")

    # create temp directory to extract to
    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
    try:
        try:
            with zipfile.ZipFile(zippath, 'r') as zf:
                zipped_files = zf.namelist()

                # check that there is no insecure data (absolute names)
                if not all(is_safe(ensure_unicode(n)) for n in zipped_files):
                    return False

                # extract files from zip
                zf.extractall(tmpd)
        except zipfile.BadZipfile:
            # file is not a zip file when it should be.
            # don't process it anymore as we don't know what to do.
            # could this be due to an incorrect/incomplete download?
            return

        # are there multiple HTML files in the ZIP? (rare)
        mhtml = sum(1 for f in zipped_files
                    if f.endswith(('html', '.htm'))) > 1

        # move all extracted files to proper locations
        for fname in zipped_files:
            # skip folders (and any member without a file extension)
            if not path(fname).ext:
                continue

            src = os.path.join(tmpd, fname)
            if not os.path.exists(src):
                continue

            fname = path(fname).basename()
            if fname.endswith(('.html', '.htm')):
                if mhtml and not fname.startswith("{}-h.".format(book.id)):
                    # secondary HTML file of a multi-HTML archive
                    target = "{bid}_{fname}".format(bid=book.id, fname=fname)
                else:
                    target = "{bid}.html".format(bid=book.id)
            else:
                target = "{bid}_{fname}".format(bid=book.id, fname=fname)
            dst = os.path.join(download_cache, target)

            try:
                path(src).move(dst)
            except Exception as e:
                import traceback
                print(e)
                print(traceback.format_exc())
                raise
    finally:
        # delete temp directory, whatever happened above
        path(tmpd).rmtree_p()
104 |
105 |
def download_book(book, download_cache, languages, formats, force):
    """Download the content files of `book` into `download_cache`.

    For each requested format (HTML is always added) the matching BookFormat
    row is looked up, candidate URLs are gathered (the URL recorded in the
    row, or those returned by get_urls()) and downloaded. ZIP downloads are
    handed to handle_zipped_epub(). Each URL that downloads successfully is
    stored in ``BookFormat.downloaded_from``.

    :param book: Book row to download files for.
    :param download_cache: folder receiving the downloaded files.
    :param languages: not used by this function.
    :param formats: iterable of format names; every key of FORMAT_MATRIX is
        used when empty or None. The passed object is never modified.
    :param force: when true, re-download existing files and ignore the URL
        already recorded in the database.
    :return: None.
    """
    logger.info("\tDownloading content files for Book #{id}"
                .format(id=book.id))

    # apply filters. Always work on a private list: the caller's list is
    # shared between worker threads and must not be mutated, and
    # `FORMAT_MATRIX.keys()` is not guaranteed to be a list supporting append.
    formats = list(formats) if formats else list(FORMAT_MATRIX.keys())

    # HTML is our base for ZIM so add it if not present
    if 'html' not in formats:
        formats.append('html')

    for fmt in formats:

        fpath = os.path.join(download_cache, fname_for(book, fmt))

        # check if already downloaded
        if path(fpath).exists() and not force:
            logger.debug("\t\t{fmt} already exists at {path}"
                         .format(fmt=fmt, path=fpath))
            continue

        # retrieve corresponding BookFormat
        bfs = BookFormat.filter(book=book)

        if fmt == 'html':
            patterns = ['mnsrb10h.htm', '8ledo10h.htm', 'tycho10f.htm',
                        '8ledo10h.zip', 'salme10h.htm', '8nszr10h.htm',
                        '{id}-h.html', '{id}.html.gen', '{id}-h.htm',
                        '8regr10h.zip', '{id}.html.noimages',
                        '8lgme10h.htm', 'tycho10h.htm', 'tycho10h.zip',
                        '8lgme10h.zip', '8indn10h.zip', '8resp10h.zip',
                        '20004-h.htm', '8indn10h.htm', '8memo10h.zip',
                        'fondu10h.zip', '{id}-h.zip', '8mort10h.zip']
            bfso = bfs
            bfs = bfs.join(Format).filter(Format.pattern << patterns)
            if not bfs.count():
                # dump both the filtered and unfiltered formats for diagnosis
                pp(list([
                    (b.format.mime, b.format.images, b.format.pattern)
                    for b in bfs]))
                pp(list([
                    (b.format.mime, b.format.images, b.format.pattern)
                    for b in bfso]))
                logger.error("html not found")
                continue
        else:
            bfs = bfs.filter(BookFormat.format << Format.filter(
                mime=FORMAT_MATRIX.get(fmt)))

        if not bfs.count():
            logger.debug("[{}] not avail. for #{}# {}"
                         .format(fmt, book.id, book.title).encode("utf-8"))
            continue

        if bfs.count() > 1:
            # several candidates: prefer one flagged as having images
            try:
                bf = bfs.join(Format).filter(Format.images).get()
            except Exception:
                bf = bfs.get()
        else:
            bf = bfs.get()

        logger.debug("[{}] Requesting URLs for #{}# {}"
                     .format(fmt, book.id, book.title).encode("utf-8"))

        # retrieve list of URLs for format unless we have it in DB
        if bf.downloaded_from and not force:
            urls = [bf.downloaded_from]
        else:
            urld = get_urls(book)
            # reversed so that pop() below walks the list in original order;
            # a missing entry for this mime type yields no candidate at all
            urls = list(reversed(urld.get(FORMAT_MATRIX.get(fmt)) or []))

        allurls = list(urls)

        while urls:
            url = urls.pop()

            # with a single candidate, skip the existence probe and let the
            # download itself succeed or fail
            if len(allurls) != 1 and not resource_exists(url):
                continue

            # HTML files are *sometime* available as ZIP files
            if url.endswith('.zip'):
                zpath = "{}.zip".format(fpath)

                if not download_file(url, zpath):
                    logger.error("ZIP file donwload failed: {}"
                                 .format(zpath))
                    continue

                # extract zipfile
                handle_zipped_epub(zippath=zpath, book=book,
                                   download_cache=download_cache)
            else:
                if not download_file(url, fpath):
                    logger.error("file donwload failed: {}".format(fpath))
                    continue

            # store working URL in DB
            bf.downloaded_from = url
            bf.save()

        if not bf.downloaded_from:
            logger.error("NO FILE FOR #{}/{}".format(book.id, fmt))
            pp(allurls)
213 |
214 |
def download_covers(book, download_cache):
    """Fetch the medium-size cover image of `book` into `download_cache`.

    The image is saved as ``<book.id>_cover.jpg``; its URL is built from
    IMAGE_BASE and the book id.

    :param book: Book row whose cover is wanted.
    :param download_cache: folder receiving the cover file.
    :return: always True.
    """
    fpath = os.path.join(download_cache, '{}_cover.jpg'.format(book.id))

    # NOTE(review): the branch below tests the truthiness of the query object
    # itself, not the stored `cover_page` value -- confirm this is intended.
    cover_query = Book.select(Book.cover_page).where(Book.id == book.id)
    if not cover_query:
        logger.debug('No Book Cover found for Book #{}'.format(book.id))
        return True

    cover_url = '{}{}/pg{}.cover.medium.jpg'.format(
        IMAGE_BASE, book.id, book.id)
    logger.debug('Downloading {}'.format(cover_url))
    download_file(cover_url, fpath)
    return True
226 |
227 |
def download_all_books(download_cache, concurrency,
                       languages=None, formats=None,
                       only_books=None, force=False):
    """Download content files, then covers, for every selected book.

    Books are selected with get_list_of_filtered_books() and processed by a
    pool of `concurrency` worker threads: first download_book() for all
    books, then download_covers() for all books.

    :param download_cache: folder receiving the files (created if missing).
    :param concurrency: number of worker threads.
    :param languages: language filter; defaults to an empty list.
    :param formats: format filter; defaults to an empty list.
    :param only_books: book filter; defaults to an empty list.
    :param force: forwarded to download_book().
    :return: None.
    """
    # None stands for "no filter"; a fresh list per call avoids sharing one
    # mutable default object between calls
    languages = [] if languages is None else languages
    formats = [] if formats is None else formats
    only_books = [] if only_books is None else only_books

    available_books = get_list_of_filtered_books(
        languages=languages,
        formats=formats,
        only_books=only_books)

    # ensure dir exist
    path(download_cache).mkdir_p()

    def dlb(b):
        return download_book(b, download_cache, languages, formats, force)

    def dlb_covers(b):
        return download_covers(b, download_cache)

    pool = Pool(concurrency)
    try:
        pool.map(dlb, available_books)
        pool.map(dlb_covers, available_books)
    finally:
        # release the worker threads instead of leaving them idle
        pool.close()
        pool.join()
245 |
--------------------------------------------------------------------------------