├── .gitignore ├── requirements.txt ├── cron.sh ├── free.sh ├── config.py.example ├── node.py ├── README.md ├── status.py ├── book.py ├── kindle.py ├── free_book.py ├── amz.py └── generator.py /.gitignore: -------------------------------------------------------------------------------- 1 | kindle.json 2 | .idea 3 | __pycache__ 4 | venv 5 | config.py 6 | cache 7 | page 8 | data 9 | *.pyc 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.5.1 2 | bottlenose==1.0.1 3 | lxml==3.5.0 4 | python-amazon-simple-product-api==2.1.0 5 | python-dateutil==2.5.3 6 | requests==2.9.1 7 | six==1.10.0 8 | -------------------------------------------------------------------------------- /cron.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $(date) 4 | 5 | PWD="$(dirname $0)" 6 | 7 | echo "$PWD" 8 | 9 | cd "$PWD" || exit 1 10 | 11 | PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python kindle.py 12 | -------------------------------------------------------------------------------- /free.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $(date) 4 | 5 | PWD="$(dirname $0)" 6 | 7 | echo "$PWD" 8 | 9 | cd "$PWD" || exit 1 10 | 11 | PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python free_book.py 12 | -------------------------------------------------------------------------------- /config.py.example: -------------------------------------------------------------------------------- 1 | user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36' 2 | header = {'User-Agent': user_agent} 3 | 4 | KEY_ID = "xxx" 5 | SECRET_KEY = "xxx" 6 | TAG = "xxx" 7 | -------------------------------------------------------------------------------- /node.py: -------------------------------------------------------------------------------- 1 | 2 | class Node: 3 | node = None # ancestor 4 | node_id = None 5 | is_root = False 6 | name = None 7 | 8 | def __init__(self, o=None): 9 | if o is None: 10 | o = dict() 11 | self.__dict__ = o 12 | 13 | if 'node' in o: 14 | self.node = Node(o['node']) 15 | 16 | def dump(self): 17 | return clean_dict(self.__dict__) 18 | 19 | def tuple(self): 20 | return ( 21 | self.node_id, self.name, self.is_root 22 | ) 23 | 24 | 25 | def clean_dict(d): 26 | if not isinstance(d, dict): 27 | return d 28 | return dict((k, clean_dict(v)) for k, v in d.items() if v is not None) 29 | 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kindle 2 | 3 | ## 配置 4 | 5 | 参考 `config.py.example` ,修改 `config.py` 文件,填写 `API key`, 请在 [Amazon](https://console.aws.amazon.com/iam/home#security_credential 6 | ) 获取。 7 | 8 | ```shell 9 | AWS_ACCESS_KEY_ID = "xxx" 10 | AWS_SECRET_ACCESS_KEY = "xxx" 11 | AWS_ASSOCIATE_TAG = "xxx" 12 | ``` 13 | 14 | ## 运行 15 | 16 | ```shell 17 | virtualenv -p python3 venv 18 | source venv/bin/activate 19 | pip install -r requirements.txt -I 20 | python kindle.py 21 | ``` 22 | 23 | ## 获取免费电子书数据 24 | 25 | ```shell 26 | python free_book.py 27 | ``` 28 | 29 | ## 生成数据库 30 | 31 | ```shell 32 | python generator.py 33 | ``` 34 | 35 | **crontab 定时任务** 36 | 37 | ```shell 38 | 5 0 * * * /path/to/kindle/cron.sh >> /var/log/kindle.log 2>&1 39 | ``` 40 | -------------------------------------------------------------------------------- /status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import hashlib 3 | import json 4 | import os 5 | import time 6 | 7 | from datetime import datetime 8 | from functools import partial 9 | 10 | data_file = "data/status.json" 11 | 12 | 13 | class Status: 14 | version = 0 15 | count = 0 16 | new_count = 0 17 | timestamp = None 18 | md5 = None 19 | 20 | def __init__(self): 21 | if not os.path.exists(data_file): 22 | return 23 | with open(data_file) as f: 24 | self.__dict__ = json.loads(f.read()) 25 | 26 | def dump(self): 27 | print(self.version, self.timestamp) 28 | 29 | def json(self): 30 | return {"version": self.version, "count": self.count, "new_count": self.new_count, "timestamp": self.timestamp, 31 | 'md5': self.md5} 32 | 33 | def update(self, file): 34 | self.timestamp = int(time.mktime(datetime.now().utctimetuple())) 35 | self.md5 = md5sum(file) 36 | with open(data_file, "w") as f: 37 | f.write(json.dumps(self.json())) 38 | 39 | def to_list(self): 40 | if not self.timestamp: 41 | self.timestamp = int(time.mktime(datetime.now().utctimetuple())) 42 | return [self.version, self.count, self.new_count, self.timestamp] 43 | 44 | def bump(self): 45 | self.version += 1 46 | 47 | 48 | def md5sum(filename): 49 | with open(filename, mode='rb') as f: 50 | d = hashlib.md5() 51 | for buf in iter(partial(f.read, 128), b''): 52 | d.update(buf) 53 | return d.hexdigest() 54 | -------------------------------------------------------------------------------- /book.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from node import Node 4 | 5 | 6 | class Book: 7 | title = '' 8 | average = 0 9 | price = 0 10 | author = '' 11 | min = 0 12 | score = 0 13 | url = '' 14 | min_day = '' 15 | 16 | item_id = None 17 | pages = None 18 | publisher = None 19 | brand = None 20 | asin = None 21 | binding = None 22 | edition = None 23 | editorial_review = None 24 | isbn = None 25 | large_image_url = None 26 | medium_image_url = None 27 | small_image_url = None 28 | region = None 29 | release_date = None 30 | publication_date = None 31 | sales_rank = None 32 | languages = None 33 | nodes = None 34 | 35 | def __init__(self, o=None): 36 | if o is None: 37 | o = dict() 38 | self.__dict__ = o 39 | nodes = [] 40 | if 'nodes' in o: 41 | for n in o['nodes']: 42 | node = Node(n) 43 | nodes.append(node) 44 | self.nodes = nodes 45 | 46 | def json(self): 47 | return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True) 48 | 49 | def dump(self): 50 | return clean_dict(self.__dict__) 51 | 52 | def tuple(self): 53 | languages = None 54 | if self.languages and len(self.languages) > 0: 55 | languages = self.languages[0] 56 | return ( 57 | self.title, self.author, self.score, self.url, self.item_id, self.pages, self.publisher, self.brand, 58 | self.asin, self.edition, self.isbn, self.large_image_url, self.medium_image_url, self.small_image_url, 59 | self.region, self.release_date, self.publication_date, languages 60 | ) 61 | 62 | 63 | def clean_dict(d): 64 | if not isinstance(d, dict): 65 | return d 66 | return dict((k, clean_dict(v)) for k, v in d.items() if v is not None) 67 | -------------------------------------------------------------------------------- /kindle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import io 3 | import json 4 | import re 5 | 6 | import requests 7 | from bs4 import Tag 8 | 9 | import config 10 | from book import Book 11 | 12 | 13 | def fetch(url, headers, cookies): 14 | r = requests.get(url, headers=headers, cookies=cookies) 15 | from bs4 import BeautifulSoup 16 | import lxml 17 | 18 | bs = BeautifulSoup(r.text, lxml.__name__) 19 | 20 | time = re.match('数据更新于:(.*)', bs.find('span', style='color:#FFF9A8').text).group(1) 21 | 22 | kindle = {'time': time, 'books': []} 23 | 24 | book_items = bs.find_all('div', style='margin-bottom: 0.9em;') 25 | 26 | for book_item in book_items: 27 | 28 | book = Book() 29 | 30 | if isinstance(book_item, Tag): 31 | a = book_item.find('a') 32 | min_day = book_item.find('span', title=re.compile('最近在')).get('title') 33 | book.min_day = re.match('最近在(.*)达到最低价', min_day).group(1) 34 | 35 | if isinstance(a, Tag): 36 | book.url = 'https' + re.match('http(.*)/ref', a.get('href')).group(1) 37 | book.item_id = re.match('.*product/(.*)/ref', a.get('href')).group(1) 38 | book.title = a.get('title') 39 | 40 | matches = re.match('.*历史均价:¥(\d+\.*\d*),现价:¥(\d+\.*\d*)作者:(.*),评分:(\d+\.*\d*),历史最低价:¥(\d+\.*\d*)', 41 | book_item.text) 42 | 43 | book.average = matches.group(1) 44 | book.price = matches.group(2) 45 | book.author = matches.group(3) 46 | book.score = matches.group(4) 47 | book.min = matches.group(5) 48 | 49 | import amz 50 | amz.lookup(book) 51 | 52 | if book.languages and len(book.languages) > 0: 53 | book.languages = book.languages[0] 54 | 55 | kindle['books'].append(book) 56 | 57 | with io.open('kindle.json', 'w', encoding='utf-8') as f: 58 | f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True)) 59 | 60 | 61 | if __name__ == '__main__': 62 | fetch('http://t.bookdna.cn', config.header, {}) 63 | -------------------------------------------------------------------------------- /free_book.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import io 3 | import json 4 | import os 5 | import re 6 | 7 | import requests 8 | 9 | from book import Book 10 | import config 11 | 12 | cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page=' 13 | en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page=' 14 | base_url = 'https://www.amazon.cn/gp/product/' 15 | page_dir = 'page/' 16 | 17 | 18 | def fetch_free_books(url, page): 19 | r = requests.get(url + str(page), headers=config.header) 20 | from bs4 import BeautifulSoup, Tag 21 | import lxml 22 | 23 | bs = BeautifulSoup(r.text, lxml.__name__) 24 | items = bs.find_all('li', attrs={'class': 's-result-item celwidget'}) 25 | 26 | kindle = {'books': []} 27 | 28 | for item in items: 29 | if isinstance(item, Tag): 30 | book = Book() 31 | book.title = item.find('h2').text 32 | # book.item_id = item.find('span', attrs={'name': re.compile('.*')}).get('name') 33 | book.item_id = item.get('data-asin') 34 | book.url = base_url + book.item_id 35 | book.average = 0 36 | book.price = 0 37 | book.min = 0 38 | score = item.find('span', attrs={'class': 'a-icon-alt'}) 39 | if score: 40 | book.score = re.match('平均(.*) 星', score.text).group(1) 41 | 42 | import amz 43 | amz.lookup(book) 44 | 45 | kindle['books'].append(book) 46 | 47 | kindle['count'] = len(kindle['books']) 48 | kindle['page'] = page 49 | return kindle 50 | 51 | 52 | def get_free_cn_books(page): 53 | kindle = fetch_free_books(cn_url, page) 54 | with io.open(page_dir + 'kindle_free_books_cn_' + str(page) + '.json', 'w', encoding='utf-8') as f: 55 | f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True)) 56 | 57 | 58 | def get_free_en_books(page): 59 | kindle = fetch_free_books(en_url, page) 60 | with io.open(page_dir + 'kindle_free_books_en_' + str(page) + '.json', 'w', encoding='utf-8') as f: 61 | f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True)) 62 | 63 | 64 | def get_free_books(): 65 | if not os.path.exists(page_dir): 66 | os.mkdir(page_dir) 67 | 68 | for page in range(1, 400): 69 | get_free_cn_books(page) 70 | 71 | for page in range(1, 400): 72 | get_free_en_books(page) 73 | 74 | get_free_books() 75 | -------------------------------------------------------------------------------- /amz.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import time as t 3 | import os 4 | import re 5 | from urllib.error import HTTPError 6 | 7 | from amazon.api import AmazonAPI 8 | 9 | import config 10 | from node import Node 11 | 12 | cache_dir = 'cache/' 13 | 14 | 15 | def write_query_to_db(cache_url, data): 16 | if not os.path.exists(cache_dir): 17 | os.mkdir(cache_dir) 18 | 19 | file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml' 20 | f = open(file, 'wb') 21 | f.write(data) 22 | 23 | 24 | def read_query_from_db(cache_url): 25 | file = cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml' 26 | if os.path.exists(file) and t.time() - os.path.getmtime(file) < 100 * 24 * 60 * 60 * 1000: 27 | f = open(file, 'rb') 28 | return f.read() 29 | return None 30 | 31 | 32 | amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG, 33 | region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db) 34 | 35 | 36 | def lookup(book): 37 | while True: 38 | try: 39 | product = amazon.lookup(ItemId=book.item_id) 40 | 41 | book.author = product.author 42 | book.pages = product.pages 43 | book.publisher = product.publisher 44 | book.brand = product.brand 45 | book.asin = product.asin 46 | book.binding = product.binding 47 | book.edition = product.edition 48 | book.editorial_review = product.editorial_review 49 | book.isbn = product.isbn 50 | book.large_image_url = product.large_image_url 51 | book.region = product.region 52 | book.release_date = product.release_date.strftime("%Y-%m-%d") 53 | if product.publication_date: 54 | book.publication_date = product.publication_date.strftime("%Y-%m-%d") 55 | book.sales_rank = product.sales_rank 56 | book.medium_image_url = product.medium_image_url 57 | book.small_image_url = product.small_image_url 58 | if product.languages: 59 | book.languages = list(product.languages) 60 | 61 | book.nodes = [] 62 | for browse_node in product.browse_nodes: 63 | node = Node() 64 | book.nodes.append(node) 65 | while True: 66 | node.node_id = browse_node.id 67 | node.name = str(browse_node.name) 68 | if not browse_node.is_category_root: 69 | node.node = Node() 70 | node = node.node 71 | browse_node = browse_node.ancestor 72 | else: 73 | node.is_root = True 74 | break 75 | 76 | print('cached: ' + book.item_id + ' -> ' + book.title) 77 | break 78 | except HTTPError as e: 79 | print(e) 80 | t.sleep(3) 81 | pass 82 | -------------------------------------------------------------------------------- /generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import os.path 5 | 6 | import sqlite3 7 | import zipfile 8 | 9 | from book import Book 10 | from status import Status 11 | 12 | data_dir = 'data' 13 | 14 | books_cn = [] 15 | books_en = [] 16 | ids = set() 17 | 18 | nodes = dict() 19 | node_map = [] 20 | node_relation = dict() 21 | 22 | reviews = [] 23 | 24 | 25 | def load_book(file): 26 | if os.path.isfile(file): 27 | with open(file) as json_data: 28 | d = json.load(json_data) 29 | for b in d['books']: 30 | book = Book(b) 31 | if book.item_id in ids: 32 | print('added: ' + book.item_id) 33 | continue 34 | if book.languages and len(book.languages) > 0: 35 | if book.languages[0] == 'chinese' or book.languages[0] == 'traditional_chinese': 36 | books_cn.append(book.tuple()) 37 | else: 38 | books_en.append(book.tuple()) 39 | ids.add(book.item_id) 40 | reviews.append((book.item_id, book.editorial_review)) 41 | 42 | if book.nodes: 43 | for node in book.nodes: 44 | node.node_id = node.id 45 | node_map.append((book.item_id, node.node_id)) 46 | while True: 47 | if node.node_id not in nodes: 48 | nodes[node.node_id] = node.tuple() 49 | if not node.is_root: 50 | node.node.node_id = node.node.id 51 | node_key = str(node.node_id) + '-' + str(node.node.node_id) 52 | if node_key not in node_relation: 53 | node_relation[node_key] = (node.node_id, node.node.node_id) 54 | node = node.node 55 | else: 56 | break 57 | else: 58 | print('no language') 59 | print(book.json()) 60 | 61 | 62 | def compress(file_name): 63 | zip_file = file_name + ".zip" 64 | zf = zipfile.ZipFile(zip_file, "w", zipfile.ZIP_DEFLATED) 65 | zf.write(file_name, arcname=os.path.basename(file_name)) 66 | zf.close() 67 | return zip_file 68 | 69 | 70 | if not os.path.exists(data_dir): 71 | os.mkdir(data_dir) 72 | 73 | # read data to list 74 | 75 | for i in range(1, 401): 76 | f_cn = 'page/kindle_free_books_cn_' + str(i) + '.json' 77 | f_en = 'page/kindle_free_books_en_' + str(i) + '.json' 78 | load_book(f_cn) 79 | load_book(f_en) 80 | 81 | # save to database 82 | 83 | status = Status() 84 | 85 | status.new_count = len(books_cn) + len(books_en) - status.count 86 | 87 | status.count = len(books_cn) + len(books_en) 88 | status.bump() 89 | 90 | conn = sqlite3.connect('data/books_' + str(status.version) + '.db') 91 | cur = conn.cursor() 92 | cur.execute('''CREATE TABLE IF NOT EXISTS book ( 93 | id INTEGER PRIMARY KEY AUTOINCREMENT, 94 | title TEXT, 95 | author TEXT, 96 | score REAL, 97 | url TEXT, 98 | item_id TEXT, 99 | pages TEXT, 100 | publisher TEXT, 101 | brand TEXT, 102 | asin TEXT, 103 | edition TEXT, 104 | isbn TEXT, 105 | large_image_url TEXT, 106 | medium_image_url TEXT, 107 | small_image_url TEXT, 108 | region TEXT, 109 | release_date TEXT, 110 | publication_date TEXT, 111 | languages TEXT 112 | );''') 113 | 114 | cur.executemany('''insert into book ( 115 | title, 116 | author, 117 | score, 118 | url, 119 | item_id, 120 | pages, 121 | publisher, 122 | brand, 123 | asin, 124 | edition, 125 | isbn, 126 | large_image_url, 127 | medium_image_url, 128 | small_image_url, 129 | region, 130 | release_date, 131 | publication_date, 132 | languages 133 | ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 134 | ''', books_cn + books_en) 135 | 136 | cur.execute('''CREATE TABLE IF NOT EXISTS node ( 137 | id INTEGER PRIMARY KEY AUTOINCREMENT, 138 | node_id INTEGER, 139 | name TEXT, 140 | is_root INTEGER 141 | );''') 142 | 143 | cur.executemany('''insert into node ( 144 | node_id, 145 | name, 146 | is_root 147 | ) values (?, ?, ?) 148 | ''', list(nodes.values())) 149 | 150 | cur.execute('''CREATE TABLE IF NOT EXISTS node_relation ( 151 | id INTEGER PRIMARY KEY AUTOINCREMENT, 152 | descendant INTEGER, 153 | ancestor INTEGER 154 | );''') 155 | 156 | cur.executemany('''insert into node_relation ( 157 | descendant, 158 | ancestor 159 | ) values (?, ?) 160 | ''', list(node_relation.values())) 161 | 162 | cur.execute('''CREATE TABLE IF NOT EXISTS node_map ( 163 | id INTEGER PRIMARY KEY AUTOINCREMENT, 164 | item_id TEXT, 165 | node_id INTEGER 166 | );''') 167 | 168 | cur.executemany('''insert into node_map ( 169 | item_id, 170 | node_id 171 | ) values (?, ?) 172 | ''', node_map) 173 | 174 | cur.execute('''CREATE TABLE IF NOT EXISTS status 175 | ( id INTEGER PRIMARY KEY AUTOINCREMENT, version INTEGER, count INTEGER, new_count INTEGER, time INTEGER );''') 176 | 177 | cur.execute('insert into status (version, count, new_count, time) values (?, ?, ?, ?)', status.to_list()) 178 | 179 | # conn.commit() 180 | # cur.close() 181 | # conn.close() 182 | 183 | # save reviews to database 184 | 185 | # conn = sqlite3.connect('data/reviews_' + str(status.version) + '.db') 186 | # cur = conn.cursor() 187 | cur.execute('''CREATE TABLE IF NOT EXISTS review ( 188 | id INTEGER PRIMARY KEY AUTOINCREMENT, 189 | item_id TEXT, 190 | editorial_review TEXT 191 | );''') 192 | 193 | cur.executemany('''insert into review ( 194 | item_id, 195 | editorial_review 196 | ) values (?, ?) 197 | ''', reviews) 198 | 199 | cur.execute('PRAGMA user_version = {v:d}'.format(v=status.version)) 200 | 201 | conn.commit() 202 | cur.close() 203 | conn.close() 204 | 205 | zip_f = compress('data/books_' + str(status.version) + '.db') 206 | status.update(zip_f) 207 | --------------------------------------------------------------------------------