├── .gitignore
├── requirements.txt
├── cron.sh
├── free.sh
├── config.py.example
├── node.py
├── README.md
├── status.py
├── book.py
├── kindle.py
├── free_book.py
├── amz.py
└── generator.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | kindle.json
 2 | .idea
 3 | __pycache__
 4 | venv
 5 | config.py
 6 | cache
 7 | page
 8 | data
 9 | *.pyc
10 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.5.1
2 | bottlenose==1.0.1
3 | lxml==3.5.0
4 | python-amazon-simple-product-api==2.1.0
5 | python-dateutil==2.5.3
6 | requests==2.9.1
7 | six==1.10.0
8 | 


--------------------------------------------------------------------------------
/cron.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo $(date)
 4 | 
 5 | PWD="$(dirname $0)"
 6 | 
 7 | echo "$PWD"
 8 | 
 9 | cd "$PWD" || exit 1
10 | 
11 | PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python kindle.py
12 | 


--------------------------------------------------------------------------------
/free.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo $(date)
 4 | 
 5 | PWD="$(dirname $0)"
 6 | 
 7 | echo "$PWD"
 8 | 
 9 | cd "$PWD" || exit 1
10 | 
11 | PYTHONIOENCODING=utf-8:surrogateescape venv/bin/python free_book.py
12 | 


--------------------------------------------------------------------------------
/config.py.example:
--------------------------------------------------------------------------------
# User-Agent string placed in `header`, which the scrapers pass to
# requests.get as the HTTP headers.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/50.0.2661.75 Safari/537.36'
header = {'User-Agent': user_agent}

# Credentials handed to AmazonAPI in amz.py; replace the placeholders with
# real values in config.py.
KEY_ID = "xxx"
SECRET_KEY = "xxx"
TAG = "xxx"
7 | 


--------------------------------------------------------------------------------
/node.py:
--------------------------------------------------------------------------------
 1 | 
 2 | class Node:
 3 |     node = None  # ancestor
 4 |     node_id = None
 5 |     is_root = False
 6 |     name = None
 7 | 
 8 |     def __init__(self, o=None):
 9 |         if o is None:
10 |             o = dict()
11 |         self.__dict__ = o
12 | 
13 |         if 'node' in o:
14 |             self.node = Node(o['node'])
15 | 
16 |     def dump(self):
17 |         return clean_dict(self.__dict__)
18 | 
19 |     def tuple(self):
20 |         return (
21 |             self.node_id, self.name, self.is_root
22 |         )
23 | 
24 | 
def clean_dict(d):
    """Return a copy of ``d`` without the keys whose value is ``None``.

    Nested dicts are cleaned the same way. Any value that is not a dict
    (including lists) is returned unchanged.
    """
    if isinstance(d, dict):
        return {key: clean_dict(value) for key, value in d.items() if value is not None}
    return d
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Kindle
 2 | 
 3 | ## 配置
 4 | 
 5 | 参考 `config.py.example`，修改 `config.py` 文件，填写 `API key`，请在 [Amazon](https://console.aws.amazon.com/iam/home#security_credential) 获取。
 6 | 
 7 | 
 8 | ```python
 9 | KEY_ID = "xxx"
10 | SECRET_KEY = "xxx"
11 | TAG = "xxx"
12 | ```
13 | 
14 | ## 运行
15 | 
16 | ```shell
17 | virtualenv -p python3 venv
18 | source venv/bin/activate
19 | pip install -r requirements.txt -I
20 | python kindle.py
21 | ```
22 | 
23 | ## 获取免费电子书数据
24 | 
25 | ```shell
26 | python free_book.py
27 | ```
28 | 
29 | ## 生成数据库
30 | 
31 | ```shell
32 | python generator.py
33 | ```
34 | 
35 | **crontab 定时任务**
36 | 
37 | ```shell
38 | 5 0 * * * /path/to/kindle/cron.sh >> /var/log/kindle.log 2>&1
39 | ```
40 | 


--------------------------------------------------------------------------------
/status.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import hashlib
 3 | import json
 4 | import os
 5 | import time
 6 | 
 7 | from datetime import datetime
 8 | from functools import partial
 9 | 
data_file = "data/status.json"


class Status:
    """Version and book-count bookkeeping persisted as JSON in ``data_file``."""

    version = 0
    count = 0
    new_count = 0
    timestamp = None
    md5 = None

    def __init__(self):
        """Load the previously saved state when the status file exists."""
        if not os.path.exists(data_file):
            return
        with open(data_file) as f:
            self.__dict__ = json.loads(f.read())

    def dump(self):
        """Print the version and timestamp."""
        print(self.version, self.timestamp)

    def json(self):
        """Return the state as a JSON-serialisable dict."""
        return {"version": self.version, "count": self.count, "new_count": self.new_count, "timestamp": self.timestamp,
                'md5': self.md5}

    def update(self, file):
        """Stamp the state with the current time and the MD5 of ``file``, then save it.

        :param file: path of the file whose MD5 digest is recorded.
        """
        # time.time() is the Unix timestamp directly. Building it through
        # mktime(datetime.now().utctimetuple()) forces tm_isdst to 0 and is
        # therefore off by an hour while local daylight saving time is active.
        self.timestamp = int(time.time())
        self.md5 = md5sum(file)
        with open(data_file, "w") as f:
            f.write(json.dumps(self.json()))

    def to_list(self):
        """Return ``[version, count, new_count, timestamp]``.

        A missing timestamp is filled in with the current time first.
        """
        if not self.timestamp:
            self.timestamp = int(time.time())
        return [self.version, self.count, self.new_count, self.timestamp]

    def bump(self):
        """Increase the version by one."""
        self.version += 1
46 | 
47 | 
def md5sum(filename):
    """Return the hexadecimal MD5 digest of the file at ``filename``.

    The file is opened in binary mode and hashed chunk by chunk, so it is
    never loaded into memory as a whole.
    """
    digest = hashlib.md5()
    with open(filename, mode='rb') as f:
        # 64 KiB per read keeps the number of Python-level read/update calls
        # low; the digest does not depend on the chunk size.
        for buf in iter(partial(f.read, 65536), b''):
            digest.update(buf)
    return digest.hexdigest()
54 | 


--------------------------------------------------------------------------------
/book.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | from node import Node
 4 | 
 5 | 
 6 | class Book:
 7 |     title = ''
 8 |     average = 0
 9 |     price = 0
10 |     author = ''
11 |     min = 0
12 |     score = 0
13 |     url = ''
14 |     min_day = ''
15 | 
16 |     item_id = None
17 |     pages = None
18 |     publisher = None
19 |     brand = None
20 |     asin = None
21 |     binding = None
22 |     edition = None
23 |     editorial_review = None
24 |     isbn = None
25 |     large_image_url = None
26 |     medium_image_url = None
27 |     small_image_url = None
28 |     region = None
29 |     release_date = None
30 |     publication_date = None
31 |     sales_rank = None
32 |     languages = None
33 |     nodes = None
34 | 
35 |     def __init__(self, o=None):
36 |         if o is None:
37 |             o = dict()
38 |         self.__dict__ = o
39 |         nodes = []
40 |         if 'nodes' in o:
41 |             for n in o['nodes']:
42 |                 node = Node(n)
43 |                 nodes.append(node)
44 |             self.nodes = nodes
45 | 
46 |     def json(self):
47 |         return json.dumps(self, default=lambda o: o.__dict__, indent=2, ensure_ascii=False, sort_keys=True)
48 | 
49 |     def dump(self):
50 |         return clean_dict(self.__dict__)
51 | 
52 |     def tuple(self):
53 |         languages = None
54 |         if self.languages and len(self.languages) > 0:
55 |             languages = self.languages[0]
56 |         return (
57 |             self.title, self.author, self.score, self.url, self.item_id, self.pages, self.publisher, self.brand,
58 |             self.asin, self.edition, self.isbn, self.large_image_url, self.medium_image_url, self.small_image_url,
59 |             self.region, self.release_date, self.publication_date, languages
60 |         )
61 | 
62 | 
def clean_dict(d):
    """Strip ``None``-valued keys from ``d`` and from the dicts nested in it.

    Anything that is not a dict is handed back as it is, so lists and other
    containers are not descended into.
    """
    if not isinstance(d, dict):
        return d

    cleaned = {}
    for key, value in d.items():
        if value is None:
            continue
        cleaned[key] = clean_dict(value)
    return cleaned
67 | 


--------------------------------------------------------------------------------
/kindle.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import io
 3 | import json
 4 | import re
 5 | 
 6 | import requests
 7 | from bs4 import Tag
 8 | 
 9 | import config
10 | from book import Book
11 | 
12 | 
def fetch(url, headers, cookies):
    """Scrape the book list at ``url`` and write the result to ``kindle.json``.

    Every listed book is parsed from the page, completed through
    ``amz.lookup`` and collected under the ``books`` key; the update time
    shown on the page is stored under ``time``.

    :param url: address of the page to scrape.
    :param headers: HTTP headers passed to ``requests.get``.
    :param cookies: cookies passed to ``requests.get``.
    :raises AttributeError: if the page lacks the expected markup, i.e. a
        lookup or regular expression returned ``None``.
    """
    r = requests.get(url, headers=headers, cookies=cookies)
    from bs4 import BeautifulSoup
    import lxml
    import amz

    bs = BeautifulSoup(r.text, lxml.__name__)

    updated = re.match(r'数据更新于：(.*)', bs.find('span', style='color:#FFF9A8').text).group(1)

    kindle = {'time': updated, 'books': []}

    book_items = bs.find_all('div', style='margin-bottom: 0.9em;')

    for book_item in book_items:
        if not isinstance(book_item, Tag):
            continue

        book = Book()

        a = book_item.find('a')
        min_day = book_item.find('span', title=re.compile(r'最近在')).get('title')
        book.min_day = re.match(r'最近在(.*)达到最低价', min_day).group(1)

        if isinstance(a, Tag):
            href = a.get('href')
            # 'https?' keeps the scheme out of the captured group, so a link
            # that already uses https does not become 'httpss://...'.
            book.url = 'https' + re.match(r'https?(.*)/ref', href).group(1)
            book.item_id = re.match(r'.*product/(.*)/ref', href).group(1)
            book.title = a.get('title')

        matches = re.match(r'.*历史均价：￥(\d+\.*\d*)，现价：￥(\d+\.*\d*)作者：(.*)，评分：(\d+\.*\d*)，历史最低价：￥(\d+\.*\d*)',
                           book_item.text)

        book.average = matches.group(1)
        book.price = matches.group(2)
        book.author = matches.group(3)
        book.score = matches.group(4)
        book.min = matches.group(5)

        amz.lookup(book)

        # kindle.json stores a single language per book: the first one.
        if book.languages:
            book.languages = book.languages[0]

        kindle['books'].append(book)

    with io.open('kindle.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))


if __name__ == '__main__':
    fetch('http://t.bookdna.cn', config.header, {})
63 | 


--------------------------------------------------------------------------------
/free_book.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import io
 3 | import json
 4 | import os
 5 | import re
 6 | 
 7 | import requests
 8 | 
 9 | from book import Book
10 | import config
11 | 
# Search-result URLs; fetch_free_books appends the page number to them.
# NOTE(review): judging by the names these list the Chinese and the English
# free books — confirm against the category ids in the query string.
cn_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,p_36:159125071&page='
en_url = 'https://www.amazon.cn/s/?rh=n:116087071,n:!116088071,n:116169071,n:116170071,p_36:159125071&page='
# Prefix of a product URL; the item id is appended to it.
base_url = 'https://www.amazon.cn/gp/product/'
# Directory that receives one JSON file per fetched result page.
page_dir = 'page/'
16 | 
17 | 
def fetch_free_books(url, page):
    """Fetch one search-result page and return the books listed on it.

    :param url: search URL ending in ``page=``; ``page`` is appended to it.
    :param page: page number to request.
    :return: dict with ``books`` (list of :class:`Book`, each completed by
        ``amz.lookup``), ``count`` (number of books) and ``page``.
    """
    r = requests.get(url + str(page), headers=config.header)
    from bs4 import BeautifulSoup, Tag
    import lxml
    import amz

    bs = BeautifulSoup(r.text, lxml.__name__)
    items = bs.find_all('li', attrs={'class': 's-result-item celwidget'})

    kindle = {'books': []}

    for item in items:
        if not isinstance(item, Tag):
            continue

        # book.item_id = item.find('span', attrs={'name': re.compile('.*')}).get('name')
        item_id = item.get('data-asin')
        if not item_id:
            # Without an id there is neither a product URL nor anything to
            # look up, so the entry is skipped.
            continue

        book = Book()
        heading = item.find('h2')
        if heading is not None:
            book.title = heading.text
        book.item_id = item_id
        book.url = base_url + item_id
        book.average = 0
        book.price = 0
        book.min = 0

        score = item.find('span', attrs={'class': 'a-icon-alt'})
        if score:
            # The default score is kept when the rating text has another format.
            rating = re.match(r'平均(.*) 星', score.text)
            if rating:
                book.score = rating.group(1)

        amz.lookup(book)

        kindle['books'].append(book)

    kindle['count'] = len(kindle['books'])
    kindle['page'] = page
    return kindle
50 | 
51 | 
def _save_free_books(url, lang, page):
    """Fetch result page ``page`` of ``url`` and write it to the page directory.

    The file is named ``kindle_free_books_<lang>_<page>.json`` and is
    written as UTF-8 JSON.
    """
    kindle = fetch_free_books(url, page)
    path = page_dir + 'kindle_free_books_' + lang + '_' + str(page) + '.json'
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(kindle, default=lambda o: o.dump(), indent=2, ensure_ascii=False, sort_keys=True))


def get_free_cn_books(page):
    """Fetch page ``page`` of ``cn_url`` and save it as ``kindle_free_books_cn_<page>.json``."""
    _save_free_books(cn_url, 'cn', page)


def get_free_en_books(page):
    """Fetch page ``page`` of ``en_url`` and save it as ``kindle_free_books_en_<page>.json``."""
    _save_free_books(en_url, 'en', page)
62 | 
63 | 
def get_free_books():
    """Fetch result pages 1 to 399 of both listings into the page directory.

    The directory is created when it is missing. All pages of the ``cn``
    listing are fetched before the first page of the ``en`` listing.
    """
    if not os.path.exists(page_dir):
        os.mkdir(page_dir)

    for save_page in (get_free_cn_books, get_free_en_books):
        for page_number in range(1, 400):
            save_page(page_number)

get_free_books()
75 | 


--------------------------------------------------------------------------------
/amz.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import time as t
 3 | import os
 4 | import re
 5 | from urllib.error import HTTPError
 6 | 
 7 | from amazon.api import AmazonAPI
 8 | 
 9 | import config
10 | from node import Node
11 | 
cache_dir = 'cache/'

# Maximum age of a cached response, in seconds (100 days). File modification
# times and time.time() are both expressed in seconds.
_CACHE_MAX_AGE_SECONDS = 100 * 24 * 60 * 60


def _cache_file(cache_url):
    """Return the cache file path for the ``ItemId`` found in ``cache_url``."""
    return cache_dir + re.match('.*ItemId=(.*)&Operation', cache_url).group(1) + '.xml'


def write_query_to_db(cache_url, data):
    """Store the response ``data`` (bytes) for ``cache_url`` in the cache directory.

    The directory is created when it is missing.
    """
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)

    with open(_cache_file(cache_url), 'wb') as f:
        f.write(data)


def read_query_from_db(cache_url):
    """Return the cached response bytes for ``cache_url``.

    ``None`` is returned when nothing is cached for the item or when the
    cached file is older than the maximum cache age.
    """
    file = _cache_file(cache_url)
    if os.path.exists(file) and t.time() - os.path.getmtime(file) < _CACHE_MAX_AGE_SECONDS:
        with open(file, 'rb') as f:
            return f.read()
    return None
30 | 
31 | 
# Shared API client; read_query_from_db / write_query_to_db are passed in as
# its cache reader and writer.
amazon = AmazonAPI(config.KEY_ID, config.SECRET_KEY, config.TAG,
                   region='CN', MaxQPS=0.9, CacheReader=read_query_from_db, CacheWriter=write_query_to_db)


def _browse_node_chain(browse_node):
    """Convert a browse node and its ancestors into a linked ``Node`` chain.

    The returned node describes ``browse_node``; each ``node`` attribute
    points to the next ancestor. The last node of the chain has ``is_root``
    set.
    """
    head = node = Node()
    while True:
        node.node_id = browse_node.id
        node.name = str(browse_node.name)
        parent = None if browse_node.is_category_root else browse_node.ancestor
        if parent is None:
            # Either a category root or a node without a reported ancestor:
            # in both cases the chain ends here, so it is marked as the root
            # rather than left pointing at an empty ancestor.
            node.is_root = True
            return head
        node.node = Node()
        node = node.node
        browse_node = parent


def lookup(book):
    """Fill ``book`` in place with the product data for ``book.item_id``.

    The request is repeated every 3 seconds for as long as it fails with
    ``HTTPError``; any other exception propagates to the caller.

    :param book: object with ``item_id`` and ``title`` set; its product
        attributes and ``nodes`` are overwritten.
    """
    while True:
        try:
            product = amazon.lookup(ItemId=book.item_id)
        except HTTPError as e:
            print(e)
            t.sleep(3)
            continue

        book.author = product.author
        book.pages = product.pages
        book.publisher = product.publisher
        book.brand = product.brand
        book.asin = product.asin
        book.binding = product.binding
        book.edition = product.edition
        book.editorial_review = product.editorial_review
        book.isbn = product.isbn
        book.large_image_url = product.large_image_url
        book.region = product.region
        # Both dates are optional, so each is formatted only when present.
        if product.release_date:
            book.release_date = product.release_date.strftime("%Y-%m-%d")
        if product.publication_date:
            book.publication_date = product.publication_date.strftime("%Y-%m-%d")
        book.sales_rank = product.sales_rank
        book.medium_image_url = product.medium_image_url
        book.small_image_url = product.small_image_url
        if product.languages:
            book.languages = list(product.languages)

        book.nodes = [_browse_node_chain(browse_node) for browse_node in product.browse_nodes]

        print('cached: ' + book.item_id + ' -> ' + book.title)
        return
82 | 


--------------------------------------------------------------------------------
/generator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import json
  4 | import os.path
  5 | 
  6 | import sqlite3
  7 | import zipfile
  8 | 
  9 | from book import Book
 10 | from status import Status
 11 | 
 12 | data_dir = 'data'
 13 | 
 14 | books_cn = []
 15 | books_en = []
 16 | ids = set()
 17 | 
 18 | nodes = dict()
 19 | node_map = []
 20 | node_relation = dict()
 21 | 
 22 | reviews = []
 23 | 
 24 | 
 25 | def load_book(file):
 26 |     if os.path.isfile(file):
 27 |         with open(file) as json_data:
 28 |             d = json.load(json_data)
 29 |             for b in d['books']:
 30 |                 book = Book(b)
 31 |                 if book.item_id in ids:
 32 |                     print('added: ' + book.item_id)
 33 |                     continue
 34 |                 if book.languages and len(book.languages) > 0:
 35 |                     if book.languages[0] == 'chinese' or book.languages[0] == 'traditional_chinese':
 36 |                         books_cn.append(book.tuple())
 37 |                     else:
 38 |                         books_en.append(book.tuple())
 39 |                     ids.add(book.item_id)
 40 |                     reviews.append((book.item_id, book.editorial_review))
 41 | 
 42 |                     if book.nodes:
 43 |                         for node in book.nodes:
 44 |                             node.node_id = node.id
 45 |                             node_map.append((book.item_id, node.node_id))
 46 |                             while True:
 47 |                                 if node.node_id not in nodes:
 48 |                                     nodes[node.node_id] = node.tuple()
 49 |                                 if not node.is_root:
 50 |                                     node.node.node_id = node.node.id
 51 |                                     node_key = str(node.node_id) + '-' + str(node.node.node_id)
 52 |                                     if node_key not in node_relation:
 53 |                                         node_relation[node_key] = (node.node_id, node.node.node_id)
 54 |                                     node = node.node
 55 |                                 else:
 56 |                                     break
 57 |                 else:
 58 |                     print('no language')
 59 |                     print(book.json())
 60 | 
 61 | 
 62 | def compress(file_name):
 63 |     zip_file = file_name + ".zip"
 64 |     zf = zipfile.ZipFile(zip_file, "w", zipfile.ZIP_DEFLATED)
 65 |     zf.write(file_name, arcname=os.path.basename(file_name))
 66 |     zf.close()
 67 |     return zip_file
 68 | 
 69 | 
# Build the versioned SQLite database from the page files, zip it and record
# the new state in the status file.

if not os.path.exists(data_dir):
    os.mkdir(data_dir)

# read data to list

# Page files 1 to 400 of both listings; load_book ignores missing files.
for i in range(1, 401):
    f_cn = 'page/kindle_free_books_cn_' + str(i) + '.json'
    f_en = 'page/kindle_free_books_en_' + str(i) + '.json'
    load_book(f_cn)
    load_book(f_en)

# save to database

status = Status()

# Difference between the number of books now and the previously stored count.
status.new_count = len(books_cn) + len(books_en) - status.count

status.count = len(books_cn) + len(books_en)
status.bump()

# The database file name carries the version that was just bumped.
conn = sqlite3.connect('data/books_' + str(status.version) + '.db')
cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS book (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT,
    author TEXT,
    score REAL,
    url TEXT,
    item_id TEXT,
    pages TEXT,
    publisher TEXT,
    brand TEXT,
    asin TEXT,
    edition TEXT,
    isbn TEXT,
    large_image_url TEXT,
    medium_image_url TEXT,
    small_image_url TEXT,
    region TEXT,
    release_date TEXT,
    publication_date TEXT,
    languages TEXT
    );''')

# The column order matches the tuples produced by Book.tuple().
cur.executemany('''insert into book (
    title,
    author,
    score,
    url,
    item_id,
    pages,
    publisher,
    brand,
    asin,
    edition,
    isbn,
    large_image_url,
    medium_image_url,
    small_image_url,
    region,
    release_date,
    publication_date,
    languages
    ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', books_cn + books_en)

cur.execute('''CREATE TABLE IF NOT EXISTS node (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    node_id INTEGER,
    name TEXT,
    is_root INTEGER
    );''')

# The column order matches the tuples produced by Node.tuple().
cur.executemany('''insert into node (
    node_id,
    name,
    is_root
    ) values (?, ?, ?)
    ''', list(nodes.values()))

cur.execute('''CREATE TABLE IF NOT EXISTS node_relation (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    descendant INTEGER,
    ancestor INTEGER
    );''')

cur.executemany('''insert into node_relation (
    descendant,
    ancestor
    ) values (?, ?)
    ''', list(node_relation.values()))

cur.execute('''CREATE TABLE IF NOT EXISTS node_map (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    item_id TEXT,
    node_id INTEGER
    );''')

cur.executemany('''insert into node_map (
    item_id,
    node_id
    ) values (?, ?)
    ''', node_map)

cur.execute('''CREATE TABLE IF NOT EXISTS status
    ( id INTEGER PRIMARY KEY AUTOINCREMENT, version INTEGER, count INTEGER, new_count INTEGER, time INTEGER );''')

# to_list() fills in the timestamp when none has been set yet.
cur.execute('insert into status (version, count, new_count, time) values (?, ?, ?, ?)', status.to_list())

# conn.commit()
# cur.close()
# conn.close()

# save reviews to database

# The reviews go into the same database file as the books; the separate
# connection below is disabled.
# conn = sqlite3.connect('data/reviews_' + str(status.version) + '.db')
# cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS review (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    item_id TEXT,
    editorial_review TEXT
    );''')

cur.executemany('''insert into review (
    item_id,
    editorial_review
    ) values (?, ?)
    ''', reviews)

# Store the data version in the database header as well.
cur.execute('PRAGMA user_version = {v:d}'.format(v=status.version))

conn.commit()
cur.close()
conn.close()

# Zip the finished database; status.update records the archive's MD5 and the
# current time, then writes the status file.
zip_f = compress('data/books_' + str(status.version) + '.db')
status.update(zip_f)
207 | 


--------------------------------------------------------------------------------