├── .gitignore ├── LICENSE ├── README.md ├── crawler ├── __init__.py ├── compat.py ├── config.py ├── db.py ├── douban_cookie.txt ├── main.py ├── test_crawler.py ├── test_db.py └── utils.py ├── images ├── Core-Python-Programming.png └── Edge-of-Eternity.png ├── logs └── .gitignore ├── requirements-dev.txt ├── requirements.txt └── start_crawler.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | py3venv/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # 98 | # Mac OS 99 | # 100 | *.DS_Store 101 | .AppleDouble 102 | .LSOverride 103 | 104 | # Icon must end with two \r 105 | Icon 106 | 107 | 108 | # Thumbnails 109 | ._* 110 | 111 | # Files that might appear in the root of a volume 112 | .DocumentRevisions-V100 113 | .fseventsd 114 | .Spotlight-V100 115 | .TemporaryItems 116 | .Trashes 117 | .VolumeIcon.icns 118 | .com.apple.timemachine.donotpresent 119 | 120 | # Directories potentially created on remote AFP share 121 | .AppleDB 122 | .AppleDesktop 123 | Network Trash Folder 124 | Temporary Items 125 | .apdisk 126 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 PengTao.Shi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 发现图书 (Discover Books) 2 | ======= 3 | 4 | Starting from a given Douban book page, the crawler follows Douban's recommended-book links and connects the books it finds with RELATE relationships; the resulting graph can then be explored in the Neo4j Browser (http://localhost:7474 by default). 5 | 6 | Stars and pull requests are welcome :hatching_chick: 7 | 8 | Installation 9 | ---- 10 | 11 | * First install [neo4j](https://neo4j.com/) 12 | 13 | * Then install the Python dependencies: 14 | 15 | $ pip install -r requirements.txt 16 | 17 | * Finally set `NEO4J_AUTH` in `crawler/config.py`: 18 | 19 | NEO4J_AUTH = ('your neo4j username, usually neo4j by default', 'your neo4j password') 20 | 21 | Usage 22 | ---- 23 | 24 | **Start crawling from 'https://book.douban.com/subject/3112503/' (-u), crawl at most 100 books (-C), using 8 threads (-t):** 25 | 26 | $ python start_crawler.py -u 'https://book.douban.com/subject/3112503/' -C 100 -t 8 27 | 28 | # on Windows 29 | $ python start_crawler.py -u https://book.douban.com/subject/3112503/ -C 100 -t 8 30 | 31 | **Print the help message:** 32 | 33 | $ python start_crawler.py -h 34 | 35 | #### 100 books crawled starting from 'Core Python Programming' (https://book.douban.com/subject/3112503/): 36 | 37 | ![Core-Python-Programming](images/Core-Python-Programming.png) 38 | 39 | #### 200 books crawled starting from 'Edge of Eternity': 40 | 41 | ![Edge-of-Eternity](images/Edge-of-Eternity.png) 42 | 43 | Tests 44 | ---- 45 | 46 | $ pip install -r requirements-dev.txt 47 | $ pytest 48 | 49 | Tested on Python 2.7 and Python 3.6 50 | 51 | Some Cypher queries 52 | ---------------- 53 | 54 | **Show the books related to a given book:** 55 | 56 | # replace book_id with the book you want to inspect 57 | match p=(n:DOUBAN_BOOK {book_id:3112503})-[:RELATE*]-() return p 58 | 59 | # limit the number of returned paths: 60 | match p=(n:DOUBAN_BOOK {book_id:3112503})-[:RELATE*]-() return p limit 30 61 | 62 | **Delete data from the database:** 63 | 64 | # delete everything related to one book (both nodes and relationships) 65 | match p=(n:DOUBAN_BOOK {book_id:3112503})-[:RELATE*]-() delete p 66 | 67 | # delete every crawled book 68 | match (n:DOUBAN_BOOK) detach delete n 69 | 70 | 71 | License 72 | -------- 73 | 74 | [MIT](LICENSE) 75 | -------------------------------------------------------------------------------- /crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shipengtaov/discover-books/3bbf390446b3205ea8eb2d2c8f78f88b4afc383d/crawler/__init__.py -------------------------------------------------------------------------------- /crawler/compat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | 5 | if sys.version_info.major < 3: 6 | text_type = unicode 7 | from urlparse import urlparse, urljoin 8 | else: 9 | text_type = str 10 | from urllib.parse import urlparse, urljoin 11 | -------------------------------------------------------------------------------- /crawler/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | from os import path 6 | 7 | ROOT_DIR = path.dirname(path.dirname(path.abspath(__file__))) 8 | LOGS_DIR = path.join(ROOT_DIR, 'logs') 9 | CRAWLER_DIR = path.join(ROOT_DIR, 'crawler') 10 | 11 | COOKIE_FILE = path.join(CRAWLER_DIR, 'douban_cookie.txt') 12 |
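# Default upper bound on how many books to crawl; used when -C/--max-count is not passed to start_crawler.py.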
13 | MAX_CRAWL_COUNT = 1000 14 | 15 | NEO4J_URI = 'bolt://localhost:7687' 16 | # (username, password) 17 | NEO4J_AUTH = ('your username', 'your password') 18 | 19 | USER_AGENTS = ( 20 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0', 21 | 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100 101 Firefox/22.0', 22 | 'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0', 23 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko)' 24 | 'Chrome/19.0.1084.46 Safari/536.5', 25 | 'Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46' 26 | 'Safari/536.5', 27 | ) 28 | -------------------------------------------------------------------------------- /crawler/db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from neo4j.v1 import GraphDatabase 4 | 5 | from . import config 6 | 7 | neo4j_driver = GraphDatabase.driver(config.NEO4J_URI, auth=config.NEO4J_AUTH) 8 | 9 | 10 | def does_crawled_before(book_id, label='DOUBAN_BOOK'): 11 | """判断是否已抓取过book_id 12 | """ 13 | with neo4j_driver.session() as session: 14 | result = session.run("MATCH (n:{label} {{book_id:{value}}}) return n limit 1".format( 15 | label=label, 16 | value=book_id)) 17 | data = result.data() 18 | if not data: 19 | return False 20 | # 判断是否有标题,用来确定是否抓取过此书 21 | if 'title' not in data[0]['n']: 22 | return False 23 | return True 24 | -------------------------------------------------------------------------------- /crawler/douban_cookie.txt: -------------------------------------------------------------------------------- 1 | // 保存豆瓣登录Cookie -------------------------------------------------------------------------------- /crawler/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import unicode_literals 5 | 6 | from os import path 7 | from argparse import ArgumentParser 8 | import time 9 | from threading import Thread, Lock, Event 10 | from multiprocessing import Queue 11 | import random 12 | import traceback 13 | 14 | import requests 15 | import lxml 16 | 17 | from . import config 18 | from . import utils 19 | from . 
import db 20 | from .db import neo4j_driver 21 | 22 | 23 | logger = utils.get_logger('crawler', path.join(config.LOGS_DIR, 'crawler.log')) 24 | # 当前抓了多少本书 25 | CURRENT_COUNT = 0 26 | CURRENT_COUNT_LOCK = Lock() 27 | 28 | 29 | class Crawler(Thread): 30 | def __init__(self, queue, run_event, max_count): 31 | super(Crawler, self).__init__() 32 | self.deamon = True 33 | self.queue = queue 34 | self.run_event = run_event 35 | self.max_count = max_count 36 | 37 | def run(self): 38 | global CURRENT_COUNT 39 | while self.run_event.is_set(): 40 | if CURRENT_COUNT > 0: 41 | logger.debug('当前已抓取 {}/{} 本书'.format(CURRENT_COUNT, self.max_count)) 42 | if CURRENT_COUNT >= self.max_count: 43 | break 44 | if self.queue.empty(): 45 | logger.debug('no task') 46 | time.sleep(3) 47 | continue 48 | try: 49 | referer_url, url = self.queue.get() 50 | book_id = utils.get_book_id_from_url(url) 51 | assert book_id, '未能从 url: <{}> 中提取出book_id'.format(url) 52 | 53 | # 判断数据库是否已存在 54 | if db.does_crawled_before(book_id): 55 | logger.debug('数据库中已存在book_id: {}, url: {}'.format(book_id, url)) 56 | continue 57 | 58 | logger.debug('正在抓取 {}'.format(url)) 59 | headers = { 60 | 'User-Agent': random.choice(config.USER_AGENTS), 61 | } 62 | response = requests.get(url, headers=headers, timeout=10) 63 | assert response.ok, '{} status code error: {}'.format(url, response.status_code) 64 | 65 | parse_book = utils.parse_book(response) 66 | 67 | with neo4j_driver.session() as s: 68 | cypher = """MERGE (n:DOUBAN_BOOK {{book_id: {book_id}}}) 69 | ON CREATE SET n.book_id=toInteger({book_id}), 70 | n.title="{title}", 71 | n.url="{url}", 72 | n.author="{author}", 73 | n.press="{press}", 74 | n.publish_date="{publish_date}", 75 | n.price="{price}" 76 | ON MATCH SET n.title="{title}", 77 | n.url="{url}", 78 | n.author="{author}", 79 | n.press="{press}", 80 | n.publish_date="{publish_date}", 81 | n.price="{price}" 82 | """.format( 83 | book_id=book_id, 84 | title=parse_book['title'], 85 | url=url, 86 | author=parse_book['author'], 87 | press=parse_book['press'], 88 | publish_date=parse_book['publish_date'], 89 | price=parse_book['price']) 90 | s.run(cypher) 91 | 92 | # 建立 relation 93 | if referer_url: 94 | referer_book_id = utils.get_book_id_from_url(referer_url) 95 | cypher = """MATCH 96 | (m:DOUBAN_BOOK {{book_id: {referer_book_id}}}), 97 | (n:DOUBAN_BOOK {{book_id: {book_id}}}) 98 | MERGE (m)-[:RELATE]->(n) 99 | """.format(referer_book_id=referer_book_id, 100 | book_id=book_id) 101 | s.run(cypher) 102 | 103 | for relate_book_url in parse_book['related_books']: 104 | if not utils.is_douban_book_url(relate_book_url): 105 | logger.debug('book {} 不是一个合法的豆瓣图书链接'.format(relate_book_url)) 106 | continue 107 | # logger.debug('adding task: {}'.format(relate_book_url)) 108 | self.queue.put((url, relate_book_url)) 109 | # 当前已经抓了多少本书 110 | with CURRENT_COUNT_LOCK: 111 | CURRENT_COUNT += 1 112 | logger.debug('图书 <{}> 处理完毕'.format(url)) 113 | except KeyboardInterrupt: 114 | break 115 | except: 116 | logger.error(traceback.format_exc()) 117 | time.sleep(3) 118 | continue 119 | finally: 120 | # self.queue.task_done() 121 | # sleep,防止频繁抓取 122 | time.sleep(random.random()*5) 123 | 124 | 125 | def start_crawler(args): 126 | start_urls = args.urls 127 | max_count = args.max_count or config.MAX_CRAWL_COUNT 128 | thread_count = args.thread_count 129 | 130 | queue = Queue() 131 | for url in start_urls: 132 | if not utils.is_douban_book_url(url): 133 | raise SystemExit('<{}> 不是合法的豆瓣图书链接'.format(url)) 134 | logger.debug('adding url: {}'.format(url)) 135 | 
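# Seed the task queue: each task is a (referer_url, url) pair, and the start URLs have no referer.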
queue.put((None, url)) 136 | 137 | run_event = Event() 138 | run_event.set() 139 | 140 | threads = [] 141 | for i in range(thread_count): 142 | logger.debug('starting thread: {}/{}'.format(i+1, thread_count)) 143 | thread = Crawler(queue=queue, run_event=run_event, max_count=max_count) 144 | thread.start() 145 | threads.append(thread) 146 | try: 147 | while True: 148 | if all([not t.is_alive() for t in threads]): 149 | break 150 | time.sleep(.1) 151 | except KeyboardInterrupt: 152 | print('stoping all threads') 153 | run_event.clear() 154 | for t in threads: 155 | t.join() 156 | print('threads successfully closed') 157 | print('crawled total {} books'.format(CURRENT_COUNT)) 158 | 159 | 160 | def cli(): 161 | parser = ArgumentParser() 162 | parser.add_argument('-u', '--urls', nargs='+', help='从哪些链接开始抓取') 163 | parser.add_argument('-C', '--max-count', type=int, help='最多抓取多少本书. 默认:{}'.format(config.MAX_CRAWL_COUNT)) 164 | parser.add_argument('-t', '--thread-count', type=int, default=4, help='多少线程. 默认:4') 165 | args = parser.parse_args() 166 | 167 | if not args.urls: 168 | parser.print_help() 169 | raise SystemExit 170 | return args 171 | -------------------------------------------------------------------------------- /crawler/test_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import pytest 6 | from . import utils 7 | 8 | 9 | @pytest.mark.parametrize('url,expected', [ 10 | ('https://book.douban.com/subject/3112503', True), 11 | ('http://book.douban.com/subject/3112503/', True), 12 | ('https://book.douban.com/subject/3112503/test', True), 13 | 14 | # 这种情况也应该排除 15 | # ('https://book.douban.com/subject/3112503t', False), 16 | 17 | # 6位数字 18 | ('http://book.douban.com/subject/311250/', True), 19 | # 8位数字 20 | ('http://book.douban.com/subject/31125031/', True), 21 | 22 | ('https://book.douban.com/subject/3112503/?start=30', True), 23 | ('https://book.douban.com/subject/3112503?start=30', True), 24 | ]) 25 | def test_is_douban_book_url(url, expected): 26 | assert expected is utils.is_douban_book_url(url) 27 | 28 | 29 | @pytest.mark.parametrize('url,expected', [ 30 | ('https://book.douban.com/subject/3112503/', 3112503), 31 | ('https://book.douban.com/subject/3112503/?start=25&limit=25', 3112503), 32 | 33 | ('https://book.douban.com/subject/31125/', 31125), 34 | ('https://book.douban.com/subject/test/', None), 35 | ('https://another-domain.com/subject/1111', None), 36 | ]) 37 | def test_get_book_id_from_url(url, expected): 38 | assert expected == utils.get_book_id_from_url(url) 39 | 40 | 41 | def test_parse_book_info(): 42 | func = utils._parse_book_info 43 | 44 | html = """ 45 |
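<!-- trimmed douban "info" block: _parse_book_info reads the text content and splits fields on the line-break tags -->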
<div id="info"> 46 | <span> 47 | <span class="pl"> 作者</span>: 48 | <a>[美]Wesley J. Chun(陳仲才)</a> 49 | </span><br/>
50 | <span class="pl">出版社:</span> 人民邮电出版社<br/>
51 | <span class="pl">原作名:</span> Core Python Programming, 2nd Edition<br/>
52 | <span> 53 | <span class="pl"> 译者</span>: 54 | <a>CPUG</a> 55 | </span><br/>
56 | <span class="pl">出版年:</span> 2008-06<br/>
57 | <span class="pl">页数:</span> 654<br/>
58 | <span class="pl">定价:</span> 89.00元<br/>
59 | <span class="pl">装帧:</span> 平装<br/>
60 | <span class="pl">ISBN:</span> 9787115178503<br/>
61 | </div>
62 | """ 63 | result = func(html) 64 | assert result['author'] == '[美]Wesley J. Chun(陳仲才)' 65 | assert result['press'] == '人民邮电出版社' 66 | assert result['publish_date'] == '2008-06' 67 | assert result['price'] == '89.00元' 68 | 69 | 70 | def test_parse_book_related(): 71 | func = utils._parse_book_related 72 | 73 | html = """ 74 |
<div class="content clearfix">
75 | <dl class="">
76 | <dt>
77 | <a class="" href="book1_url"><img src="" /></a> 78 | </dt>
79 | <dd>
80 | <a href="book1_url">book1 name</a> 81 | </dd>
82 | </dl>
83 | <dl class="">
84 | <dt>
85 | <a class="" href="book2_url"><img src="" /></a> 86 | </dt>
87 | <dd>
88 | <a href="book2_url">book2 name</a> 89 | </dd>
90 | </dl>
91 | </div>
92 | """ 93 | result = func(html) 94 | assert result == ['book1_url', 'book2_url'] 95 | -------------------------------------------------------------------------------- /crawler/test_db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .db import neo4j_driver 4 | from .db import does_crawled_before 5 | 6 | 7 | def test_does_crawled_before(): 8 | try: 9 | label = 'DISCOVER_BOOKS_TEST_DOUBAN_BOOK' 10 | assert does_crawled_before(12345, label=label) is False 11 | 12 | with neo4j_driver.session() as session: 13 | session.run('create (:{} {{book_id: 12345, url:"https://book.douban.com/subject/111"}})'.format(label)) 14 | assert does_crawled_before(12345, label=label) is False 15 | with neo4j_driver.session() as session: 16 | session.run('match (n:{} {{book_id: 12345}}) delete n'.format(label)) 17 | assert does_crawled_before(12345, label=label) is False 18 | 19 | with neo4j_driver.session() as session: 20 | session.run('create (:{} {{book_id: 12345, title: "Brave New World"}})'.format(label)) 21 | assert does_crawled_before(12345678, label=label) is False 22 | assert does_crawled_before(12345, label=label) is True 23 | with neo4j_driver.session() as session: 24 | session.run('match (n:{} {{book_id: 12345}}) delete n'.format(label)) 25 | assert does_crawled_before(12345, label=label) is False 26 | finally: 27 | with neo4j_driver.session() as session: 28 | session.run('match (n:{}) detach delete n'.format(label)) 29 | -------------------------------------------------------------------------------- /crawler/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | 5 | import logging 6 | import re 7 | 8 | import lxml.html 9 | 10 | from .compat import urlparse, text_type 11 | 12 | douban_book_pattern = re.compile(r'^https?://book\.douban\.com/subject/\d+/?', re.I) 13 | douban_book_id_pattern = re.compile(r'^https?://book\.douban\.com/subject/(\d+)/?', re.I) 14 | 15 | 16 | def get_logger(name, filename, level=logging.DEBUG, fmt=None): 17 | logger = logging.Logger(name) 18 | 19 | fmt = fmt or '%(asctime)s-%(name)s-%(levelname)-10s%(message)s' 20 | formatter = logging.Formatter(fmt=fmt, datefmt='%Y-%m-%d %H:%M:%S') 21 | 22 | stream_handler = logging.StreamHandler() 23 | stream_handler.setFormatter(formatter) 24 | 25 | file_handler = logging.FileHandler(filename) 26 | file_handler.setFormatter(formatter) 27 | 28 | logger.addHandler(stream_handler) 29 | logger.addHandler(file_handler) 30 | 31 | logger.setLevel(level) 32 | 33 | return logger 34 | 35 | 36 | def is_douban_book_url(url): 37 | """判断是否是豆瓣图书链接 38 | """ 39 | match = douban_book_pattern.match(url) 40 | return True if match else False 41 | 42 | 43 | def get_book_id_from_url(url): 44 | match = douban_book_id_pattern.search(url) 45 | return int(match.group(1)) if match else None 46 | 47 | 48 | def parse_book(response): 49 | """解析豆瓣图书详情页 50 | """ 51 | doc = lxml.html.fromstring(response.text) 52 | 53 | title = ''.join([i.strip() for i in doc.xpath('//h1//text()')]) 54 | 55 | info_html = lxml.html.tostring(doc.xpath('//*[@id="info"]')[0], encoding=text_type) 56 | book_info = _parse_book_info(info_html) 57 | 58 | related_books = _parse_book_related(response.text) 59 | 60 | return dict( 61 | title=title, 62 | author=book_info['author'], 63 | press=book_info['press'], 64 | publish_date=book_info['publish_date'], 65 | price=book_info['price'], 66 | 
related_books=related_books) 67 | 68 | 69 | def _parse_book_info(html): 70 | """解析豆瓣图书信息(作者,出版社,出版年,定价) 71 | 72 | :param html(string): 图书信息部分的原始html 73 | """ 74 | end_flag = 'END_FLAG' 75 | html = html.replace('<br>', end_flag) 76 | html = html.replace('<br/>', end_flag) 77 | 78 | doc = lxml.html.fromstring(html) 79 | text = doc.text_content() 80 | pattern = r'{}[::](.*?){}' 81 | result = dict() 82 | for key, column in [ 83 | ('author', '作者'), 84 | ('press', '出版社'), 85 | ('publish_date', '出版年'), 86 | ('price', '定价')]: 87 | result[key] = re.search(pattern.format(column, end_flag), 88 | text, 89 | re.I | re.DOTALL).group(1).strip() 90 | return result 91 | 92 | 93 | def _parse_book_related(html): 94 | """获取相关图书 95 | """ 96 | doc = lxml.html.fromstring(html) 97 | books = doc.xpath('//dl[@class=""]/dt/a') 98 | assert len(books) > 0, "parse related books failed: 0 related books" 99 | return [i.attrib['href'] for i in books] 100 | -------------------------------------------------------------------------------- /images/Core-Python-Programming.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shipengtaov/discover-books/3bbf390446b3205ea8eb2d2c8f78f88b4afc383d/images/Core-Python-Programming.png -------------------------------------------------------------------------------- /images/Edge-of-Eternity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shipengtaov/discover-books/3bbf390446b3205ea8eb2d2c8f78f88b4afc383d/images/Edge-of-Eternity.png -------------------------------------------------------------------------------- /logs/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | neo4j-driver 2 | requests 3 | lxml 4 | -------------------------------------------------------------------------------- /start_crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from crawler.main import start_crawler, cli 5 | 6 | if __name__ == '__main__': 7 | start_crawler(cli()) 8 | --------------------------------------------------------------------------------
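Beyond the Cypher queries shown in the README, the crawled graph can also be read from Python with the same driver the project already uses. The sketch below is illustrative only and is not a file in the repository: it assumes `crawler/config.py` holds working `NEO4J_URI`/`NEO4J_AUTH` values and that the 1.x `neo4j-driver` API imported in `crawler/db.py` (`neo4j.v1`) is installed. It lists the ten books with the most RELATE connections.

    # Illustrative sketch: list the best-connected crawled books.
    from neo4j.v1 import GraphDatabase

    from crawler import config

    driver = GraphDatabase.driver(config.NEO4J_URI, auth=config.NEO4J_AUTH)
    with driver.session() as session:
        # count(*) groups by title, giving each book's RELATE degree
        result = session.run(
            "MATCH (b:DOUBAN_BOOK)-[:RELATE]-() "
            "RETURN b.title AS title, count(*) AS degree "
            "ORDER BY degree DESC LIMIT 10")
        for record in result:
            print('{} ({} relations)'.format(record['title'], record['degree']))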