├── setup.cfg
├── query
│   ├── __init__.py
│   └── movie.py
├── test
│   ├── __init__.py
│   ├── test_bookExport.py
│   └── test_movieExport.py
├── utils
│   ├── __init__.py
│   └── logutil.py
├── MANIFEST.in
├── screenshot
│   └── screenshot-output-result.png
├── requirements.txt
├── export_csv.py
├── setup.py
├── README.md
├── exporter
│   ├── __init__.py
│   ├── notes.py
│   ├── status.py
│   ├── book.py
│   ├── music.py
│   ├── movie.py
│   └── __main__.py
├── .gitignore
├── douban-movie-export.user.js
└── douban-book-export.user.js

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md

--------------------------------------------------------------------------------
/query/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-

--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-

--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include screenshot/*

--------------------------------------------------------------------------------
/screenshot/screenshot-output-result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/einverne/douban-export/HEAD/screenshot/screenshot-output-result.png

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.7.1
2 | certifi==2019.6.16
3 | chardet==3.0.4
4 | idna==2.8
5 | lxml==4.3.4
6 | PySocks==1.7.0
7 | requests==2.22.0
8 | soupsieve==1.9.1
9 | urllib3==1.25.3
10 | Click

--------------------------------------------------------------------------------
/export_csv.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import csv
5 | 
6 | from exporter.movie import MovieExport
7 | 
8 | # This is a simple demo
9 | if __name__ == '__main__':
10 |     m = MovieExport("einverne")
11 |     # newline='' keeps csv from writing blank lines on Windows
12 |     with open('movie_export.csv', mode='w', newline='', encoding='utf-8') as movie_file:
13 |         writer = csv.writer(movie_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
14 |         for wish in m.get_wish():
15 |             writer.writerow([wish.title, wish.url, wish.intro, wish.tags, wish.comment, wish.rating, wish.rating_date])

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os
5 | 
6 | from setuptools import setup, find_packages
7 | 
8 | 
9 | def read(fname):
10 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
11 | 
12 | 
13 | requirements = [
14 |     'Click',
15 |     'requests',
16 |     'bs4',
17 |     'lxml'
18 | ]
19 | 
20 | setup(
21 |     name="douban-export",
22 |     version="0.0.1",
23 |     author="Ein Verne",
24 |     author_email="git@einverne.info",
25 |     description="A tool to help export douban data",
26 |     license="MIT",
27 |     keywords="douban, export, command, tools",
28 |     url="https://github.com/einverne/douban-export",
29 |     packages=find_packages(exclude=["test"]),
30 |     long_description=read('README.md'),
31 |     include_package_data=True,
32 |     install_requires=requirements,
33 |     entry_points={'console_scripts': ['douban=exporter.__main__:cli']},
34 | )

--------------------------------------------------------------------------------
/utils/logutil.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import logging
4 | import logging.handlers
5 | from threading import Lock
6 | 
7 | cache = {}
8 | lock = Lock()
9 | 
10 | 
11 | def get_logger(logger_name):
12 |     # logging.getLogger returns a singleton, so calling it repeatedly would
13 |     # attach one handler per call and write every record several times;
14 |     # cache the configured logger instead.
15 |     global cache
16 |     with lock:
17 |         if not cache.get(logger_name):
18 |             cache[logger_name] = _get_logger(logger_name)
19 |         return cache.get(logger_name)
20 | 
21 | 
22 | def _get_logger(logger_name):
23 |     logger = logging.getLogger(logger_name)
24 |     level = "DEBUG"
25 |     logger.setLevel(level)
26 |     logger.propagate = False  # disable dup celery log
27 |     ch = logging.StreamHandler()
28 |     formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)6s] [%(pathname)s:%(lineno)s - %(funcName)s] '
29 |                                   '%(message)s')
30 |     ch.setFormatter(formatter)
31 |     logger.addHandler(ch)
32 | 
33 |     return logger
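A quick usage sketch for the helper above (names from utils/logutil.py): because get_logger caches loggers by name, asking for the same name twice hands back the already-configured instance, so no duplicate handler is attached and nothing is logged twice.

    from utils.logutil import get_logger

    log = get_logger(__name__)
    log.debug("fetching page %s", 1)
    assert get_logger(__name__) is log  # cached: one handler, no double logging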
author_email="git@einverne.info", 25 | description="A tool to help export douban data ", 26 | license="MIT", 27 | keywords="douban, export, command, tools", 28 | url="https://github.com/einverne/douban-export", 29 | packages=find_packages(exclude=["test"]), 30 | long_description=read('README.md'), 31 | include_package_data=True, 32 | install_requires=requirements, 33 | entry_points={'console_scripts': ['douban=exporter.__main__:cli']}, 34 | ) 35 | -------------------------------------------------------------------------------- /utils/logutil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import logging.handlers 5 | from threading import Lock 6 | 7 | cache = {} 8 | lock = Lock() 9 | 10 | 11 | def get_logger(logger_name): 12 | # logging.getLogger 获取单例,多次调用会加多个handler,重复写的问题 13 | global cache 14 | with lock: 15 | if not cache.get(logger_name): 16 | cache[logger_name] = _get_logger(logger_name) 17 | return cache.get(logger_name) 18 | 19 | 20 | def _get_logger(logger_name): 21 | logger = logging.getLogger(logger_name) 22 | level = "DEBUG" 23 | logger.setLevel(level) 24 | logger.propagate = False # disable dup celery log 25 | ch = logging.StreamHandler() 26 | formatter = logging.Formatter('[%(asctime)s][%(name)s][%(levelname)6s] [%(pathname)s:%(lineno)s - %(funcName)s] ' 27 | '%(message)s') 28 | ch.setFormatter(formatter) 29 | logger.addHandler(ch) 30 | 31 | return logger 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 豆瓣导出工具 2 | 3 | 工具包含 Python 版本和 JavaScript 版本。 4 | 5 | ## Python 6 | Python 版本基于 Python 3.6.x ,其他版本暂未测试。 7 | 8 | 主要实现: 9 | 10 | - 电影导出 11 | - 书籍导出 12 | - 音乐导出 13 | - 日记导出 14 | 15 | ![result](screenshot/screenshot-output-result.png) 16 | 17 | 关于豆瓣相册导出可以参考我 [这个](https://github.com/einverne/douban-dl) 项目。 18 | 19 | ### 命令使用 20 | 21 | 设置 22 | 23 | douban-export setup 24 | 25 | 输入 uesr id,会将用户ID保存到 HOME 目录的 `~/.douban-export` 文件中,如果预先设置,一下的命令可以省略 `-u` 选项。 26 | 27 | 导出电影 28 | 29 | douban-export movie -u einverne -t wish -o wish_movie.csv 30 | 31 | 说明: 32 | 33 | - `-t` 参数可以选择 `collect` 看过,`wish` 想看, `doing` 在看 34 | 35 | 导出书籍 36 | 37 | douban-export book -u einverne -t wish -o wish_book.csv 38 | 39 | 同理 40 | 41 | douban-export music -u einverne -t wish -o wish_music.csv 42 | 43 | 44 | ## JS 45 | 46 | userscript 主要来自于 47 | 48 | - douban-book-export.user.js 49 | - douban-movie-export.user.js 50 | 51 | 分别来自于: 52 | 53 | OpenUserJS 54 | 55 | - 56 | 57 | DannyVim 58 | 59 | - https://raw.githubusercontent.com/DannyVim/ToolsCollection/master/douban_book.js 60 | 61 | 62 | 63 | ## reference 64 | 65 | - 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /exporter/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import abc 4 | import re 5 | 6 | import requests 7 | 8 | 9 | def r0(pattern, text): 10 | m = re.search(pattern, text) 11 | if m: 12 | return m.group(0) 13 | return '' 14 | 15 | 16 | class BaseReview: 17 | def __init__(self): 18 | self.title = '' 19 | self.url = '' 20 | self.id = '' 21 | self.content = '' 22 | self.publish_time = '' 23 | self.useful_count = 0 24 | self.useless_count = 0 25 | self.total_count = 0 26 | 27 | @abc.abstractmethod 28 | def parse(self, item): 29 | raise 
--------------------------------------------------------------------------------
/exporter/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import abc
4 | import re
5 | 
6 | import requests
7 | 
8 | 
9 | def r0(pattern, text):
10 |     m = re.search(pattern, text)
11 |     if m:
12 |         return m.group(0)
13 |     return ''
14 | 
15 | 
16 | class BaseReview(abc.ABC):  # ABC base so @abc.abstractmethod is enforced
17 |     def __init__(self):
18 |         self.title = ''
19 |         self.url = ''
20 |         self.id = ''
21 |         self.content = ''
22 |         self.publish_time = ''
23 |         self.useful_count = 0
24 |         self.useless_count = 0
25 |         self.total_count = 0
26 | 
27 |     @abc.abstractmethod
28 |     def parse(self, item):
29 |         raise NotImplementedError
30 | 
31 |     def update(self, raw_content):
32 |         self.content = raw_content['html']
33 |         if 'votes' in raw_content:
34 |             raw_votes = raw_content['votes']
35 |             if 'useful_count' in raw_votes:
36 |                 self.useful_count = raw_votes['useful_count']
37 |             if 'useless_count' in raw_votes:
38 |                 self.useless_count = raw_votes['useless_count']
39 |             if 'totalcount' in raw_votes:
40 |                 self.total_count = raw_votes['totalcount']
41 | 
42 |     def __str__(self):
43 |         s = []
44 |         for k in self.__dict__:
45 |             s.append("{key}={value}".format(key=k, value=self.__dict__.get(k)))
46 |         return ', '.join(s)
47 | 
48 |     def __repr__(self):
49 |         return self.__str__()
50 | 
51 | 
52 | COLLECT = 'collect'
53 | WISH = 'wish'
54 | DOING = 'do'
55 | 
56 | 
57 | class BaseExporter:
58 | 
59 |     def get_review_content(self, id):
60 |         url = 'https://www.douban.com/j/review/{}/full'.format(id)
61 |         r = requests.get(url, headers={
62 |             'Host': 'www.douban.com',  # must match the host in the URL
63 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
64 |         })
65 |         return r.json()
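get_review_content returns the parsed JSON of the /j/review/{id}/full endpoint, and BaseReview.update reads only the keys shown below (the payload shape is inferred from update(); the values here are made up):

    raw = {
        'html': '<p>review body ...</p>',
        'votes': {'useful_count': 12, 'useless_count': 1, 'totalcount': 13},
    }
    review.update(raw)  # fills content, useful_count, useless_count, total_count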
= MovieExport("einverne") 17 | 18 | def test_get_movies(self): 19 | movies = self.exporter.get_movies(COLLECT) 20 | for m in movies: 21 | log.debug(m) 22 | self.assertIsNotNone(m, 'content should not be none') 23 | self.assertNotEqual(m.title, '', 'movie title fetch failed') 24 | break 25 | 26 | def test_get_watched(self): 27 | movies = self.exporter.get_watched() 28 | for m in movies: 29 | log.debug(m) 30 | self.assertIsNotNone(m, 'content should not be none') 31 | self.assertNotEqual(m.title, '', 'movie title fetch failed') 32 | break 33 | 34 | def test_get_wish(self): 35 | movies = self.exporter.get_wish() 36 | for m in movies: 37 | log.debug(m) 38 | self.assertIsNotNone(m, 'content should not be none') 39 | self.assertNotEqual(m.title, '', 'movie title fetch failed') 40 | break 41 | 42 | def test_get_doing(self): 43 | movies = self.exporter.get_doing() 44 | for m in movies: 45 | log.debug(m) 46 | self.assertIsNotNone(m, 'content should not be none') 47 | self.assertNotEqual(m.title, '', 'movie title fetch failed') 48 | break 49 | 50 | def test_get_reviews(self): 51 | movies = self.exporter.get_reviews() 52 | for m in movies: 53 | log.debug(m) 54 | self.assertIsNotNone(m, 'content should not be none') 55 | self.assertNotEqual(m.title, '', 'movie title fetch failed') 56 | break 57 | 58 | # def test_get_doulist(self): 59 | # movies = self.exporter.get_doulist() 60 | # for m in movies: 61 | # log.debug(m) 62 | # self.assertIsNotNone(m, 'content should not be none') 63 | # self.assertNotEqual(m.title, '', 'movie title fetch failed') 64 | # break 65 | 66 | def test_get_review_content(self): 67 | c = self.exporter.get_review_content('10124597') 68 | log.debug(c) 69 | self.assertIsNotNone(c) 70 | 71 | 72 | if __name__ == '__main__': 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /query/movie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import json 4 | 5 | import urllib.parse 6 | 7 | import requests 8 | from bs4 import BeautifulSoup, NavigableString 9 | 10 | from utils.logutil import log 11 | 12 | SEARCH_URL = 'https://movie.douban.com/j/subject_suggest?q=' 13 | PAGE_URL = 'https://movie.douban.com/subject/%s/' 14 | 15 | 16 | 17 | 18 | class Movie: 19 | def __init__(self): 20 | self.id = '' 21 | self.title = '' 22 | self.score = 0 23 | self.director = '' 24 | self.actor = '' 25 | self.year = '' 26 | self.sub_title = '' 27 | 28 | def __str__(self): 29 | text = '=============== Douban Movie ===============\n' + \ 30 | 'Title: ' + self.title + '\n' + \ 31 | 'Score: ' + str(self.score) + '\n' + \ 32 | 'Year: ' + self.year + '\n' + \ 33 | 'Director: ' + self.director + '\n' + \ 34 | 'Actors: ' + self.actor + '\n' + \ 35 | '================================================' 36 | return text.encode('utf-8') 37 | 38 | 39 | def search(query_word): 40 | query_word = urllib.parse.quote(query_word) 41 | url = SEARCH_URL + query_word 42 | r = requests.get(url) 43 | if r.status_code != 200: 44 | return 45 | 46 | data = r.text.encode('utf-8') 47 | items = json.loads(data) 48 | if len(items) == 0: 49 | return 50 | movies = [] 51 | for item in items: 52 | if item['type'] != 'movie': 53 | continue 54 | movie = Movie() 55 | movie.id = item['id'] 56 | movie.title = item['title'] 57 | movie.year = item['year'] 58 | movie.sub_title = item['sub_title'] 59 | movies.append(movie) 60 | return movies 61 | 62 | 63 | def parse(movie): 64 | url = PAGE_URL % movie.id 65 
--------------------------------------------------------------------------------
/query/movie.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import json
4 | 
5 | import urllib.parse
6 | 
7 | import requests
8 | from bs4 import BeautifulSoup, NavigableString
9 | 
10 | from utils.logutil import get_logger
11 | 
12 | log = get_logger(__name__)
13 | 
14 | SEARCH_URL = 'https://movie.douban.com/j/subject_suggest?q='
15 | PAGE_URL = 'https://movie.douban.com/subject/%s/'
16 | 
17 | 
18 | class Movie:
19 |     def __init__(self):
20 |         self.id = ''
21 |         self.title = ''
22 |         self.score = 0
23 |         self.director = ''
24 |         self.actor = ''
25 |         self.year = ''
26 |         self.sub_title = ''
27 | 
28 |     def __str__(self):
29 |         text = '=============== Douban Movie ===============\n' + \
30 |                'Title: ' + self.title + '\n' + \
31 |                'Score: ' + str(self.score) + '\n' + \
32 |                'Year: ' + self.year + '\n' + \
33 |                'Director: ' + self.director + '\n' + \
34 |                'Actors: ' + self.actor + '\n' + \
35 |                '================================================'
36 |         return text  # __str__ must return str, not bytes, on Python 3
37 | 
38 | 
39 | def search(query_word):
40 |     query_word = urllib.parse.quote(query_word)
41 |     url = SEARCH_URL + query_word
42 |     r = requests.get(url)
43 |     if r.status_code != 200:
44 |         return
45 | 
46 |     items = r.json()
47 |     if len(items) == 0:
48 |         return
49 |     movies = []
50 |     for item in items:
51 |         if item['type'] != 'movie':
52 |             continue
53 |         movie = Movie()
54 |         movie.id = item['id']
55 |         movie.title = item['title']
56 |         movie.year = item['year']
57 |         movie.sub_title = item['sub_title']
58 |         movies.append(movie)
59 |     return movies
60 | 
61 | 
62 | def parse(movie):
63 |     url = PAGE_URL % movie.id
64 |     log.debug(url)
65 |     r = requests.get(url)
66 |     soup = BeautifulSoup(r.text, 'lxml')
67 |     movie.score = soup.find('strong', 'rating_num').text
68 |     info = soup.find('div', {'id': 'info'})
69 |     for linebreak in info.find_all('br'):
70 |         linebreak.extract()
71 |     for span in info.contents:
72 |         if isinstance(span, NavigableString):
73 |             continue
74 |         if span.contents[0]:
75 |             if span.contents[0].string == u'导演':
76 |                 if isinstance(span.contents[1], NavigableString):
77 |                     movie.director = span.contents[2].text
78 |             elif span.contents[0].string == u'主演':
79 |                 if isinstance(span.contents[1], NavigableString):
80 |                     movie.actor = span.contents[2].text
81 |     print(movie)
82 | 
83 | 
84 | def get_movie(text):
85 |     movies = search(text)
86 |     if movies and len(movies):
87 |         parse(movies[0])
88 |     else:
89 |         print('could not find movie: ' + text)
90 | 
91 | 
92 | if __name__ == '__main__':
93 |     get_movie("zootopia")
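search() keeps only the entries of type movie from the subject_suggest response; each item it consumes looks roughly like this (keys taken from the code above, values hypothetical):

    {'type': 'movie', 'id': '1234567', 'title': 'Zootopia', 'year': '2016', 'sub_title': '疯狂动物城'}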
"https://www.douban.com/j/note/{}/full".format(id) 89 | r = requests.get(url, headers={ 90 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 91 | 'Accept-Encoding': 'gzip', 92 | 'Host': 'www.douban.com', 93 | 'Referer': self.user_url, 94 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' 95 | }) 96 | return r.json()['html'] 97 | 98 | 99 | if __name__ == '__main__': 100 | m = NoteExport('einverne') 101 | for note in m.get_notes(): 102 | print(note) 103 | -------------------------------------------------------------------------------- /exporter/status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import re 4 | 5 | import requests 6 | from bs4 import BeautifulSoup 7 | 8 | 9 | class MusicInfo: 10 | def __init__(self): 11 | self.title = '' 12 | self.url = '' 13 | self.intro = '' 14 | self.tags = '' 15 | self.comment = '' 16 | self.rating_date = '' 17 | self.rating = '' 18 | 19 | @classmethod 20 | def parse(cls, item): 21 | instance = cls() 22 | instance.title = item.select('.title a')[0].text.strip() 23 | instance.url = item.select('.title a')[0]['href'] 24 | instance.intro = item.select('.intro')[0].text.strip() 25 | instance.rating_date = item.select('.date')[0].text.strip() 26 | if len(item.select('.date span')) > 0: 27 | instance.rating = item.select('.date span')[0]['class'][0][6] 28 | if len(item.select('.tags')) > 0: 29 | instance.tags = item.select('.tags')[0].text 30 | if len(item.select('.comment')) > 0: 31 | instance.comment = item.select('.comment')[0].text.strip() 32 | return instance 33 | 34 | def __str__(self): 35 | s = [] 36 | for k in self.__dict__: 37 | s.append("{key}={value}".format(key=k, value=self.__dict__.get(k))) 38 | return ', '.join(s) 39 | 40 | def __repr__(self): 41 | return self.__str__() 42 | 43 | 44 | class StatusExport: 45 | """ 46 | 遍历网页的问题可能被豆瓣反爬虫机制伤及,如果能够直接从接口 dump 数据就比较快 47 | """ 48 | BASE_URL = 'https://www.douban.com/people/{}/statuses' 49 | 50 | def __init__(self, nickname): 51 | self.user_url = StatusExport.BASE_URL.format(nickname) 52 | 53 | def get_status(self): 54 | """ 55 | https://music.douban.com/people/einverne/collect 56 | 第 1 页 https://music.douban.com/people/einverne/collect?start=0&sort=time&rating=all&filter=all&mode=grid 57 | 第 2 页 https://music.douban.com/people/einverne/collect?start=15&sort=time&rating=all&filter=all&mode=grid 58 | 第 3 页 https://music.douban.com/people/einverne/collect?start=30&sort=time&rating=all&filter=all&mode=grid 59 | ... 
--------------------------------------------------------------------------------
/exporter/status.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import requests
4 | from bs4 import BeautifulSoup
5 | 
6 | 
7 | class StatusInfo:
8 |     # NOTE: fields and selectors were copied from the music exporter and
9 |     # still need to be adapted to the markup of status items.
10 |     def __init__(self):
11 |         self.title = ''
12 |         self.url = ''
13 |         self.intro = ''
14 |         self.tags = ''
15 |         self.comment = ''
16 |         self.rating_date = ''
17 |         self.rating = ''
18 | 
19 |     @classmethod
20 |     def parse(cls, item):
21 |         instance = cls()
22 |         instance.title = item.select('.title a')[0].text.strip()
23 |         instance.url = item.select('.title a')[0]['href']
24 |         instance.intro = item.select('.intro')[0].text.strip()
25 |         instance.rating_date = item.select('.date')[0].text.strip()
26 |         if len(item.select('.date span')) > 0:
27 |             instance.rating = item.select('.date span')[0]['class'][0][6]
28 |         if len(item.select('.tags')) > 0:
29 |             instance.tags = item.select('.tags')[0].text
30 |         if len(item.select('.comment')) > 0:
31 |             instance.comment = item.select('.comment')[0].text.strip()
32 |         return instance
33 | 
34 |     def __str__(self):
35 |         s = []
36 |         for k in self.__dict__:
37 |             s.append("{key}={value}".format(key=k, value=self.__dict__.get(k)))
38 |         return ', '.join(s)
39 | 
40 |     def __repr__(self):
41 |         return self.__str__()
42 | 
43 | 
44 | class StatusExport:
45 |     """
46 |     Crawling pages may trip Douban's anti-scraping defenses; dumping straight from the JSON endpoints would be faster.
47 |     """
48 |     BASE_URL = 'https://www.douban.com/people/{}/statuses'
49 | 
50 |     def __init__(self, nickname):
51 |         self.user_url = StatusExport.BASE_URL.format(nickname)
52 | 
53 |     def get_status(self):
54 |         """
55 |         Statuses are paged by page number rather than by offset:
56 |         page 1: https://www.douban.com/people/einverne/statuses?p=1
57 |         page 2: https://www.douban.com/people/einverne/statuses?p=2
58 |         ...
59 |         """
60 |         page = 1
61 |         while True:
62 |             item_list = self.__get_status_by_page(page)
63 |             if len(item_list) == 0:
64 |                 break
65 |             for item in item_list:
66 |                 yield StatusInfo.parse(item)
67 |             page += 1
68 | 
69 |     def __get_status_by_page(self, page_num=1):
70 |         url = self.user_url
71 |         r = requests.get(url, params={
72 |             'p': page_num,
73 |         }, headers={
74 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
75 |             'Accept-Encoding': 'gzip, deflate, br',
76 |             'Referer': url,
77 |             'Host': 'www.douban.com'
78 |         })
79 |         # res = brotli.decompress(r.content)
80 |         soup = BeautifulSoup(r.text, 'html.parser')
81 |         # both classes sit on the same element, so no space in the selector
82 |         item_list = soup.select('.new-status.status-wrapper')
83 |         return item_list
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     m = StatusExport('einverne')
88 |     for item in m.get_status():
89 |         print(item)
--------------------------------------------------------------------------------
/exporter/book.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from bs4 import BeautifulSoup
5 | 
6 | from exporter import *
7 | 
8 | 
9 | class BookInfo(object):
10 |     def __init__(self):
11 |         self.title = ''
12 |         self.url = ''
13 |         self.intro = ''
14 |         self.tags = ''
15 |         self.comment = ''
16 |         self.rating = ''
17 |         self.rating_date = ''
18 | 
19 |     @classmethod
20 |     def parse(cls, item):
21 |         instance = cls()
22 |         instance.title = item.select('.title a')[0].text.strip()
23 |         instance.url = item.select('.title a')[0]['href']
24 |         instance.intro = item.select('.intro')[0].text.strip()
25 |         instance.rating_date = item.select('.date')[0].text.strip()
26 |         if len(item.select('.date span')) > 0:
27 |             instance.rating = item.select('.date span')[0]['class'][0][6]
28 |         if len(item.select('.tags')) > 0:
29 |             instance.tags = item.select('.tags')[0].text
30 |         if len(item.select('.comment')) > 0:
31 |             instance.comment = item.select('.comment')[0].text.strip()
32 |         return instance
33 | 
34 |     def __str__(self):
35 |         s = []
36 |         for k in self.__dict__:
37 |             s.append("{key}={value}".format(key=k, value=self.__dict__.get(k)))
38 |         return ', '.join(s)
39 | 
40 |     def __repr__(self):
41 |         return self.__str__()
42 | 
43 | 
44 | class BookReview(BaseReview):
45 | 
46 |     def parse(self, item):
47 |         self.title = item.select('h3')[0].text.strip()
48 |         self.url = item.select('h3 > a')[0]['href']
49 |         self.id = r0(r'\d+', self.url)
50 |         return self
51 | 
52 | 
53 | class BookExport(BaseExporter):
54 |     BASE_URL = 'https://book.douban.com/people/{}'
55 | 
56 |     def __init__(self, nickname):
57 |         self.user_url = BookExport.BASE_URL.format(nickname)
58 | 
59 |     def get_books(self, path=COLLECT):
60 |         start = 0
61 |         while True:
62 |             item_list = self.__get_book_list(path, start)
63 |             step = len(item_list)
64 |             if step == 0:
65 |                 break
66 |             for item in item_list:
67 |                 yield BookInfo.parse(item)
68 |             if step < 30:
69 |                 break
70 |             start += step
71 | 
72 |     def __get_book_list(self, path=COLLECT, start=0):
73 |         url = self.user_url + '/' + path
74 |         r = requests.get(url, params={
75 |             'start': start,
76 |             'sort': 'time',
77 |             'rating': 'all',
78 |             'filter': 'all',
79 |             'mode': 'list'
80 |         }, headers={
81 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
82 |             'Referer': url + '?start=0&sort=time&rating=all&filter=all&mode=list',
83 |             'Host': 'book.douban.com'
84 |         })
85 |         soup = BeautifulSoup(r.text, 'html.parser')
86 |         return soup.select('.item')
87 | 
88 |     def get_read(self):
89 |         return self.get_books(COLLECT)
90 | 
91 |     def get_wish(self):
92 |         return self.get_books(WISH)
93 | 
94 |     def get_reading(self):
95 |         return self.get_books(DOING)
96 | 
97 |     def get_reviews(self):
98 |         start = 0
99 |         while True:
100 |             reviews_list = self.__get_reviews_list(start)
101 |             step = len(reviews_list)
102 |             if step == 0:
103 |                 break
104 |             for review in reviews_list:
105 |                 r = BookReview()
106 |                 r.parse(review)
107 |                 content = self.get_review_content(r.id)
108 |                 r.update(content)
109 |                 yield r
110 |             start += step
111 | 
112 |     def __get_reviews_list(self, start=0):
113 |         url = self.user_url + '/reviews'
114 |         r = requests.get(url, params={
115 |             'start': start
116 |         }, headers={
117 |             'Host': 'book.douban.com',
118 |             'Referer': url,
119 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
120 |         })
121 |         soup = BeautifulSoup(r.text, 'html.parser')
122 |         return soup.select('.tlst')
123 | 
124 | 
125 | if __name__ == '__main__':
126 |     b = BookExport('einverne')
127 |     for review in b.get_reviews():
128 |         print(review)
129 |     # for book in b.get_books():
130 |     #     print(book.title)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | *.log
3 | 
4 | ### Intellij+all ###
5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
7 | 
8 | # User-specific stuff
9 | .idea/**/workspace.xml
10 | .idea/**/tasks.xml
11 | .idea/**/usage.statistics.xml
12 | .idea/**/dictionaries
13 | .idea/**/shelf
14 | 
15 | # Generated files
16 | .idea/**/contentModel.xml
17 | 
18 | # Sensitive or high-churn files
19 | .idea/**/dataSources/
20 | .idea/**/dataSources.ids
21 | .idea/**/dataSources.local.xml
22 | .idea/**/sqlDataSources.xml
23 | .idea/**/dynamic.xml
24 | .idea/**/uiDesigner.xml
25 | .idea/**/dbnavigator.xml
26 | 
27 | # Gradle
28 | .idea/**/gradle.xml
29 | .idea/**/libraries
30 | 
31 | # Gradle and Maven with auto-import
32 | # When using Gradle or Maven with auto-import, you should exclude module files,
33 | # since they will be recreated, and may cause churn. Uncomment if using
34 | # auto-import.
35 | # .idea/modules.xml
36 | # .idea/*.iml
37 | # .idea/modules
38 | 
39 | # CMake
40 | cmake-build-*/
41 | 
42 | # Mongo Explorer plugin
43 | .idea/**/mongoSettings.xml
44 | 
45 | # File-based project format
46 | *.iws
47 | 
48 | # IntelliJ
49 | out/
50 | 
51 | # mpeltonen/sbt-idea plugin
52 | .idea_modules/
53 | 
54 | # JIRA plugin
55 | atlassian-ide-plugin.xml
56 | 
57 | # Cursive Clojure plugin
58 | .idea/replstate.xml
59 | 
60 | # Crashlytics plugin (for Android Studio and IntelliJ)
61 | com_crashlytics_export_strings.xml
62 | crashlytics.properties
63 | crashlytics-build.properties
64 | fabric.properties
65 | 
66 | # Editor-based Rest Client
67 | .idea/httpRequests
68 | 
69 | # Android studio 3.1+ serialized cache file
70 | .idea/caches/build_file_checksums.ser
71 | 
72 | # JetBrains templates
73 | **___jb_tmp___
74 | 
75 | ### Intellij+all Patch ###
76 | # Ignores the whole .idea folder and all .iml files
77 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
78 | 
79 | .idea/
80 | 
81 | # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
82 | 
83 | *.iml
84 | modules.xml
85 | .idea/misc.xml
86 | *.ipr
87 | 
88 | # Sonarlint plugin
89 | .idea/sonarlint
90 | 
91 | ### Python ###
92 | # Byte-compiled / optimized / DLL files
93 | __pycache__/
94 | *.py[cod]
95 | *$py.class
96 | 
97 | # C extensions
98 | *.so
99 | 
100 | # Distribution / packaging
101 | .Python
102 | build/
103 | develop-eggs/
104 | dist/
105 | downloads/
106 | eggs/
107 | .eggs/
108 | lib/
109 | lib64/
110 | parts/
111 | sdist/
112 | var/
113 | wheels/
114 | pip-wheel-metadata/
115 | share/python-wheels/
116 | *.egg-info/
117 | .installed.cfg
118 | *.egg
119 | MANIFEST
120 | 
121 | # PyInstaller
122 | # Usually these files are written by a python script from a template
123 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
124 | *.manifest
125 | *.spec
126 | 
127 | # Installer logs
128 | pip-log.txt
129 | pip-delete-this-directory.txt
130 | 
131 | # Unit test / coverage reports
132 | htmlcov/
133 | .tox/
134 | .nox/
135 | .coverage
136 | .coverage.*
137 | .cache
138 | nosetests.xml
139 | coverage.xml
140 | *.cover
141 | .hypothesis/
142 | .pytest_cache/
143 | 
144 | # Translations
145 | *.mo
146 | *.pot
147 | 
148 | # Django stuff:
149 | *.log
150 | local_settings.py
151 | db.sqlite3
152 | 
153 | # Flask stuff:
154 | instance/
155 | .webassets-cache
156 | 
157 | # Scrapy stuff:
158 | .scrapy
159 | 
160 | # Sphinx documentation
161 | docs/_build/
162 | 
163 | # PyBuilder
164 | target/
165 | 
166 | # Jupyter Notebook
167 | .ipynb_checkpoints
168 | 
169 | # IPython
170 | profile_default/
171 | ipython_config.py
172 | 
173 | # pyenv
174 | .python-version
175 | 
176 | # pipenv
177 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
178 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
179 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not
180 | # install all needed dependencies.
181 | #Pipfile.lock
182 | 
183 | # celery beat schedule file
184 | celerybeat-schedule
185 | 
186 | # SageMath parsed files
187 | *.sage.py
188 | 
189 | # Environments
190 | .env
191 | .venv
192 | env/
193 | venv/
194 | ENV/
195 | env.bak/
196 | venv.bak/
197 | 
198 | # Spyder project settings
199 | .spyderproject
200 | .spyproject
201 | 
202 | # Rope project settings
203 | .ropeproject
204 | 
205 | # mkdocs documentation
206 | /site
207 | 
208 | # mypy
209 | .mypy_cache/
210 | .dmypy.json
211 | dmypy.json
212 | 
213 | # Pyre type checker
214 | .pyre/

--------------------------------------------------------------------------------
/exporter/music.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from bs4 import BeautifulSoup
5 | 
6 | from exporter import *
7 | 
8 | 
9 | class MusicInfo:
10 |     def __init__(self):
11 |         self.title = ''
12 |         self.url = ''
13 |         self.intro = ''
14 |         self.tags = ''
15 |         self.comment = ''
16 |         self.rating_date = ''
17 |         self.rating = ''
18 | 
19 |     @classmethod
20 |     def parse(cls, item):
21 |         instance = cls()
22 |         instance.title = item.select('.title a')[0].text.strip()
23 |         instance.url = item.select('.title a')[0]['href']
24 |         instance.intro = item.select('.intro')[0].text.strip()
25 |         instance.rating_date = item.select('.date')[0].text.strip()
26 |         if len(item.select('.date span')) > 0:
27 |             instance.rating = item.select('.date span')[0]['class'][0][6]
28 |         if len(item.select('.tags')) > 0:
29 |             instance.tags = item.select('.tags')[0].text
30 |         if len(item.select('.comment')) > 0:
31 |             instance.comment = item.select('.comment')[0].text.strip()
32 |         return instance
33 | 
34 |     def __str__(self):
35 |         s = []
36 |         for k in self.__dict__:
37 |             s.append("{key}={value}".format(key=k, value=self.__dict__.get(k)))
38 |         return ', '.join(s)
39 | 
40 |     def __repr__(self):
41 |         return self.__str__()
42 | 
43 | 
44 | class MusicReview(BaseReview):
45 | 
46 |     def parse(self, item):
47 |         self.title = item.select('h3')[0].text.strip()
48 |         self.url = item.select('h3 > a')[0]['href']
49 |         self.id = r0(r'\d+', self.url)
50 |         self.publish_time = item.select('.review-create-time')[0].text.strip()
51 |         return self
52 | 
53 | 
54 | class MusicExport(BaseExporter):
55 |     """
56 |     Crawling pages may trip Douban's anti-scraping defenses; dumping straight from the JSON endpoints would be faster.
57 |     """
58 |     BASE_URL = 'https://music.douban.com/people/{}'
59 | 
60 |     def __init__(self, nickname):
61 |         self.user_url = MusicExport.BASE_URL.format(nickname)
62 | 
63 |     def get_musics(self, path=COLLECT):
64 |         """
65 |         https://music.douban.com/people/einverne/collect
66 |         page 1: https://music.douban.com/people/einverne/collect?start=0&sort=time&rating=all&filter=all&mode=grid
67 |         page 2: https://music.douban.com/people/einverne/collect?start=15&sort=time&rating=all&filter=all&mode=grid
68 |         page 3: https://music.douban.com/people/einverne/collect?start=30&sort=time&rating=all&filter=all&mode=grid
69 |         ...
70 |         https://music.douban.com/people/einverne/collect?start=60&sort=time&rating=all&filter=all&mode=grid
71 |         """
72 |         start = 0
73 |         while True:
74 |             item_list = self.__get_music_list(path, start)
75 |             step = len(item_list)
76 |             if step == 0:
77 |                 break
78 |             for item in item_list:
79 |                 yield MusicInfo.parse(item)
80 |             if step < 30:
81 |                 break
82 |             start += step
83 | 
84 |     def __get_music_list(self, path='collect', start=0):
85 |         url = self.user_url + '/' + path
86 |         r = requests.get(url, params={
87 |             'start': start,
88 |             'sort': 'time',
89 |             'rating': 'all',
90 |             'filter': 'all',
91 |             'mode': 'list'
92 |         }, headers={
93 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
94 |             'Accept-Encoding': 'gzip, deflate, br',
95 |             'Referer': url + '?start=0&sort=time&rating=all&filter=all&mode=grid',
96 |             'Host': 'music.douban.com'
97 |         })
98 |         # res = brotli.decompress(r.content)
99 |         soup = BeautifulSoup(r.text, 'html.parser')
100 |         item_list = soup.select('.item')
101 |         return item_list
102 | 
103 |     def get_listened(self):
104 |         return self.get_musics()
105 | 
106 |     def get_wish(self):
107 |         """https://music.douban.com/people/einverne/wish"""
108 |         return self.get_musics(WISH)
109 | 
110 |     def get_doing(self):
111 |         """https://music.douban.com/people/einverne/do"""
112 |         return self.get_musics(DOING)
113 | 
114 |     def get_reviews(self):
115 |         """
116 |         Get all of one's music reviews
117 | 
118 |         https://music.douban.com/people/einverne/reviews?start=0
119 |         https://music.douban.com/j/review/10000057/fullinfo?show_works=False
120 |         """
121 |         start = 0
122 |         while True:
123 |             reviews_list = self.__get_reviews_list(start)
124 |             step = len(reviews_list)
125 |             if step == 0:
126 |                 break
127 |             for review in reviews_list:
128 |                 r = MusicReview()
129 |                 r.parse(review)
130 |                 content = self.get_review_content(r.id)
131 |                 r.update(content)
132 |                 yield r
133 |             start += step
134 | 
135 |     def __get_reviews_list(self, start=0):
136 |         url = self.user_url + '/reviews'
137 |         r = requests.get(url, params={
138 |             'start': start
139 |         }, headers={
140 |             'Host': 'music.douban.com',
141 |             'Referer': self.user_url + '/reviews?start=10',
142 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
143 |         })
144 |         soup = BeautifulSoup(r.text, 'html.parser')
145 |         return soup.select('.comment-item')
146 | 
147 |     def get_doulist(self):
148 |         """
149 |         Created doulists:  https://www.douban.com/people/einverne/doulists/all?start=20&tag=
150 |         Followed doulists: https://www.douban.com/people/einverne/doulists/collect?start=20
151 |         """
152 |         pass
153 | 
154 | 
155 | if __name__ == '__main__':
156 |     m = MusicExport('einverne')
157 |     # l = m.get_musics()
158 |     # for item in l:
159 |     #     print(item)
160 |     # wishes = m.get_wish()
161 |     # for wish in wishes:
162 |     #     print(wish)
163 |     reviews = m.get_reviews()
164 |     for r in reviews:
165 |         print(r)

--------------------------------------------------------------------------------
/exporter/movie.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from bs4 import BeautifulSoup
5 | 
6 | from exporter import *
7 | 
8 | """
9 | 
10 | A sample list item from a /people/<user>/collect page (mode=list).
11 | The HTML tags of the original example were lost; the skeleton below is
12 | reconstructed from the selectors MovieInfo.parse uses, keeping the
13 | text that survived:
14 | 
15 | <li class="item">
16 |   <div class="title">
17 |     <a href="https://movie.douban.com/subject/.../">门锁</a>
18 |   </div>
19 |   <div class="date"><span class="rating3-t"></span> 2019-03-16</div>
20 |   <div class="intro">2018-12-05(韩国) / 孔晓振 / 金叡园 / 金圣武 / 赵福来 / 李家燮 / 李天熙 / 金在华 / 金光奎 / 韩智恩 / 车烨 / 裴明真 / 郑钟宇 / 李钟求 / 尹钟硕 / 李相熹 / 韩国 / 李权 / 102分钟 / 门锁 / 悬疑 / 惊悚 / 朴正熙 Jeong-hee Park / 李权 Kwon Lee / 阿尔贝托·马里尼 Alberto Marini / 韩语</div>
21 |   <div class="tags">标签: 犯罪 惊悚悬疑 剧情 悬疑 韩国 恐怖 2018 惊悚 惊悚片</div>
22 |   <div class="comment"></div>
23 | </li>
24 | 
25 | (Tags and comment are optional; parse() guards each with a length
26 | check before reading it.)
27 | 
28 | """
29 | 
30 | 
31 | class MovieInfo:
32 |     def __init__(self):
33 |         self.title = ''
34 |         self.url = ''
35 |         self.intro = ''
36 |         self.tags = ''
37 |         self.comment = ''
38 |         self.rating_date = ''
39 |         self.rating = ''
40 | 
41 |     @classmethod
42 |     def parse(cls, item):
43 |         instance = cls()
44 |         instance.title = item.select('.title a')[0].text.strip()
45 |         instance.url = item.select('.title a')[0]['href']
46 |         instance.intro = item.select('.intro')[0].text.strip()
47 |         instance.rating_date = item.select('.date')[0].text.strip()
48 |         if len(item.select('.date span')) > 0:
49 |             instance.rating = item.select('.date span')[0]['class'][0][6]
50 |         if len(item.select('.tags')) > 0:
51 |             instance.tags = item.select('.tags')[0].text
52 |         if len(item.select('.comment')) > 0:
53 |             instance.comment = item.select('.comment')[0].text.strip()
54 |         return instance
55 | 
56 |     def __str__(self):
57 |         s = []
58 |         for k in self.__dict__:
59 |             s.append("{key}={value}".format(key=k, value=self.__dict__.get(k)))
60 |         return ', '.join(s)
61 | 
62 |     def __repr__(self):
63 |         return self.__str__()
64 | 
65 | 
66 | class MovieReview(BaseReview):
67 | 
68 |     def parse(self, item):
69 |         self.title = item.select('h3')[0].text.strip()
70 |         self.url = item.select('h3 > a')[0]['href']
71 |         self.id = r0(r'\d+', self.url)
72 |         return self
73 | 
74 | 
75 | class MovieExport(BaseExporter):
76 |     """
77 |     Crawling pages may trip Douban's anti-scraping defenses; dumping straight from the JSON endpoints would be faster.
78 |     """
79 |     BASE_URL = 'https://movie.douban.com/people/{}'
80 | 
81 |     def __init__(self, nickname):
82 |         self.user_url = MovieExport.BASE_URL.format(nickname)
83 | 
84 |     def get_movies(self, path=COLLECT):
85 |         """
86 |         https://movie.douban.com/people/einverne/collect
87 |         page 1: https://movie.douban.com/people/einverne/collect?start=0&sort=time&rating=all&filter=all&mode=grid
88 |         page 2: https://movie.douban.com/people/einverne/collect?start=15&sort=time&rating=all&filter=all&mode=grid
89 |         page 3: https://movie.douban.com/people/einverne/collect?start=30&sort=time&rating=all&filter=all&mode=grid
90 |         ...
91 |         https://movie.douban.com/people/einverne/collect?start=60&sort=time&rating=all&filter=all&mode=grid
92 |         """
93 |         start = 0
94 |         while True:
95 |             item_list = self.__get_movie_list(path, start)
96 |             step = len(item_list)
97 |             if step == 0:
98 |                 break
99 |             for item in item_list:
100 |                 yield MovieInfo.parse(item)
101 |             if step < 30:
102 |                 break
103 |             start += step
104 | 
105 |     def __get_movie_list(self, path='collect', start=0):
106 |         url = self.user_url + '/' + path
107 |         r = requests.get(url, params={
108 |             'start': start,
109 |             'sort': 'time',
110 |             'rating': 'all',
111 |             'filter': 'all',
112 |             'mode': 'list'
113 |         }, headers={
114 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
115 |             'Accept-Encoding': 'gzip, deflate, br',
116 |             'Referer': url + '?start=0&sort=time&rating=all&filter=all&mode=grid',
117 |             'Host': 'movie.douban.com'
118 |         })
119 |         # res = brotli.decompress(r.content)
120 |         soup = BeautifulSoup(r.text, 'html.parser')
121 |         item_list = soup.select('.item')
122 |         return item_list
123 | 
124 |     def get_watched(self):
125 |         return self.get_movies()
126 | 
127 |     def get_wish(self):
128 |         """https://movie.douban.com/people/einverne/wish"""
129 |         return self.get_movies(WISH)
130 | 
131 |     def get_doing(self):
132 |         """https://movie.douban.com/people/einverne/do"""
133 |         return self.get_movies(DOING)
134 | 
135 |     def get_reviews(self):
136 |         """
137 |         Get all of one's movie reviews
138 | 
139 |         https://movie.douban.com/people/einverne/reviews?start=0
140 |         https://movie.douban.com/j/review/10000057/fullinfo?show_works=False
141 |         """
142 |         start = 0
143 |         while True:
144 |             reviews_list = self.__get_reviews_list(start)
145 |             step = len(reviews_list)
146 |             if step == 0:
147 |                 break
148 |             for review in reviews_list:
149 |                 r = MovieReview()
150 |                 r.parse(review)
151 |                 raw_content = self.get_review_content(r.id)
152 |                 r.update(raw_content)
153 |                 yield r
154 |             start += step
155 | 
156 |     def __get_reviews_list(self, start=0):
157 |         url = self.user_url + '/reviews'
158 |         r = requests.get(url, params={
159 |             'start': start
160 |         }, headers={
161 |             'Host': 'movie.douban.com',
162 |             'Referer': self.user_url + '/reviews?start=10',
163 |             'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
164 |         })
165 |         soup = BeautifulSoup(r.text, 'html.parser')
166 |         return soup.select('.tlst')
167 | 
168 |     def get_doulist(self):
169 |         """
170 |         Created doulists:  https://www.douban.com/people/einverne/doulists/all?start=20&tag=
171 |         Followed doulists: https://www.douban.com/people/einverne/doulists/collect?start=20
172 |         """
173 |         pass
174 | 
175 | 
176 | if __name__ == '__main__':
177 |     m = MovieExport('einverne')
178 |     # l = m.get_movies()
179 |     # for item in l:
180 |     #     print(item)
181 |     # wishes = m.get_wish()
182 |     # for wish in wishes:
183 |     #     print(wish)
184 |     reviews = m.get_reviews()
185 |     for r in reviews:
186 |         print(r)
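Both the Python exporters and the userscripts below recover the star rating from the class of the span inside .date: the digit sits at index 6 of a class name like rating3-t, which is what ['class'][0][6] reads here and .slice(6, 7) reads in the userscripts. A tiny illustration:

    cls_name = 'rating3-t'  # class of the .date span for a three-star entry
    rating = cls_name[6]    # -> '3'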
--------------------------------------------------------------------------------
/exporter/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import codecs
5 | import csv
6 | import os
7 | from configparser import ConfigParser
8 | 
9 | import click
10 | 
11 | from exporter.book import BookExport
12 | from exporter.movie import MovieExport
13 | from exporter.music import MusicExport
14 | from exporter.notes import NoteExport
15 | 
16 | CONFIG_PATH = os.path.join(os.path.expanduser('~'), '.douban-export')  # expanduser also covers platforms without $HOME
17 | 
18 | 
19 | def read_config():
20 |     config = ConfigParser()
21 |     if os.path.exists(CONFIG_PATH):
22 |         config.read(CONFIG_PATH)
23 |     return config
24 | 
25 | 
26 | @click.group(context_settings=dict(help_option_names=['-h', '--help']))
27 | def cli():
28 |     pass
29 | 
30 | 
31 | def save_movie(l, writer):
32 |     for m in l:
33 |         click.echo(m.title)
34 |         writer.writerow([
35 |             m.title,
36 |             m.url,
37 |             m.intro,
38 |             m.tags,
39 |             m.comment,
40 |             m.rating_date,
41 |             m.rating
42 |         ])
43 | 
44 | 
45 | def save_book(l, writer):
46 |     for b in l:
47 |         click.echo(b.title)
48 |         writer.writerow([
49 |             b.title,
50 |             b.url,
51 |             b.intro,
52 |             b.tags,
53 |             b.comment,
54 |             b.rating_date,
55 |             b.rating
56 |         ])
57 | 
58 | 
59 | def save_music(l, writer):
60 |     for music in l:
61 |         click.echo(music.title)
62 |         writer.writerow([
63 |             music.title,
64 |             music.url,
65 |             music.intro,
66 |             music.tags,
67 |             music.comment,
68 |             music.rating_date,
69 |             music.rating
70 |         ])
71 | 
72 | 
73 | @cli.command()
74 | @click.option('-u', '--userid', required=False, help='user id')
75 | @click.option('-t', '--type', required=False,
76 |               type=click.Choice(['collect', 'wish', 'doing']),
77 |               default='collect',
78 |               help='type of list, collect, wish, doing')
79 | @click.option('-o', '--outfile', help='output filename')
80 | def movie(userid, type, outfile):
81 |     if not userid:
82 |         config = read_config()
83 |         if 'auth' in config and 'username' in config['auth']:
84 |             userid = config['auth']['username']
85 |         else:
86 |             click.echo("run setup first or pass -u parameter")
87 |             return
88 |     movie_exporter = MovieExport(userid)
89 |     fout = codecs.open(outfile, mode='w', encoding='utf-8')
90 |     writer = csv.writer(fout, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
91 |     if type == 'collect':
92 |         save_movie(movie_exporter.get_watched(), writer)
93 |     elif type == 'wish':
94 |         save_movie(movie_exporter.get_wish(), writer)
95 |     elif type == 'doing':
96 |         save_movie(movie_exporter.get_doing(), writer)
97 |     fout.close()
98 | 
99 | 
100 | @cli.command()
101 | @click.option('-u', '--userid', required=False, help='user id')
102 | @click.option('-t', '--type', required=False,
103 |               type=click.Choice(['collect', 'wish', 'doing']),
104 |               default='collect',
105 |               help='type of list, collect, wish, doing')
106 | @click.option('-o', '--outfile', help='output filename')
107 | def book(userid, type, outfile):
108 |     if not userid:
109 |         config = read_config()
110 |         if 'auth' in config and 'username' in config['auth']:
111 |             userid = config['auth']['username']
112 |         else:
113 |             click.echo('run setup first or pass -u parameter')
114 |             return
115 |     exporter = BookExport(userid)
116 |     fout = codecs.open(outfile, mode='w', encoding='utf-8')
117 |     writer = csv.writer(fout, delimiter=',', quotechar='"',
118 |                         quoting=csv.QUOTE_MINIMAL)
119 |     if type == 'collect':
120 |         save_book(exporter.get_read(), writer)
121 |     elif type == 'wish':
122 |         save_book(exporter.get_wish(), writer)
123 |     elif type == 'doing':
124 |         save_book(exporter.get_reading(), writer)
125 |     fout.close()
126 | 
127 | 
128 | @cli.command()
129 | @click.option('-u', '--userid', required=False, help='user id')
130 | @click.option('-t', '--type', required=False,
131 |               type=click.Choice(['collect', 'wish', 'doing']),
132 |               default='collect',
133 |               help='type of list, collect, wish, doing')
134 | @click.option('-o', '--outfile', help='output filename')
135 | def music(userid, type, outfile):
136 |     if not userid:
137 |         config = read_config()
138 |         if 'auth' in config and 'username' in config['auth']:
139 |             userid = config['auth']['username']
140 |         else:
141 |             click.echo('run setup first or pass -u parameter')
142 |             return
143 |     exporter = MusicExport(userid)
144 |     fout = codecs.open(outfile, mode='w', encoding='utf-8')
145 |     writer = csv.writer(fout, delimiter=',', quotechar='"',
146 |                         quoting=csv.QUOTE_MINIMAL)
147 |     if type == 'collect':
148 |         save_music(exporter.get_listened(), writer)
149 |     elif type == 'wish':
150 |         save_music(exporter.get_wish(), writer)
151 |     elif type == 'doing':
152 |         save_music(exporter.get_doing(), writer)
153 |     fout.close()
154 | 
155 | 
156 | def save_note(notes, writer):
157 |     for note in notes:
158 |         writer.writerow([
159 |             note.title,
160 |             note.url,
161 |             note.id,
162 |             note.content,
163 |             note.publish_time
164 |         ])
165 | 
166 | 
167 | @cli.command()
168 | @click.option('-u', '--userid', required=False, help='user id')
169 | @click.option('-o', '--outfile', help='output filename')
170 | def note(userid, outfile):
171 |     if not userid:
172 |         config = read_config()
173 |         if 'auth' in config and 'username' in config['auth']:
174 |             userid = config['auth']['username']
175 |         else:
176 |             click.echo('run setup first or pass -u parameter')
177 |             return
178 |     exporter = NoteExport(userid)
179 |     fout = codecs.open(outfile, mode='w', encoding='utf-8')
180 |     writer = csv.writer(fout, delimiter=',', quotechar='"',
181 |                         quoting=csv.QUOTE_MINIMAL)
182 |     save_note(exporter.get_notes(), writer)
183 |     fout.close()
184 | 
185 | 
186 | @cli.command()
187 | def setup():
188 |     """set up username"""
189 |     config = read_config()
190 |     if 'auth' in config and 'username' in config['auth']:
191 |         click.echo("username already setup: " + config['auth']['username'])
192 |         return
193 |     username = input("UserId: ").strip()
194 |     config['auth'] = {'username': username}
195 |     with codecs.open(CONFIG_PATH, mode='w', encoding='utf-8') as fconfig:
196 |         config.write(fconfig)
197 | 
198 | 
199 | if __name__ == '__main__':
200 |     cli()
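setup stores the user id through ConfigParser, so after running it once, ~/.douban-export is a small INI file along these lines (the username value is an example):

    [auth]
    username = einverne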
--------------------------------------------------------------------------------
/douban-movie-export.user.js:
--------------------------------------------------------------------------------
1 | // ==UserScript==
2 | // @name Douban movie export (豆瓣电影导出工具)
3 | // @namespace https://kisexu.com/
4 | // @version 0.1
5 | // @description Exports your watched Douban movies to a CSV file. With the script enabled, open a Douban profile page and an "导出看过电影" (export watched movies) link appears in the movie section; click it. No login required; any user's watched movies can be exported.
6 | // @author KiseXu
7 | // @copyright 2018, KiseXu (https://kisexu.com)
8 | // @license MIT
9 | // @match https://movie.douban.com/people/*/collect*
10 | // @match https://www.douban.com/people/*
11 | // @require https://unpkg.com/dexie@latest/dist/dexie.js
12 | // @grant none
13 | // ==/UserScript==
14 | 
15 | // ==OpenUserJs==
16 | // @author KiseXu
17 | // ==/OpenUserJs==
18 | 
19 | (function() {
20 |     'use strict';
21 | 
22 |     // page trigger
23 |     if (location.href.indexOf('//www.douban.com/') > -1) {
24 |         // insert the export link
25 |         var people = location.href.slice(location.href.indexOf('/people') + 8, -1);
26 |         var export_link = 'https://movie.douban.com/people/' + people + '/collect?start=0&sort=time&rating=all&filter=all&mode=list&export=1';
27 |         $('#movie .pl a:last').after(' · <a href="' + export_link + '">导出看过电影</a>')  // <a> markup reconstructed; export_link is otherwise unused
28 |     }
29 | 
30 |     if (location.href.indexOf('//movie.douban.com/') > -1 && location.href.indexOf('export=1') > -1) {
31 |         // start exporting
32 |         getPage();
33 |     }
34 | 
35 | 
36 |     // scrape the entries on the current page
37 |     function getCurrentPageList() {
38 |         var items = [];
39 | 
40 |         $('li.item').each(function(index) {
41 |             items[index] = {
42 |                 title: $(this).find('a').text().replace(/修改删除/, '').trim(),
43 |                 rating: ($(this).find('.date span').attr('class')) ? $(this).find('.date span').attr('class').slice(6, 7) : '',
44 |                 date: $(this).find('.date').text().trim(),
45 |                 link: $(this).find('.title a').attr('href').trim(),
46 |             };
47 |         });
48 | 
49 |         return items;
50 |     }
51 | 
52 |     // collect the current page and save it to IndexedDB
53 |     function getPage() {
54 |         const db = new Dexie('db_export');
55 |         db.version(1).stores({
56 |             items: `++id, title, rating, date, link`
57 |         });
58 | 
59 |         var items = getCurrentPageList();
60 |         db.items.bulkAdd(items).then (function(){
61 |             console.log('保存成功');
62 |             // follow the next-page link
63 |             var next_link = $('span.next a').attr('href');
64 |             if (next_link) {
65 |                 next_link = next_link + '&export=1';
66 |                 window.location.href = next_link;
67 |             } else {
68 |                 exportAll()
69 |             }
70 |         }).catch(function(error) {
71 |             console.log("Ooops: " + error);
72 |         });
73 | 
74 |     }
75 | 
76 |     // export everything to CSV
77 |     function exportAll() {
78 |         const db = new Dexie('db_export');
79 |         db.version(1).stores({
80 |             items: `++id, title, rating, date, link`
81 |         });
82 |         db.items.orderBy('date').toArray().then(function(all){
83 |             all = all.map(function(item,index,array){
84 |                 delete item.id;
85 |                 return item;
86 |             })
87 | 
88 |             JSonToCSV.setDataConver({
89 |                 data: all,
90 |                 fileName: 'movie',
91 |                 columns: {
92 |                     title: ['片名', '个人评分', '打分日期', '影片链接'],
93 |                     key: ['title', 'rating', 'date', 'link']
94 |                 }
95 |             });
96 |             db.delete();
97 |         });
98 |     }
99 | 
100 |     // CSV export helper
101 |     // https://github.com/liqingzheng/pc/blob/master/JsonExportToCSV.js
102 |     var JSonToCSV = {
103 |         /*
104 |          * obj is an object containing:
105 |          * ## data      the rows to export
106 |          * ## fileName  the file name used when saving (string)
107 |          * ## showLabel whether to render the header row (boolean, defaults to true)
108 |          * ## columns   header config; title and key must correspond one to one:
109 |              title:[], // header captions
110 |              key:[],   // keys used to read each row
111 |              formatter: function() // optional per-cell formatter, called with (key, value)
112 |          */
113 |         setDataConver: function(obj) {
114 |             var bw = this.browser();
115 |             if(bw['ie'] < 9) return; // bail out below IE9
116 |             var data = obj['data'],
117 |                 ShowLabel = typeof obj['showLabel'] === 'undefined' ? true : obj['showLabel'],
118 |                 fileName = (obj['fileName'] || 'UserExport') + '.csv',
119 |                 columns = obj['columns'] || {
120 |                     title: [],
121 |                     key: [],
122 |                     formatter: undefined
123 |                 };
124 |             ShowLabel = typeof ShowLabel === 'undefined' ? true : ShowLabel;
125 |             var row = "", CSV = '', key;
126 |             // render the header row if requested
127 |             if (ShowLabel) {
128 |                 // custom header captions, if provided
129 |                 if (columns.title.length) {
130 |                     columns.title.map(function(n) {
131 |                         row += n + ',';
132 |                     });
133 |                 } else {
134 |                     // otherwise fall back to the keys of the first data row
135 |                     for (key in data[0]) row += key + ',';
136 |                 }
137 |                 row = row.slice(0, -1); // drop the trailing comma: a,b, => a,b
138 |                 CSV += row + '\r\n'; // line break
139 |             }
140 |             // now the data rows
141 |             data.map(function(n) {
142 |                 row = '';
143 |                 // use the configured keys when present
144 |                 if (columns.key.length) {
145 |                     columns.key.map(function(m) {
146 |                         row += '"' + (typeof columns.formatter === 'function' ? columns.formatter(m, n[m]) || n[m] : n[m]) + '",';
147 |                     });
148 |                 } else {
149 |                     for (key in n) {
150 |                         row += '"' + (typeof columns.formatter === 'function' ? columns.formatter(key, n[key]) || n[key] : n[key]) + '",';
151 |                     }
152 |                 }
153 |                 row = row.slice(0, row.length - 1); // drop the trailing comma
154 |                 CSV += row + '\r\n'; // line break
155 |             });
156 |             if(!CSV) return;
157 |             this.SaveAs(fileName, CSV);
158 |         },
159 |         SaveAs: function(fileName, csvData) {
160 |             var bw = this.browser();
161 |             if(!bw['edge'] && !bw['ie']) { // neither Edge nor IE: use a download link
162 |                 var alink = document.createElement("a");
163 |                 alink.id = "linkDwnldLink";
164 |                 alink.href = this.getDownloadUrl(csvData);
165 |                 document.body.appendChild(alink);
166 |                 var linkDom = document.getElementById('linkDwnldLink');
167 |                 linkDom.setAttribute('download', fileName);
168 |                 linkDom.click();
169 |                 document.body.removeChild(linkDom);
170 |             }
171 |             else if(bw['ie'] >= 10 || bw['edge'] == 'edge') {
172 |                 var _utf = "\uFEFF";
173 |                 var _csvData = new Blob([_utf + csvData], {
174 |                     type: 'text/csv'
175 |                 });
176 |                 navigator.msSaveBlob(_csvData, fileName);
177 |             }
178 |             else {
179 |                 var oWin = window.top.open("about:blank", "_blank");
180 |                 oWin.document.write('sep=,\r\n' + csvData);
181 |                 oWin.document.close();
182 |                 oWin.document.execCommand('SaveAs', true, fileName);
183 |                 oWin.close();
184 |             }
185 |         },
186 |         getDownloadUrl: function(csvData) {
187 |             var _utf = "\uFEFF"; // BOM so Excel opens the CSV as UTF-8 and Chinese text is not garbled
188 |             if (window.Blob && window.URL && window.URL.createObjectURL) {
189 |                 csvData = new Blob([_utf + csvData], {
190 |                     type: 'text/csv'
191 |                 });
192 |                 return URL.createObjectURL(csvData);
193 |             }
194 |             // return 'data:attachment/csv;charset=utf-8,' + _utf + encodeURIComponent(csvData);
195 |         },
196 |         browser: function() {
197 |             var Sys = {};
198 |             var ua = navigator.userAgent.toLowerCase();
199 |             var s;
200 |             (s = ua.indexOf('edge') !== - 1 ? Sys.edge = 'edge' : ua.match(/rv:([\d.]+)\) like gecko/)) ? Sys.ie = s[1]:
201 |             (s = ua.match(/msie ([\d.]+)/)) ? Sys.ie = s[1] :
202 |             (s = ua.match(/firefox\/([\d.]+)/)) ? Sys.firefox = s[1] :
203 |             (s = ua.match(/chrome\/([\d.]+)/)) ? Sys.chrome = s[1] :
204 |             (s = ua.match(/opera.([\d.]+)/)) ? Sys.opera = s[1] :
205 |             (s = ua.match(/version\/([\d.]+).*safari/)) ? Sys.safari = s[1] : 0;
206 |             return Sys;
207 |         }
208 |     };
209 | 
210 | })();
--------------------------------------------------------------------------------
/douban-book-export.user.js:
--------------------------------------------------------------------------------
1 | // ==UserScript==
2 | // @name Douban book & movie export (豆瓣读书&电影导出工具)
3 | // @namespace https://kisexu.com/
4 | // @version 0.1
5 | // @description The original description read: "Exports your watched Douban movies to a CSV file. With the script enabled, open a Douban profile page and an export link appears in the movie section; click it. No login required; any user's watched movies can be exported." This script merely adds Douban book export on top of that.
6 | // @author KiseXu
7 | // @copyright 2018, KiseXu (https://kisexu.com)
8 | // @license MIT
9 | // @match https://book.douban.com/people/*/collect*
10 | // @match https://movie.douban.com/people/*/collect*
11 | // @match https://www.douban.com/people/*
12 | // @require https://unpkg.com/dexie@latest/dist/dexie.js
13 | // @grant none
14 | // ==/UserScript==
15 | 
16 | // ==OpenUserJs==
17 | // @author KiseXu
18 | // ==/OpenUserJs==
19 | 
20 | (function() {
21 |     'use strict';
22 | 
23 |     // page trigger
24 |     if (location.href.indexOf('//www.douban.com/') > -1) {
25 |         // insert the export links
26 |         var people = location.href.slice(location.href.indexOf('/people') + 8, -1);
27 |         var export_book_link = 'https://book.douban.com/people/' + people + '/collect?start=0&sort=time&rating=all&filter=all&mode=list&export=1';
28 |         $('#book .pl a:last').after(' · <a href="' + export_book_link + '">导出读过的图书</a>')  // <a> markup reconstructed
29 |         var export_movie_link = 'https://movie.douban.com/people/' + people + '/collect?start=0&sort=time&rating=all&filter=all&mode=list&export=1';
30 |         $('#movie .pl a:last').after(' · <a href="' + export_movie_link + '">导出看过的电影</a>')  // <a> markup reconstructed
31 |     }
32 | 
33 |     if (location.href.indexOf('//book.douban.com/') > -1 && location.href.indexOf('export=1') > -1) {
34 |         // start exporting
35 |         getPage();
36 |     }
37 | 
38 |     if (location.href.indexOf('//movie.douban.com/') > -1 && location.href.indexOf('export=1') > -1) {
39 |         // start exporting
40 |         getPage();
41 |     }
42 | 
43 | 
44 |     // scrape the entries on the current page
45 |     function getCurrentPageList() {
46 |         var items = [];
47 | 
48 |         $('li.item').each(function(index) {
49 |             items[index] = {
50 |                 title: $(this).find('a').text().replace(/修改删除/, '').replace(/> 加入购书单/,'').trim(),
51 |                 rating: ($(this).find('.date span').attr('class')) ? $(this).find('.date span').attr('class').slice(6, 7) : '',
52 |                 date: $(this).find('.date').text().trim(),
53 |                 link: $(this).find('.title a').attr('href').trim(),
54 |                 comment: $(this).find('.comment').text().trim(),
55 |             };
56 |         });
57 | 
58 |         return items;
59 |     }
60 | 
61 |     // collect the current page and save it to IndexedDB
62 |     function getPage() {
63 |         const db = new Dexie('db_export');
64 |         db.version(1).stores({
65 |             items: `++id, title, rating, date, link,comment`
66 |         });
67 | 
68 |         var items = getCurrentPageList();
69 |         db.items.bulkAdd(items).then (function(){
70 |             console.log('保存成功');
71 |             // follow the next-page link
72 |             var next_link = $('span.next a').attr('href');
73 |             if (next_link) {
74 |                 next_link = next_link + '&export=1';
75 |                 window.location.href = next_link;
76 |             } else {
77 |                 exportAll()
78 |             }
79 |         }).catch(function(error) {
80 |             console.log("Ooops: " + error);
81 |         });
82 | 
83 |     }
84 | 
85 |     // export everything to CSV
86 |     function exportAll() {
87 |         const db = new Dexie('db_export');
88 |         db.version(1).stores({
89 |             items: `++id, title, rating, date, link,comment`
90 |         });
91 |         db.items.orderBy('date').toArray().then(function(all){
92 |             all = all.map(function(item,index,array){
93 |                 delete item.id;
94 |                 return item;
95 |             })
96 | 
97 |             JSonToCSV.setDataConver({
98 |                 data: all,
99 |                 fileName: 'Book_Movie',
100 |                 columns: {
101 |                     title: ['标题', '个人评分', '打分日期', '条目链接','评论'],
102 |                     key: ['title', 'rating', 'date', 'link','comment']
103 |                 }
104 |             });
105 |             db.delete();
106 |         });
107 |     }
108 | 
109 |     // CSV export helper
110 |     // https://github.com/liqingzheng/pc/blob/master/JsonExportToCSV.js
111 |     var JSonToCSV = {
112 |         /*
113 |          * obj is an object containing:
114 |          * ## data      the rows to export
115 |          * ## fileName  the file name used when saving (string)
116 |          * ## showLabel whether to render the header row (boolean, defaults to true)
117 |          * ## columns   header config; title and key must correspond one to one:
118 |              title:[], // header captions
119 |              key:[],   // keys used to read each row
120 |              formatter: function() // optional per-cell formatter, called with (key, value)
121 |          */
122 |         setDataConver: function(obj) {
123 |             var bw = this.browser();
124 |             if(bw['ie'] < 9) return; // bail out below IE9
125 |             var data = obj['data'],
126 |                 ShowLabel = typeof obj['showLabel'] === 'undefined' ? true : obj['showLabel'],
127 |                 fileName = (obj['fileName'] || 'UserExport') + '.csv',
128 |                 columns = obj['columns'] || {
129 |                     title: [],
130 |                     key: [],
131 |                     formatter: undefined
132 |                 };
133 |             ShowLabel = typeof ShowLabel === 'undefined' ? true : ShowLabel;
134 |             var row = "", CSV = '', key;
135 |             // render the header row if requested
136 |             if (ShowLabel) {
137 |                 // custom header captions, if provided
138 |                 if (columns.title.length) {
139 |                     columns.title.map(function(n) {
140 |                         row += n + ',';
141 |                     });
142 |                 } else {
143 |                     // otherwise fall back to the keys of the first data row
144 |                     for (key in data[0]) row += key + ',';
145 |                 }
146 |                 row = row.slice(0, -1); // drop the trailing comma: a,b, => a,b
147 |                 CSV += row + '\r\n'; // line break
148 |             }
149 |             // now the data rows
150 |             data.map(function(n) {
151 |                 row = '';
152 |                 // use the configured keys when present
153 |                 if (columns.key.length) {
154 |                     columns.key.map(function(m) {
155 |                         row += '"' + (typeof columns.formatter === 'function' ? columns.formatter(m, n[m]) || n[m] : n[m]) + '",';
156 |                     });
157 |                 } else {
158 |                     for (key in n) {
159 |                         row += '"' + (typeof columns.formatter === 'function' ? columns.formatter(key, n[key]) || n[key] : n[key]) + '",';
160 |                     }
161 |                 }
162 |                 row = row.slice(0, row.length - 1); // drop the trailing comma
163 |                 CSV += row + '\r\n'; // line break
164 |             });
165 |             if(!CSV) return;
166 |             this.SaveAs(fileName, CSV);
167 |         },
168 |         SaveAs: function(fileName, csvData) {
169 |             var bw = this.browser();
170 |             if(!bw['edge'] && !bw['ie']) { // neither Edge nor IE: use a download link
171 |                 var alink = document.createElement("a");
172 |                 alink.id = "linkDwnldLink";
173 |                 alink.href = this.getDownloadUrl(csvData);
174 |                 document.body.appendChild(alink);
175 |                 var linkDom = document.getElementById('linkDwnldLink');
176 |                 linkDom.setAttribute('download', fileName);
177 |                 linkDom.click();
178 |                 document.body.removeChild(linkDom);
179 |             }
180 |             else if(bw['ie'] >= 10 || bw['edge'] == 'edge') {
181 |                 var _utf = "\uFEFF";
182 |                 var _csvData = new Blob([_utf + csvData], {
183 |                     type: 'text/csv'
184 |                 });
185 |                 navigator.msSaveBlob(_csvData, fileName);
186 |             }
187 |             else {
188 |                 var oWin = window.top.open("about:blank", "_blank");
189 |                 oWin.document.write('sep=,\r\n' + csvData);
190 |                 oWin.document.close();
191 |                 oWin.document.execCommand('SaveAs', true, fileName);
192 |                 oWin.close();
193 |             }
194 |         },
195 |         getDownloadUrl: function(csvData) {
196 |             var _utf = "\uFEFF"; // BOM so Excel opens the CSV as UTF-8 and Chinese text is not garbled
197 |             if (window.Blob && window.URL && window.URL.createObjectURL) {
198 |                 csvData = new Blob([_utf + csvData], {
199 |                     type: 'text/csv'
200 |                 });
201 |                 return URL.createObjectURL(csvData);
202 |             }
203 |             // return 'data:attachment/csv;charset=utf-8,' + _utf + encodeURIComponent(csvData);
204 |         },
205 |         browser: function() {
206 |             var Sys = {};
207 |             var ua = navigator.userAgent.toLowerCase();
208 |             var s;
209 |             (s = ua.indexOf('edge') !== - 1 ? Sys.edge = 'edge' : ua.match(/rv:([\d.]+)\) like gecko/)) ? Sys.ie = s[1]:
210 |             (s = ua.match(/msie ([\d.]+)/)) ? Sys.ie = s[1] :
211 |             (s = ua.match(/firefox\/([\d.]+)/)) ? Sys.firefox = s[1] :
212 |             (s = ua.match(/chrome\/([\d.]+)/)) ? Sys.chrome = s[1] :
213 |             (s = ua.match(/opera.([\d.]+)/)) ? Sys.opera = s[1] :
214 |             (s = ua.match(/version\/([\d.]+).*safari/)) ? Sys.safari = s[1] : 0;
215 |             return Sys;
216 |         }
217 |     };
218 | 
219 | })();
220 | 
--------------------------------------------------------------------------------