├── .gitignore ├── LICENSE ├── README.md ├── book.py ├── cache_manager.py ├── csv_writer.py ├── details_parser.py ├── export.py ├── headers.txt ├── page_loader.py ├── rating_processor.py └── read_parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | cache 3 | __pycache__ 4 | read.html 5 | out.csv 6 | read_files/* 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Konstantin Khitrykh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LivelibExport 2 | 3 | ## Краткое описание 4 | 5 | Сервис https://www.livelib.ru не предоставляет удобного способа для экспорта прочитанного (как встроенной так и через API), поэтому был сделан отдельный скрипт для этого. Его использование поможет вам отвязаться от этого сервиса и иметь возможность мигрировать куда-нибудь еще. 6 | 7 | ## Экспорт книг из LiveLib в CSV 8 | 9 | - Войти на сайт 10 | - Открыть ссылку вида https://www.livelib.ru/reader/userName/read/print, где userName - ваше имя пользователя 11 | - Скачать текущее содержимое страницы в файл любым методом в файл read.html в папке со скриптом (возможная проблема - книги из файла не обнаруживаются скриптом, в таком случае следует создать html файл вручную и скопировать в него содержимое страницы из DevTools браузера) 12 | - Установить python3 (https://www.python.org/downloads/) и pip3 (https://pip.pypa.io/en/stable/installing/) 13 | - Установить зависимость: `pip3 install lxml` 14 | - Запустить скрипт: `python3 export.py` 15 | - Дополнительная опция `--convert-10-star-rating=True` может использоваться для указания того будут ли конвертироваться 10-звездные и дробные рейтинги в 5-звездные рейтинги (см. [issue](https://github.com/KonH/LivelibExport/issues/10)), по-умолчанию указывать этот ключ не нужно, он требуется только при включенных в настройках 10-звездных рейтингах 16 | - Дополнительная опция `--rating-convert-ceiling=True` используется в случае конвертации рейтингов и определяет, в какую сторону будет округлятся рейтинг (4,5: в случае True он станет 5, в случае False - 4) 17 | - Дополнительная опция `--parse-books-without-rating=False` используется, если необходимо конвертировать книги без указанного рейтинга (по умолчанию они игнорируются) 18 | - В случае возникновения проблем при получении содержимого страниц можно воспользоваться дополнительным режимом использования прокси-сервера FlareSolverr: 19 | - Использовать docker-образ, готовые исполняемые файлы или собрать из исходников - https://github.com/FlareSolverr/FlareSolverr 20 | - Запустить сервер и дождаться его инициализации (Test successful в логе FlareSolverr) 21 | - Указать дополнительную опцию `--proxy-host=localhost:8191` (значение по-умолчанию, может быть иным, указывается в логе FlareSolverr, 0.0.0.0 нужно заменить на localhost) 22 | - После этого вместо прямых запросов для получения содержимого страниц будет использоваться FlareSolverr 23 | - Это может решить основные проблемы, но не является гарантией (конкретно в случае Livelib определение блокировки работает некорректно) 24 | - Также в случае проблем может быть полезно: перезапустить сервер, зайти на сайт с помощью браузера и пройти проверку вручную, подождать некоторое время или сменить используемую сеть 25 | - Также возможно изменить задержки по-умолчанию с помощью параметров `--min-delay=90` и `--max-delay=120` (значения указываются в секундах) 26 | - Некоторые книги загружаются только для авторизованных пользователей и отдают ошибку 503 если авторизации нет, чтобы передать данные о логине используется файл headers.txt (это опционально): 27 | - С помощью DevTools своего браузера получите заголовки любой страницы сайта 28 | - Скопируйте их и укажите в файле headers.txt в том же формате 29 | - Заголовок Accept-Encoding будет проигнорирован 30 | - Будет загружен список книг из read.html и начата загрузка информации по ним 31 | - Подождать завершения процесса 32 | - Может потребоваться много времени, т.к. запросы отправляются с интервалом в 90-120 сек чтобы не было проблемы с блокировкой на стороне сервиса (это значение может быть изменено в скрипте, на свой страх и риск) 33 | - Итоговый файл out.csv будет содержать имя автора/авторов, название, ISBN и рейтинг, поставленный книге 34 | - Если для каких-то книг не было найдено ISBN, это будет отображено в логе и можно будет их добавить вручную 35 | - Отсутствующий ISBN означает следующее: 36 | - Он не указан на странице, тогда можно попробовать его найти в других источниках 37 | - Его нет в принципе, тогда этот метод экспорта не поможет 38 | - Есть необработанная ситуация в парсере, тогда можно завести issue здесь с указанием ссылки на страницу 39 | - Итоговый файл пригоден для импорта на https://goodreads.com, но потенциально может быть использован и в других случаях 40 | - Исходный код проекта открыт, issue по изменениям и багам можно присылать, pull request'ы будут рассматриваться -------------------------------------------------------------------------------- /book.py: -------------------------------------------------------------------------------- 1 | class Book: 2 | def __init__(this, link: str, rating: int, max_rating: int, date: str): 3 | this.link = link 4 | this.rating = rating 5 | this.max_rating = max_rating 6 | this.id : str = link[link.rfind("/")+1:] 7 | this.date = date 8 | this.authors = None 9 | this.name = None 10 | this.ISBN = None 11 | 12 | def __str__(this): 13 | return ('id="%s", link="%s", rating="%s", max_rating="%s", date="%s", authors="%s", name="%s", isbn="%s"' 14 | % (this.id, this.link, this.rating, this.max_rating, this.date, this.authors, this.name, this.ISBN)) 15 | 16 | def add_isbn(this, isbn: str): 17 | this.ISBN = isbn 18 | 19 | def add_authors(this, authors: list): 20 | this.authors = authors 21 | 22 | def add_name(this, name: str): 23 | this.name = name 24 | -------------------------------------------------------------------------------- /cache_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os import path 3 | 4 | # Operates with book pages cache 5 | class CacheManager: 6 | def __init__(this, cache_dir: str): 7 | this.cache_dir = cache_dir 8 | this.ensure_cache_dir() 9 | 10 | def ensure_cache_dir(this): 11 | if not path.exists(this.cache_dir): 12 | os.mkdir(this.cache_dir) 13 | 14 | def get_path(this, id: str): 15 | return path.join(this.cache_dir, id + '.html') 16 | 17 | def is_cached(this, id: str): 18 | return path.exists(this.get_path(id)) 19 | 20 | def save(this, id: str, content: str): 21 | file_name = this.get_path(id) 22 | with open(file_name, 'wb') as file: 23 | print('Save to cache: "%s"' % file_name) 24 | file.write(content) 25 | 26 | def load(this, id: str): 27 | file_name = this.get_path(id) 28 | try: 29 | with open(file_name, 'r', encoding="utf-8") as file: 30 | return file.read() 31 | except Exception as ex: 32 | print('load_book_content_from_cache("%s"): %s' % (file_name, ex)) 33 | return None 34 | 35 | -------------------------------------------------------------------------------- /csv_writer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from book import Book 3 | 4 | def str_or_empty(str): 5 | if str is None: 6 | return '' 7 | else: 8 | return str 9 | 10 | def joined_list_or_empty(list): 11 | if list is None: 12 | return '' 13 | else: 14 | return ", ".join(list) 15 | 16 | def rating_or_empty(rating): 17 | return rating if rating != -1 else '' 18 | 19 | def format_book(book: Book): 20 | return "%s; %s; %s; %s; %s; %s\n" % (book.id, joined_list_or_empty(book.authors), str_or_empty(book.name), str_or_empty(book.ISBN), rating_or_empty(book.rating), book.date) 21 | 22 | # Write books content to csv file 23 | class CsvWriter(): 24 | def save(this, books: list[Book], file_name: str): 25 | with open(file_name, 'w', encoding="utf-8") as file: 26 | file.write('ID; Author; Title; ISBN; My Rating; Date Added\n') 27 | for book in books: 28 | file.write(format_book(book)) -------------------------------------------------------------------------------- /details_parser.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from lxml import html 3 | from book import Book 4 | from cache_manager import CacheManager 5 | 6 | def parse_isbn_str(isbn_str: str): 7 | raw_isbn_str = isbn_str 8 | isbn = raw_isbn_str.split(',')[:1][0].split(' ')[0] 9 | return isbn 10 | 11 | def normalize_isbn(isbn: str): 12 | return isbn.replace('-', '').replace('.', '') 13 | 14 | def unique_texts(items): 15 | if len(items) > 0: 16 | return OrderedDict.fromkeys([item.text for item in items]).keys() 17 | else: 18 | return None 19 | 20 | def first_text_or_none(items): 21 | if len(items) > 0: 22 | return items[0].text 23 | else: 24 | return None 25 | 26 | def try_extract_authors_from_link(book_html): 27 | links = book_html.xpath('//a[starts-with(@class,"bc-author__link")]') 28 | return unique_texts(links) 29 | 30 | def try_extract_authors(book_html, book: Book): 31 | try: 32 | authors = try_extract_authors_from_link(book_html) 33 | if authors: 34 | book.add_authors(authors) 35 | return 36 | print('try_extract_authors(%s): can\'t find authors.' % book.id) 37 | except Exception as ex: 38 | print('try_extract_authors(%s): can\'t find authors: %s' % (book.id, ex)) 39 | 40 | 41 | def try_extract_name_from_span(book_html): 42 | name_spans = book_html.xpath('//span[@itemprop="name"]') 43 | return first_text_or_none(name_spans) 44 | 45 | def try_extract_name_from_header(book_html): 46 | headers = book_html.xpath('//h1') 47 | return first_text_or_none(headers) 48 | 49 | def try_extract_name(book_html, book: Book): 50 | name = try_extract_name_from_span(book_html) 51 | if name is not None: 52 | book.add_name(name) 53 | return 54 | name = try_extract_name_from_header(book_html) 55 | if name is not None: 56 | book.add_name(name) 57 | return 58 | print('try_extract_name(%s): can\'t find name.' % book.id) 59 | 60 | def parse_downloaded_book(book_content: str, book: Book): 61 | if book_content is not None: 62 | book_html = html.fromstring(book_content) 63 | try_extract_authors(book_html, book) 64 | try_extract_name(book_html, book) 65 | isbn_spans = book_html.xpath('/html/head/meta[@property="book:isbn"]/@content') 66 | if len(isbn_spans) > 0: 67 | raw_isbn = isbn_spans[0] 68 | isbn = normalize_isbn(parse_isbn_str(raw_isbn)) 69 | book.add_isbn(isbn) 70 | else: 71 | print('parse_downloaded_book(%s): can\'t find ISBN.' % book.id) 72 | return book 73 | 74 | # Parse detail pages to update Book entries 75 | class DetailsParser: 76 | def __init__(this, cache: CacheManager): 77 | this.cache = cache 78 | 79 | def parse(this, books: list[Book]): 80 | ready_books = [] 81 | for book in books: 82 | book_content = this.cache.load(book.id) 83 | ready_book = parse_downloaded_book(book_content, book) 84 | if ready_book is not None: 85 | ready_books.append(ready_book) 86 | return ready_books 87 | 88 | -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | from read_parser import ReadParser 2 | from csv_writer import CsvWriter 3 | from details_parser import DetailsParser 4 | from cache_manager import CacheManager 5 | from page_loader import PageLoader 6 | from rating_processor import RatingProcessor 7 | 8 | import sys, getopt 9 | 10 | try: 11 | opts, args = getopt.getopt(sys.argv[1:], '', ['convert-10-star-rating=', 'rating-convert-ceiling=', 'parse-books-without-rating=', 'min-delay=', 'max-delay=', 'proxy-host=']) 12 | except getopt.GetoptError: 13 | print('You can specify additional parameters (defaults below):') 14 | print('test.py --convert-10-star-rating=True --rating-convert-ceiling=True --parse-books-without-rating=False --min-delay=90 --max-delay=120 --proxy-host=') 15 | exit() 16 | 17 | # settings 18 | input_file_name = 'read.html' 19 | cache_dir_name = 'cache' 20 | out_file_name = 'out.csv' 21 | convert_10_star_rating = True 22 | rating_convert_ceiling = True 23 | parse_books_without_rating = False 24 | min_delay = 90 25 | max_delay = 120 26 | proxy_host = '' 27 | 28 | for opt, arg in opts: 29 | if opt == '--convert-10-star-rating': 30 | convert_10_star_rating = arg == 'True' 31 | if opt == '--rating-convert-ceiling': 32 | rating_convert_ceiling = arg == 'True' 33 | if opt == '--parse-books-without-rating': 34 | parse_books_without_rating = arg == 'True' 35 | if opt == '--min-delay': 36 | min_delay = int(arg) 37 | if opt == '--max-delay': 38 | max_delay = int(arg) 39 | if opt == '--proxy-host': 40 | proxy_host = arg 41 | print('Convert 10-star rating (defaults: True): %s' % convert_10_star_rating) 42 | print('Ceil rating while converting (defaults: True): %s' % rating_convert_ceiling) 43 | print('Parse books without rating (defaults: False): %s' % parse_books_without_rating) 44 | print('Min delay (defaults: 90): %s' % min_delay) 45 | print('Max delay (defaults: 120): %s' % max_delay) 46 | print('Proxy host (defaults: \'\'): %s' % proxy_host) 47 | 48 | print('Load books from file: "%s"' % input_file_name) 49 | read_parser = ReadParser() 50 | if read_parser.load_from_file(input_file_name) is False: 51 | exit(1) 52 | print('Books loaded.') 53 | 54 | print('Parse books from summary.') 55 | books = read_parser.parse_books(parse_books_without_rating) 56 | print('Books parsed: %s.' % len(books)) 57 | 58 | print('Start download detailed book pages.') 59 | cache = CacheManager(cache_dir_name) 60 | loader = PageLoader(cache, min_delay, max_delay, proxy_host) 61 | loader.download(books) 62 | print('Detailed book pages downloaded.') 63 | 64 | print('Prepare books for export.') 65 | details_parser = DetailsParser(cache) 66 | ready_books = details_parser.parse(books) 67 | print('Books ready to export: %s.' % len(ready_books)) 68 | 69 | rating_processor = RatingProcessor() 70 | should_convert_rating = convert_10_star_rating and rating_processor.is_applicable(ready_books) 71 | if should_convert_rating: 72 | print('Change rating from 10-star to 5-star format with accuracy loss') 73 | rating_processor.change_rating(ready_books, rating_convert_ceiling) 74 | 75 | writer = CsvWriter() 76 | writer.save(ready_books, out_file_name) 77 | print('Books saved to "%s"' % out_file_name) -------------------------------------------------------------------------------- /headers.txt: -------------------------------------------------------------------------------- 1 | Host: www.livelib.ru 2 | User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 -------------------------------------------------------------------------------- /page_loader.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import json 4 | import http 5 | from urllib import request, error 6 | 7 | from book import Book 8 | from cache_manager import CacheManager 9 | 10 | def assert_page_content(content: bytes): 11 | str = content.decode('utf-8') 12 | if 'captcha-show' in str: 13 | print('Suspicios page content: %s' % str) 14 | raise Exception('DDoS protection page found, please adjust your settings and wait some time') 15 | 16 | def download_book_page_direct(link: str): 17 | print('Start direct download page from "%s"' % link) 18 | headers = {} 19 | with open('headers.txt', 'r') as file: 20 | for line in file: 21 | key, value = line.strip().split(': ') 22 | headers[key] = value 23 | if 'Accept-Encoding' in headers: 24 | del headers['Accept-Encoding'] # Breaks request reading 25 | req = request.Request(link, headers = headers) 26 | try: 27 | r = request.urlopen(req) 28 | with r as data: 29 | content: bytes = data.read() 30 | assert_page_content(content) 31 | print('Page downloaded.') 32 | return content 33 | except error.HTTPError as e: 34 | print('HTTP error: %s' % e) 35 | if hasattr(e, 'read'): 36 | content: bytes = e.read() 37 | print('Page contents: %s' % content) 38 | raise e 39 | 40 | def create_proxy_session(proxy_host: str): 41 | print('Create proxy session for "%s"' % proxy_host) 42 | endpoint = proxy_host + '/v1' 43 | print('Proxy endpoint: "%s"' % endpoint) 44 | req_data = { 45 | 'cmd': 'sessions.create', 46 | 'session': 'single_livelib_session' 47 | } 48 | req_json = json.dumps(req_data) 49 | print('Proxy create session request data: "%s"' % req_json) 50 | req_json_bytes = req_json.encode() 51 | headers = { 'Content-Type': 'application/json' } 52 | req = request.Request(endpoint, data=req_json_bytes, method='POST', headers = headers) 53 | r: http.client.HTTPResponse = request.urlopen(req) 54 | with r as data: 55 | full_content: bytes = data.read() 56 | full_content_str = full_content.decode('utf-8') 57 | full_content_obj: dict = json.loads(full_content_str) 58 | status = full_content_obj['status'] 59 | if status == 'ok': 60 | print('Proxy session successfully created') 61 | else: 62 | raise Exception('Unexpected create session status: ' + status) 63 | 64 | def destroy_proxy_session(proxy_host: str): 65 | print('Destroy proxy session for "%s"' % proxy_host) 66 | endpoint = proxy_host + '/v1' 67 | print('Proxy endpoint: "%s"' % endpoint) 68 | req_data = { 69 | 'cmd': 'sessions.destroy', 70 | 'session': 'single_livelib_session' 71 | } 72 | req_json = json.dumps(req_data) 73 | print('Proxy destroy session request data: "%s"' % req_json) 74 | req_json_bytes = req_json.encode() 75 | headers = { 'Content-Type': 'application/json' } 76 | req = request.Request(endpoint, data=req_json_bytes, method='POST', headers = headers) 77 | r: http.client.HTTPResponse = request.urlopen(req) 78 | with r as data: 79 | full_content: bytes = data.read() 80 | full_content_str = full_content.decode('utf-8') 81 | full_content_obj: dict = json.loads(full_content_str) 82 | status = full_content_obj['status'] 83 | if status == 'ok': 84 | print('Proxy session successfully destroyed') 85 | else: 86 | print('Unexpected session destroy status (non-critical): ' + status) 87 | 88 | def download_book_page_via_proxy(link: str, proxy_host: str): 89 | print('Start download page from "%s" using proxy "%s"' % (link, proxy_host)) 90 | endpoint = proxy_host + '/v1' 91 | print('Proxy endpoint: "%s"' % endpoint) 92 | req_data = { 93 | 'cmd': 'request.get', 94 | 'url': link, 95 | 'session': 'single_livelib_session', 96 | 'maxTimeout': 60000 97 | } 98 | req_json = json.dumps(req_data) 99 | print('Proxy request data: "%s"' % req_json) 100 | req_json_bytes = req_json.encode() 101 | headers = { 'Content-Type': 'application/json' } 102 | req = request.Request(endpoint, data=req_json_bytes, method='POST', headers = headers) 103 | r: http.client.HTTPResponse = request.urlopen(req) 104 | with r as data: 105 | full_content: bytes = data.read() 106 | full_content_str = full_content.decode('utf-8') 107 | full_content_obj: dict = json.loads(full_content_str) 108 | status = full_content_obj['status'] 109 | if status != 'ok': 110 | raise Exception('Unexpected status: ' + status) 111 | solution: dict = full_content_obj['solution'] 112 | response: str = solution['response'] 113 | content = response.encode('utf-8') 114 | assert_page_content(content) 115 | print('Page downloaded.') 116 | return content 117 | 118 | def download_book_page(link: str, proxy_host: str): 119 | if proxy_host: 120 | return download_book_page_via_proxy(link, proxy_host) 121 | else: 122 | return download_book_page_direct(link) 123 | 124 | def wait_for_delay(delay: int): 125 | print("Waiting %s sec..." % delay) 126 | time.sleep(delay) 127 | 128 | # Page loader to download book pages to cache 129 | class PageLoader: 130 | def __init__(this, cache: CacheManager, min_delay: int, max_delay: int, proxy_host: str): 131 | this.cache = cache 132 | this.min_delay = min_delay 133 | this.max_delay = max_delay 134 | this.proxy_host = proxy_host 135 | 136 | def try_download_book_page(this, book: Book, delay: int): 137 | full_link = "https://livelib.ru/book/" + book.id 138 | print('Downloading book with id = "%s" from "%s"' % (book.id, full_link)) 139 | if this.cache.is_cached(book.id): 140 | print('Already in cache, skipping.') 141 | return False 142 | else: 143 | if delay: 144 | wait_for_delay(delay) 145 | page = download_book_page(full_link, this.proxy_host) 146 | this.cache.save(book.id, page) 147 | return True 148 | 149 | def download(this, books: list[Book]): 150 | count = 1 151 | total = len(books) 152 | if this.proxy_host: 153 | create_proxy_session(this.proxy_host) 154 | delay = 0 155 | for book in books: 156 | print('%s/%s' % (count, total)) 157 | count += 1 158 | if this.try_download_book_page(book, delay): 159 | delay = random.randint(this.min_delay, this.max_delay) 160 | print() 161 | if this.proxy_host: 162 | destroy_proxy_session(this.proxy_host) 163 | -------------------------------------------------------------------------------- /rating_processor.py: -------------------------------------------------------------------------------- 1 | import math 2 | from book import Book 3 | 4 | # 10-star to 5-star rating converter 5 | class RatingProcessor: 6 | def is_applicable(this, books: list[Book]): 7 | for book in books: 8 | if (book.max_rating is not None and book.max_rating > 5 and book.rating != -1) or isinstance(book.rating, float): 9 | return True 10 | return False 11 | 12 | def change_rating(this, books: list[Book], ceil: bool): 13 | for book in books: 14 | if book.max_rating is not None and book.rating != -1: 15 | raw_rating = (book.rating / (book.max_rating / 5)) 16 | if ceil: 17 | book.rating = math.ceil(raw_rating) 18 | else: 19 | book.rating = math.floor(raw_rating) 20 | -------------------------------------------------------------------------------- /read_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | from lxml import html 4 | from lxml import etree 5 | from book import Book 6 | 7 | def get_rating_from_title(rating_title: str): 8 | try: 9 | parts = rating_title.split() 10 | try: 11 | return int(parts[-3]) 12 | except ValueError: 13 | return float(parts[-3]) 14 | except IndexError: 15 | # Case for 'нет рейтинга' string 16 | return None 17 | except Exception as ex: 18 | print('get_rating_from_title("%s"): %s' % (rating_title, ex)) 19 | return None 20 | 21 | def get_max_rating_from_title(rating_title: str): 22 | try: 23 | parts = rating_title.split() 24 | return int(parts[-1]) 25 | except ValueError: 26 | # Case for 'нет рейтинга' string 27 | return None 28 | except Exception as ex: 29 | print('get_max_rating_from_title("%s"): %s' % (rating_title, ex)) 30 | return None 31 | 32 | def try_get_link(link: str): 33 | if "/book/" in link: 34 | return link 35 | return None 36 | 37 | def parse_book(row, last_date: str, without_rating: bool): 38 | link = None 39 | rating = None 40 | max_rating = None 41 | 42 | for cell in row.iter(): 43 | if rating is None: 44 | spans = cell.xpath('.//span') 45 | if len(spans) == 2: 46 | rating_title = spans[1].get('title') 47 | rating = get_rating_from_title(rating_title) 48 | max_rating = get_max_rating_from_title(rating_title) 49 | if link is None: 50 | hrefs = cell.xpath('.//a') 51 | for href in hrefs: 52 | link = try_get_link(hrefs[0].get('href')) 53 | 54 | if rating is None and without_rating: 55 | rating = -1 56 | 57 | if link is not None and rating is not None: 58 | return Book(link, rating, max_rating, last_date) 59 | if link is not None or rating is not None: 60 | if link is None: 61 | print('Parsing error (link is not parsed):') 62 | if rating is None: 63 | print('Parsing error (rating is not parsed):') 64 | print(etree.tostring(row)) 65 | print('') 66 | return None 67 | 68 | def try_parse_month(raw_month: str): 69 | dict = defaultdict(lambda: '01', { 70 | 'Январь': '01', 71 | 'Февраль': '02', 72 | 'Март': '03', 73 | 'Апрель': '04', 74 | 'Май': '05', 75 | 'Июнь': '06', 76 | 'Июль': '07', 77 | 'Август': '08', 78 | 'Сентябрь': '09', 79 | 'Октябрь': '10', 80 | 'Ноябрь': '11', 81 | 'Декабрь': '12' 82 | }) 83 | return dict[raw_month] 84 | 85 | def try_parse_date(row): 86 | headers = row.xpath('.//td/h2') 87 | for header in headers: 88 | raw_text = header.text 89 | if raw_text is not None: 90 | m = re.search('\d{4} г.', raw_text) 91 | if m is not None: 92 | year = m.group(0).split(' ')[0] 93 | raw_month = raw_text.split(' ')[0] 94 | month = try_parse_month(raw_month) 95 | return '%s-%s-01' % (year, month) 96 | return None 97 | 98 | # ReadParser - parse read list in html format 99 | class ReadParser: 100 | def load_from_file(this, file_name: str): 101 | try: 102 | with open(file_name, 'r', encoding="utf-8") as file: 103 | this.content = file.read() 104 | return True 105 | except Exception as ex: 106 | print('load_from_file("%s"): %s' % (file_name, ex)) 107 | this.content = None 108 | return False 109 | 110 | def parse_books(this, without_rating: bool) -> list[Book]: 111 | books = [] 112 | books_html = html.fromstring(this.content) 113 | rows = books_html.xpath('//tr') 114 | last_date = None 115 | for row in rows: 116 | result = parse_book(row, last_date, without_rating) 117 | if result is not None: 118 | books.append(result) 119 | else: 120 | date = try_parse_date(row) 121 | if date is not None: 122 | last_date = date 123 | return books 124 | --------------------------------------------------------------------------------