├── textbook_info.py ├── .gitignore ├── README.md └── pdf_book_download_from_zxxeducn.py /textbook_info.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import csv 4 | from pathlib import Path 5 | import os 6 | 7 | def get_parts(return_type='json'): 8 | '''get urls return list''' 9 | url = 'https://s-file-1.ykt.cbern.com.cn/zxx/ndrs/resources/tch_material/version/data_version.json' 10 | headers = { 11 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36", 12 | "Referer": "https://basic.smartedu.cn/", 13 | "Origin": "https://basic.smartedu.cn" 14 | } 15 | req = requests.get(url=url, headers=headers) 16 | 17 | if return_type == 'json': 18 | data = json.loads(req.text) 19 | else: 20 | data = req.text 21 | return data['urls'].split(',') 22 | 23 | def save_textbook_info(): 24 | """ 25 | Extract textbook IDs and names and save them to a CSV file 26 | """ 27 | # Get the URLs 28 | urls = get_parts() 29 | 30 | headers = { 31 | 'Referer': 'https://basic.smartedu.cn/', 32 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', 33 | 'Origin': 'https://basic.smartedu.cn', 34 | 'Accept': 'application/json, text/plain, */*', 35 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 36 | } 37 | 38 | # Prepare CSV file 39 | home = str(Path.home()) 40 | dir_path = os.path.join(home, "Downloads") 41 | csv_path = os.path.join(dir_path, "textbook_info.csv") 42 | 43 | book_number = 1 # Initialize book counter 44 | 45 | # Add UTF-8 BOM to handle Chinese characters 46 | with open(csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile: 47 | writer = csv.writer(csvfile) 48 | writer.writerow(['Number', 'Book ID', 'Book Name']) # Updated header 49 | 50 | for index, ref in enumerate(urls, 1): 51 | print(f"Processing directory {index}/{len(urls)}") 52 | response = requests.get(ref, headers=headers) 53 | response.encoding = 'utf-8' # Explicitly set response encoding 54 | info = json.loads(response.text) 55 | 56 | for book in info: 57 | try: 58 | book_id = book['id'] 59 | publisher = next((tag['tag_name'] for tag in book['tag_list'] if '版' in tag['tag_name']), '') 60 | book_name = f"{publisher}{book['title']}" 61 | 62 | writer.writerow([book_number, book_id, book_name]) 63 | book_number += 1 64 | 65 | except Exception as e: 66 | print(f"Error processing book: {str(e)}") 67 | 68 | print(f"CSV file has been saved to: {csv_path}") 69 | 70 | if __name__ == "__main__": 71 | save_textbook_info() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be added to the global gitignore or merged into this project gitignore. For a PyCharm 158 | # project, it is recommended to include the following files: 159 | # .idea/ 160 | # *.iml 161 | # *.ipr 162 | # *.iws 163 | .idea/ 164 | *.iml 165 | *.ipr 166 | *.iws 167 | 168 | # VS Code 169 | .vscode/ 170 | 171 | # macOS 172 | .DS_Store 173 | .AppleDouble 174 | .LSOverride 175 | 176 | # Windows 177 | Thumbs.db 178 | ehthumbs.db 179 | Desktop.ini 180 | 181 | # Linux 182 | *~ 183 | 184 | # Project specific 185 | # Downloaded textbooks (these can be large and should not be in version control) 186 | ~/Downloads/textbook_download/ 187 | Downloads/textbook_download/ 188 | 189 | # CSV files with textbook metadata (these can be regenerated) 190 | *.csv 191 | 192 | # Log files 193 | *.log 194 | 195 | # Temporary files 196 | *.tmp 197 | *.temp 198 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Enhanced Textbook Downloader from National Smart Education Platform 2 | 3 | [English](#english) | [中文](#中文) 4 | 5 | --- 6 | 7 | ## English 8 | 9 | ### 📚 Overview 10 | 11 | This enhanced script downloads complete PDF textbooks from the Chinese National Smart Education Platform (国家中小学智慧教育平台) with multiple download modes, comprehensive error handling, and flexible download controls. 12 | 13 | ### ✨ Features 14 | 15 | - **Multiple Download Modes**: Sequence number, book range, book ID, and legacy catalog-based approaches 16 | - **CDN Fallback Logic**: Automatically tries r1, r2, r3 endpoints if one fails 17 | - **Enhanced Error Handling**: Detailed error messages and graceful fallbacks 18 | - **Progress Tracking**: Real-time download status and file size information 19 | - **Flexible Controls**: Download specific books, ranges, or use legacy catalog-based approach 20 | - **Robust Network Handling**: Timeouts, retries, and connection error handling 21 | 22 | ### 🚀 Download Modes 23 | 24 | #### 1. **By Sequence Number** (`--sequence`) 25 | Downloads a specific book by its global sequence number across all catalogs. 26 | 27 | ```bash 28 | python pdf_book_download_from_zxxeducn.py --sequence 2548 29 | ``` 30 | 31 | #### 2. **By Book Range** (`--range`) 32 | Downloads multiple books within a specified range. 33 | 34 | ```bash 35 | python pdf_book_download_from_zxxeducn.py --range "200-250" 36 | python pdf_book_download_from_zxxeducn.py --range "200" # Single book 37 | ``` 38 | 39 | #### 3. **By Book ID** (`--book-id`) 40 | Downloads a specific book by its unique identifier (UUID). 41 | 42 | ```bash 43 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e" 44 | ``` 45 | 46 | #### 4. **Legacy Modes** 47 | - `--single N`: Download only the Nth textbook from the catalog 48 | - `--limit N`: Download only N textbooks (starting from the beginning) 49 | - `--table N`: Start from catalog N (0-based indexing) 50 | - `--item N`: Start from item N within the catalog (0-based indexing) 51 | 52 | ### 📋 Requirements 53 | 54 | - Python 3.6+ 55 | - `requests` library 56 | - Internet connection 57 | - Access to the National Smart Education Platform 58 | 59 | ### 🛠️ Installation 60 | 61 | 1. Clone or download the script 62 | 2. Install required dependencies: 63 | ```bash 64 | pip install requests 65 | ``` 66 | 67 | ### 📖 Usage Examples 68 | 69 | ```bash 70 | # Download by sequence number 71 | python pdf_book_download_from_zxxeducn.py --sequence 2548 72 | 73 | # Download a range of books 74 | python pdf_book_download_from_zxxeducn.py --range "1-5" 75 | 76 | # Download by book ID 77 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e" 78 | 79 | # Legacy single book download 80 | python pdf_book_download_from_zxxeducn.py --single 1 81 | 82 | # Legacy limited download 83 | python pdf_book_download_from_zxxeducn.py --limit 10 84 | 85 | # Resume interrupted download 86 | python pdf_book_download_from_zxxeducn.py --table 1 --item 5 87 | ``` 88 | 89 | ### 🔧 Technical Details 90 | 91 | - **CDN Endpoints**: Automatically tries r1-ndr-oversea, r2-ndr-oversea, and r3-ndr-oversea in sequence 92 | - **File Validation**: Downloads are validated to ensure they are actual PDF files (>1MB) 93 | - **Network Timeouts**: 30-second timeout for all network requests 94 | - **Output Directory**: All downloads are saved to `~/Downloads/textbook_download/` 95 | 96 | ### 📁 Output 97 | 98 | Downloaded PDFs are saved to: 99 | ``` 100 | ~/Downloads/textbook_download/ 101 | ├── 统编版(根据2022年版课程标准修订)义务教育教科书·道德与法治一年级上册.pdf 102 | ├── 统编版(根据2022年版课程标准修订)义务教育教科书·道德与法治一年级下册.pdf 103 | └── ... 104 | ``` 105 | 106 | ### 📊 Companion Script: textbook_info.py 107 | 108 | The `textbook_info.py` script is a companion tool that collects metadata for all available textbooks and exports it to a CSV file. This is useful for: 109 | 110 | - **Finding specific textbooks**: Search through the CSV to locate books by title, publisher, or other criteria 111 | - **Planning downloads**: See the complete catalog before deciding what to download 112 | - **Resume functionality**: Use the sequence numbers to resume interrupted downloads 113 | 114 | #### Usage: 115 | ```bash 116 | python textbook_info.py 117 | ``` 118 | 119 | #### Output: 120 | - Creates a CSV file in your Downloads folder 121 | - Contains: Book ID, Title, Publisher, Catalog position, and Global sequence number 122 | - Useful for determining the correct parameters for the main download script 123 | 124 | #### Example CSV structure: 125 | ```csv 126 | sequence_number,catalog_index,catalog_position,book_id,title,publisher 127 | 1,0,0,bdc00134-465d-454b-a541-dcd0cec4d86e,义务教育教科书·道德与法治一年级上册,统编版 128 | 2,0,1,bdc00135-465d-454b-a541-dcd0cec4d86e,义务教育教科书·道德与法治一年级下册,统编版 129 | ... 130 | ``` 131 | 132 | ### 🐛 Troubleshooting 133 | 134 | - **Network Errors**: Check your internet connection and firewall settings 135 | - **Permission Errors**: Ensure you have write access to the Downloads folder 136 | - **Timeout Errors**: The script will automatically retry with different CDN endpoints 137 | 138 | ### 📝 License 139 | 140 | Open source - feel free to use and modify as needed. 141 | 142 | --- 143 | 144 | ## 中文 145 | 146 | ### 📚 概述 147 | 148 | 这是一个增强版的脚本,用于从国家中小学智慧教育平台下载完整的PDF教材,支持多种下载模式、全面的错误处理和灵活的下载控制。 149 | 150 | ### ✨ 功能特点 151 | 152 | - **多种下载模式**: 序列号、书籍范围、书籍ID和传统目录方式 153 | - **CDN故障转移**: 自动尝试r1、r2、r3端点,如果一个失败则切换到下一个 154 | - **增强错误处理**: 详细的错误信息和优雅的故障转移 155 | - **进度跟踪**: 实时下载状态和文件大小信息 156 | - **灵活控制**: 下载特定书籍、范围或使用传统目录方式 157 | - **稳健网络处理**: 超时、重试和连接错误处理 158 | 159 | ### 🚀 下载模式 160 | 161 | #### 1. **按序列号下载** (`--sequence`) 162 | 通过全局序列号下载特定书籍(跨所有目录)。 163 | 164 | ```bash 165 | python pdf_book_download_from_zxxeducn.py --sequence 2548 166 | ``` 167 | 168 | #### 2. **按书籍范围下载** (`--range`) 169 | 下载指定范围内的多本书籍。 170 | 171 | ```bash 172 | python pdf_book_download_from_zxxeducn.py --range "200-250" 173 | python pdf_book_download_from_zxxeducn.py --range "200" # 单本书 174 | ``` 175 | 176 | #### 3. **按书籍ID下载** (`--book-id`) 177 | 通过唯一标识符(UUID)下载特定书籍。 178 | 179 | ```bash 180 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e" 181 | ``` 182 | 183 | #### 4. **传统模式** 184 | - `--single N`: 仅下载目录中的第N本教材 185 | - `--limit N`: 限制本次运行下载的书籍数量 186 | - `--table N`: 从目录N开始(基于0的索引) 187 | - `--item N`: 从目录中的项目N开始(基于0的索引) 188 | 189 | ### 📋 系统要求 190 | 191 | - Python 3.6+ 192 | - `requests` 库 193 | - 网络连接 194 | - 访问国家中小学智慧教育平台的权限 195 | 196 | ### 🛠️ 安装 197 | 198 | 1. 克隆或下载脚本 199 | 2. 安装所需依赖: 200 | ```bash 201 | pip install requests 202 | ``` 203 | 204 | ### 📖 使用示例 205 | 206 | ```bash 207 | # 按序列号下载 208 | python pdf_book_download_from_zxxeducn.py --sequence 2548 209 | 210 | # 下载书籍范围 211 | python pdf_book_download_from_zxxeducn.py --range "1-5" 212 | 213 | # 按书籍ID下载 214 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e" 215 | 216 | # 传统单本书下载 217 | python pdf_book_download_from_zxxeducn.py --single 1 218 | 219 | # 传统限制下载 220 | python pdf_book_download_from_zxxeducn.py --limit 10 221 | 222 | # 恢复中断的下载 223 | python pdf_book_download_from_zxxeducn.py --table 1 --item 5 224 | ``` 225 | 226 | ### 🔧 技术细节 227 | 228 | - **CDN端点**: 自动按顺序尝试r1-ndr-oversea、r2-ndr-oversea和r3-ndr-oversea 229 | - **文件验证**: 验证下载内容确保是实际的PDF文件(>1MB) 230 | - **网络超时**: 所有网络请求30秒超时 231 | - **输出目录**: 所有下载保存到`~/Downloads/textbook_download/` 232 | 233 | ### 📁 输出 234 | 235 | 下载的PDF文件保存到: 236 | ``` 237 | ~/Downloads/textbook_download/ 238 | ├── 统编版(根据2022年版课程标准修订)义务教育教科书·道德与法治一年级上册.pdf 239 | ├── 统编版(根据2022年版课程标准修订)义务教育教科书·道德与法治一年级下册.pdf 240 | └── ... 241 | ``` 242 | 243 | ### 📊 配套脚本:textbook_info.py 244 | 245 | `textbook_info.py` 脚本是一个配套工具,用于收集所有可用教材的元数据并导出到CSV文件。这对于以下情况很有用: 246 | 247 | - **查找特定教材**: 通过CSV搜索按标题、出版社或其他条件定位书籍 248 | - **规划下载**: 在决定下载内容之前查看完整目录 249 | - **恢复功能**: 使用序列号恢复中断的下载 250 | 251 | #### 使用方法: 252 | ```bash 253 | python textbook_info.py 254 | ``` 255 | 256 | #### 输出: 257 | - 在Downloads文件夹中创建CSV文件 258 | - 包含:书籍ID、标题、出版社、目录位置和全局序列号 259 | - 有助于确定主下载脚本的正确参数 260 | 261 | #### CSV结构示例: 262 | ```csv 263 | sequence_number,catalog_index,catalog_position,book_id,title,publisher 264 | 1,0,0,bdc00134-465d-454b-a541-dcd0cec4d86e,义务教育教科书·道德与法治一年级上册,统编版 265 | 2,0,1,bdc00135-465d-454b-a541-dcd0cec4d86e,义务教育教科书·道德与法治一年级下册,统编版 266 | ... 267 | ``` 268 | 269 | ### 🐛 故障排除 270 | 271 | - **网络错误**: 检查网络连接和防火墙设置 272 | - **权限错误**: 确保对Downloads文件夹有写入权限 273 | - **超时错误**: 脚本将自动尝试不同的CDN端点 274 | 275 | ### 📝 许可证 276 | 277 | 开源 - 可自由使用和修改。 278 | 279 | --- 280 | 281 | ## 🔄 Version History 282 | 283 | - **v3.0.0**: Modified the download path and added more download mods. Enhanced documentation, type hints, and modular architecture 284 | - **v2.0.0**: Modified the download path and added new download control 285 | - **v1.0.0**: Original script with basic functionality 286 | 287 | ## 🤝 Contributing 288 | 289 | Feel free to submit issues, feature requests, or pull requests to improve this script. 290 | 291 | ## 📞 Support 292 | 293 | If you encounter any issues or have questions, please check the troubleshooting section above or create an issue in the repository. 294 | -------------------------------------------------------------------------------- /pdf_book_download_from_zxxeducn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Enhanced Textbook Downloader from the National Smart Education Platform (国家中小学智慧教育平台) 4 | 5 | This script provides multiple methods to download complete PDF textbooks from the Chinese national 6 | education platform, comprehensive error handling, and flexible download controls. 7 | 8 | License: Open source 9 | Version: 3.0.0 10 | 11 | Features: 12 | - CDN fallback: Automatically tries r1, r2, r3 endpoints if one fails 13 | - Multiple download modes: Sequence number, book range, book ID, and legacy modes 14 | - Enhanced error handling: Detailed error messages and graceful fallbacks 15 | - Progress tracking: Real-time download status and file size information 16 | - Flexible controls: Download specific books, ranges, or use legacy catalog-based approach 17 | """ 18 | 19 | import requests 20 | import json 21 | import os 22 | from pathlib import Path 23 | from urllib.parse import quote 24 | import argparse 25 | import time 26 | from typing import List, Tuple, Optional, Dict, Any, Union 27 | 28 | # Get user's home directory 29 | home = str(Path.home()) 30 | 31 | # Construct the path using os.path.join for cross-platform compatibility 32 | dir_path = os.path.join(home, "Downloads") 33 | 34 | # Verify the directory exists 35 | if not os.path.exists(dir_path): 36 | raise FileNotFoundError(f"Directory not found: {dir_path}") 37 | 38 | 39 | def get_parts(return_type: str = 'json') -> List[str]: 40 | """ 41 | Fetch the catalog URLs from the National Smart Education Platform. 42 | 43 | This function retrieves the list of catalog URLs that contain textbook metadata. 44 | There are typically 4 catalogs, with the first 3 containing up to 1000 books each. 45 | 46 | Args: 47 | return_type (str): Type of return value. 'json' returns parsed JSON, 48 | any other value returns raw text. Default: 'json' 49 | 50 | Returns: 51 | List[str]: List of catalog URLs for fetching textbook metadata 52 | 53 | Raises: 54 | requests.RequestException: If the HTTP request fails 55 | json.JSONDecodeError: If the response is not valid JSON (when return_type='json') 56 | 57 | Example: 58 | >>> urls = get_parts() 59 | >>> print(f"Found {len(urls)} catalogs") 60 | Found 4 catalogs 61 | """ 62 | url = 'https://s-file-1.ykt.cbern.com.cn/zxx/ndrs/resources/tch_material/version/data_version.json' 63 | headers = { 64 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36", 65 | "Referer": "https://basic.smartedu.cn/", 66 | "Origin": "https://basic.smartedu.cn" 67 | } 68 | 69 | try: 70 | req = requests.get(url=url, headers=headers, timeout=30) 71 | req.raise_for_status() # Raise exception for bad status codes 72 | 73 | if return_type == 'json': 74 | data = json.loads(req.text) 75 | return data['urls'].split(',') 76 | else: 77 | return req.text 78 | except requests.RequestException as e: 79 | print(f"❌ Failed to fetch catalog URLs: {e}") 80 | raise 81 | except json.JSONDecodeError as e: 82 | print(f"❌ Invalid JSON response from catalog API: {e}") 83 | raise 84 | 85 | 86 | def get_pdf_url(book_id: str) -> Optional[List[str]]: 87 | """ 88 | Retrieve PDF download URLs for a specific textbook using its book ID. 89 | 90 | This function fetches the textbook metadata and extracts all available CDN endpoints 91 | for PDF downloads. It transforms the private URLs to oversea URLs for public access. 92 | 93 | Args: 94 | book_id (str): The unique identifier (UUID) of the textbook 95 | 96 | Returns: 97 | Optional[List[str]]: List of CDN URLs for PDF download, or None if failed 98 | 99 | Raises: 100 | requests.RequestException: If the HTTP request fails 101 | json.JSONDecodeError: If the metadata response is not valid JSON 102 | KeyError: If the expected metadata structure is missing 103 | 104 | Example: 105 | >>> urls = get_pdf_url("bdc00134-465d-454b-a541-dcd0cec4d86e") 106 | >>> if urls: 107 | ... print(f"Found {len(urls)} CDN endpoints") 108 | Found 3 CDN endpoints 109 | """ 110 | try: 111 | # Construct the metadata API URL 112 | json_url = f"https://s-file-1.ykt.cbern.com.cn/zxx/ndrv2/resources/tch_material/details/{book_id}.json" 113 | 114 | # Fetch the textbook metadata 115 | response = requests.get(json_url) 116 | response.raise_for_status() 117 | 118 | # Parse the JSON response 119 | data = response.json() 120 | 121 | # Search for the source item (contains PDF download links) 122 | for item in data['ti_items']: 123 | if item.get('ti_file_flag') == 'source': 124 | if 'ti_storages' in item and item['ti_storages']: 125 | # Transform all private URLs to oversea URLs 126 | pdf_urls = [] 127 | for storage_url in item['ti_storages']: 128 | oversea_url = storage_url.replace('-private', '-oversea') 129 | pdf_urls.append(oversea_url) 130 | return pdf_urls 131 | 132 | # No source item found 133 | print(f"⚠️ No PDF source found for book ID: {book_id}") 134 | return None 135 | 136 | except requests.RequestException as e: 137 | print(f"❌ Network error getting metadata for {book_id}: {e}") 138 | return None 139 | except json.JSONDecodeError as e: 140 | print(f"❌ Invalid JSON response for {book_id}: {e}") 141 | return None 142 | except KeyError as e: 143 | print(f"❌ Unexpected metadata structure for {book_id}: {e}") 144 | return None 145 | except Exception as e: 146 | print(f"❌ Unexpected error getting metadata for {book_id}: {e}") 147 | import traceback 148 | print("Full traceback:") 149 | print(traceback.format_exc()) 150 | return None 151 | 152 | 153 | def download_pdf_with_cdn_fallback(pdf_urls: List[str], book_name: str, 154 | headers: Dict[str, str], work_path: str) -> bool: 155 | """ 156 | Download a PDF textbook with automatic CDN fallback logic. 157 | 158 | This function attempts to download the PDF from multiple CDN endpoints (r1, r2, r3) 159 | in sequence. If one fails, it automatically tries the next. It validates the downloaded 160 | content to ensure it's actually a PDF file and not an error page. 161 | 162 | Args: 163 | pdf_urls (List[str]): List of CDN URLs to try for download 164 | book_name (str): Name of the textbook (used for filename) 165 | headers (Dict[str, str]): HTTP headers for the download request 166 | work_path (str): Directory path where the PDF should be saved 167 | 168 | Returns: 169 | bool: True if download succeeded, False if all CDN endpoints failed 170 | 171 | Raises: 172 | OSError: If there's an error writing the file to disk 173 | requests.RequestException: If all HTTP requests fail 174 | 175 | Example: 176 | >>> success = download_pdf_with_cdn_fallback(urls, "Math Book", headers, "/downloads") 177 | >>> if success: 178 | ... print("Download completed successfully") 179 | Download completed successfully 180 | """ 181 | if not pdf_urls: 182 | print(f"❌ No PDF URLs available for {book_name}") 183 | return False 184 | 185 | # Try each CDN endpoint in sequence 186 | for i, pdf_url in enumerate(pdf_urls): 187 | cdn_name = f"r{i+1}-ndr-oversea" 188 | 189 | try: 190 | # Attempt to download the PDF with timeout 191 | pdf_response = requests.get(pdf_url, headers=headers, timeout=30) 192 | 193 | if pdf_response.status_code == 200: 194 | # Validate the downloaded content 195 | content_length = len(pdf_response.content) 196 | content_type = pdf_response.headers.get('content-type', '') 197 | 198 | # Check if we got a valid PDF (not an error page) 199 | if 'pdf' in content_type.lower() and content_length > 1000000: # > 1MB 200 | # Save the PDF to disk 201 | file_path = os.path.join(work_path, f"{book_name}.pdf") 202 | 203 | try: 204 | with open(file_path, 'wb') as f: 205 | f.write(pdf_response.content) 206 | 207 | print(f" 💾 Downloaded: {book_name} {content_length / (1024*1024):.1f} MB") 208 | return True 209 | 210 | except OSError as e: 211 | print(f" ❌ Failed to save file: {e}") 212 | continue 213 | else: 214 | print(f" ⚠️ {cdn_name} returned invalid content: {content_type}, {content_length} bytes") 215 | else: 216 | print(f" ❌ {cdn_name} failed: Status {pdf_response.status_code}") 217 | 218 | except requests.exceptions.Timeout: 219 | print(f" ⏰ {cdn_name} timeout after 30 seconds") 220 | except requests.exceptions.RequestException as e: 221 | print(f" ❌ {cdn_name} network error: {e}") 222 | except Exception as e: 223 | print(f" ❌ {cdn_name} unexpected error: {e}") 224 | 225 | # All CDN endpoints failed 226 | print(f"❌ All CDN endpoints failed for {book_name}") 227 | return False 228 | 229 | 230 | def get_book_by_sequence_number(catalog_urls: List[str], sequence_number: int) -> Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]: 231 | """ 232 | Locate a textbook by its global sequence number across all catalogs. 233 | 234 | This function maps a global sequence number (e.g., 2548) to the specific catalog 235 | and position where that textbook can be found. It handles the catalog structure 236 | where the first 3 catalogs typically contain 1000 books each. 237 | 238 | Args: 239 | catalog_urls (List[str]): List of catalog API URLs 240 | sequence_number (int): Global sequence number of the textbook (1-based) 241 | 242 | Returns: 243 | Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]: 244 | - book_info: Dictionary containing textbook metadata, or None if not found 245 | - catalog_index: Index of the catalog (0-3), or None if not found 246 | - catalog_position: Position within the catalog (0-based), or None if not found 247 | 248 | Raises: 249 | requests.RequestException: If any catalog API request fails 250 | json.JSONDecodeError: If any catalog response is not valid JSON 251 | 252 | Example: 253 | >>> book_info, cat_idx, cat_pos = get_book_by_sequence_number(urls, 2548) 254 | >>> if book_info: 255 | ... print(f"Found book {book_info['title']} in catalog {cat_idx + 1}") 256 | Found book 道德与法治 in catalog 3 257 | """ 258 | # Validate input 259 | if sequence_number < 1: 260 | print(f"❌ Invalid sequence number: {sequence_number} (must be >= 1)") 261 | return None, None, None 262 | 263 | current_sequence = 1 # Start counting from the first book 264 | 265 | # Iterate through each catalog to find the target sequence number 266 | for catalog_index, catalog_url in enumerate(catalog_urls): 267 | try: 268 | # Fetch catalog data 269 | response = requests.get(catalog_url, timeout=30) 270 | response.raise_for_status() 271 | info = json.loads(response.text) 272 | 273 | # Calculate how many books this catalog has 274 | catalog_size = len(info) 275 | 276 | # Check if our target sequence number falls within this catalog 277 | if current_sequence <= sequence_number < current_sequence + catalog_size: 278 | # Calculate the position within this catalog 279 | catalog_position = sequence_number - current_sequence 280 | 281 | # Verify the calculated position exists in the catalog 282 | if catalog_position < len(info): 283 | book_info = info[catalog_position] 284 | return book_info, catalog_index, catalog_position 285 | 286 | # Move to the next catalog's starting sequence number 287 | current_sequence += catalog_size 288 | 289 | except requests.RequestException as e: 290 | print(f"❌ Network error processing catalog {catalog_index + 1}: {e}") 291 | continue 292 | except json.JSONDecodeError as e: 293 | print(f"❌ Invalid JSON in catalog {catalog_index + 1}: {e}") 294 | continue 295 | except Exception as e: 296 | print(f"❌ Unexpected error processing catalog {catalog_index + 1}: {e}") 297 | continue 298 | 299 | # Sequence number not found in any catalog 300 | print(f"❌ Sequence number {sequence_number} not found in any catalog") 301 | return None, None, None 302 | 303 | 304 | def pdf_download(table: int = 0, item: int = 0, single_book: Optional[int] = None, 305 | download_limit: Optional[int] = None, sequence_number: Optional[int] = None, 306 | book_range: Optional[str] = None, book_id: Optional[str] = None) -> None: 307 | """ 308 | Enhanced textbook downloader with multiple download modes and CDN fallback. 309 | 310 | This is the main function that orchestrates textbook downloads based on the specified mode. 311 | It supports downloading by sequence number, book range, book ID, and legacy catalog-based 312 | approaches. All downloads use CDN fallback logic for reliability. 313 | 314 | Args: 315 | table (int): Starting catalog index (0-based). Default: 0 316 | item (int): Starting item index within the catalog (0-based). Default: 0 317 | single_book (Optional[int]): Download only one specific book number. Default: None 318 | download_limit (Optional[int]): Limit the number of books to download. Default: None 319 | sequence_number (Optional[int]): Download by global sequence number. Default: None 320 | book_range (Optional[str]): Download by book range (e.g., "200-250"). Default: None 321 | book_id (Optional[str]): Download by specific book ID (UUID). Default: None 322 | 323 | Returns: 324 | None: This function performs downloads but doesn't return values 325 | 326 | Raises: 327 | OSError: If there are file system errors 328 | requests.RequestException: If network requests fail 329 | ValueError: If book range format is invalid 330 | 331 | Example: 332 | # Download by sequence number 333 | pdf_download(sequence_number=2548) 334 | 335 | # Download by range 336 | pdf_download(book_range="1-10") 337 | 338 | # Legacy mode - download first 5 books from catalog 0 339 | pdf_download(limit=5) 340 | """ 341 | print("🚀 Starting textbook download...") 342 | 343 | # Set up HTTP headers for all requests 344 | headers = { 345 | 'Referer': 'https://basic.smartedu.cn/', 346 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36', 347 | 'Origin': 'https://basic.smartedu.cn', 348 | 'Accept': 'application/json, text/plain, */*', 349 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 350 | } 351 | 352 | # Create and verify the output directory 353 | work_path = os.path.join(dir_path, "textbook_download") 354 | try: 355 | if not os.path.exists(work_path): 356 | os.makedirs(work_path) 357 | except OSError as e: 358 | print(f"❌ Failed to create output directory: {e}") 359 | return 360 | 361 | # Handle different download modes based on provided arguments 362 | if book_id: 363 | # Mode 1: Download by book ID (UUID) 364 | _download_by_book_id(book_id, headers, work_path) 365 | return 366 | 367 | elif sequence_number: 368 | # Mode 2: Download by sequence number 369 | _download_by_sequence_number(sequence_number, headers, work_path) 370 | return 371 | 372 | elif book_range: 373 | # Mode 3: Download by book range 374 | _download_by_book_range(book_range, headers, work_path) 375 | return 376 | 377 | # Legacy modes (original functionality) 378 | elif single_book or download_limit or table > 0 or item > 0: 379 | _download_legacy_mode(table, item, single_book, download_limit, headers, work_path) 380 | 381 | else: 382 | print("❌ No download mode specified. Use --help to see available options.") 383 | return 384 | 385 | print(f"📁 Check your Downloads/textbook_download folder") 386 | 387 | 388 | def _download_by_book_id(book_id: str, headers: Dict[str, str], work_path: str) -> None: 389 | """ 390 | Download a textbook by its unique book ID (UUID). 391 | 392 | Args: 393 | book_id (str): The unique identifier of the textbook 394 | headers (Dict[str, str]): HTTP headers for the request 395 | work_path (str): Directory where the PDF should be saved 396 | """ 397 | print(f"🔍 Downloading by book ID: {book_id}") 398 | 399 | try: 400 | # Get PDF URLs for this book 401 | pdf_urls = get_pdf_url(book_id) 402 | if pdf_urls: 403 | # Fetch book title from metadata for better filename 404 | json_url = f"https://s-file-1.ykt.cbern.com.cn/zxx/ndrv2/resources/tch_material/details/{book_id}.json" 405 | response = requests.get(json_url, timeout=30) 406 | 407 | if response.ok: 408 | data = response.json() 409 | book_title = data.get('title', f'Book_{book_id}') 410 | else: 411 | book_title = f'Book_{book_id}' 412 | 413 | # Attempt download with CDN fallback 414 | success = download_pdf_with_cdn_fallback(pdf_urls, book_title, headers, work_path) 415 | if success: 416 | print(f" ✅ Successfully downloaded book ID: {book_id}") 417 | else: 418 | print(f"❌ Failed to download book ID: {book_id}") 419 | else: 420 | print(f"❌ Could not get PDF URLs for book ID: {book_id}") 421 | except Exception as e: 422 | print(f"❌ Error processing book ID {book_id}: {str(e)}") 423 | 424 | 425 | def _download_by_sequence_number(sequence_number: int, headers: Dict[str, str], work_path: str) -> None: 426 | """ 427 | Download a textbook by its global sequence number. 428 | 429 | Args: 430 | sequence_number (int): Global sequence number of the textbook 431 | headers (Dict[str, str]): HTTP headers for the request 432 | work_path (str): Directory where the PDF should be saved 433 | """ 434 | print(f"🔍 Downloading by sequence number: {sequence_number}") 435 | 436 | # Get textbook catalog 437 | print("📚 Getting textbook catalog...") 438 | try: 439 | catalog_urls = get_parts() 440 | except Exception as e: 441 | print(f"❌ Failed to get catalog: {e}") 442 | return 443 | 444 | # Find the book in the catalog 445 | book_info, catalog_index, catalog_position = get_book_by_sequence_number(catalog_urls, sequence_number) 446 | if book_info: 447 | print(f"📖 Found book: {book_info.get('title', 'Unknown')}") 448 | print(f"📍 Catalog: {catalog_index + 1}, Position: {catalog_position + 1}") 449 | 450 | # Get PDF URLs and download 451 | book_id = book_info['id'] 452 | pdf_urls = get_pdf_url(book_id) 453 | 454 | if pdf_urls: 455 | # Extract publisher information for filename 456 | publisher = next((tag['tag_name'] for tag in book_info['tag_list'] if '版' in tag['tag_name']), '') 457 | book_name = f"{publisher}{book_info['title']}" 458 | 459 | success = download_pdf_with_cdn_fallback(pdf_urls, book_name, headers, work_path) 460 | if success: 461 | print(f" ✅ Successfully downloaded sequence number: {sequence_number}") 462 | else: 463 | print(f"❌ Failed to download sequence number: {sequence_number}") 464 | else: 465 | print(f"❌ Could not get PDF URLs for sequence number: {sequence_number}") 466 | else: 467 | print(f"❌ Sequence number {sequence_number} not found in catalog") 468 | 469 | 470 | def _download_by_book_range(book_range: str, headers: Dict[str, str], work_path: str) -> None: 471 | """ 472 | Download multiple textbooks within a specified range. 473 | 474 | Args: 475 | book_range (str): Range specification (e.g., "200-250" or "200") 476 | headers (Dict[str, str]): HTTP headers for the request 477 | work_path (str): Directory where the PDFs should be saved 478 | """ 479 | print(f"🔍 Downloading by book range: {book_range}") 480 | 481 | try: 482 | # Parse the range specification 483 | if '-' in book_range: 484 | start, end = map(int, book_range.split('-')) 485 | if start > end: 486 | start, end = end, start # Swap if start > end 487 | else: 488 | start = end = int(book_range) # Single book 489 | 490 | print(f"📚 Downloading books from sequence {start} to {end}") 491 | 492 | # Get textbook catalog 493 | print("📚 Getting textbook catalog...") 494 | try: 495 | catalog_urls = get_parts() 496 | except Exception as e: 497 | print(f"❌ Failed to get catalog: {e}") 498 | return 499 | 500 | total_processed = 0 501 | failed_books = [] 502 | 503 | # Process each book in the range 504 | for seq_num in range(start, end + 1): 505 | print(f"\n📖 Processing sequence number: {seq_num}") 506 | 507 | book_info, catalog_index, catalog_position = get_book_by_sequence_number(catalog_urls, seq_num) 508 | if book_info: 509 | print(f" 📍 Found in catalog {catalog_index + 1}, position {catalog_position + 1}") 510 | 511 | book_id = book_info['id'] 512 | pdf_urls = get_pdf_url(book_id) 513 | 514 | if pdf_urls: 515 | publisher = next((tag['tag_name'] for tag in book_info['tag_list'] if '版' in tag['tag_name']), '') 516 | book_name = f"{publisher}{book_info['title']}" 517 | 518 | success = download_pdf_with_cdn_fallback(pdf_urls, book_name, headers, work_path) 519 | if success: 520 | total_processed += 1 521 | print(f" ✅ Successfully downloaded sequence number: {seq_num}") 522 | else: 523 | failed_books.append((seq_num, book_name, "Download failed")) 524 | else: 525 | print(f"⚠️ Could not get PDF URLs for sequence number: {seq_num}") 526 | failed_books.append((seq_num, book_info.get('title', 'Unknown'), "No PDF URLs")) 527 | else: 528 | print(f"⚠️ Sequence number {seq_num} not found") 529 | failed_books.append((seq_num, "Unknown", "Not found")) 530 | 531 | # Small delay between books to be respectful to the server 532 | time.sleep(1) 533 | 534 | # Summary report 535 | print(f"\n🎉 Range download complete! Successfully processed {total_processed} textbooks") 536 | 537 | if failed_books: 538 | print(f"\n⚠️ Failed {len(failed_books)} books:") 539 | for seq_num, title, reason in failed_books: 540 | print(f" • Sequence {seq_num}: {title} - {reason}") 541 | 542 | except ValueError: 543 | print(f"❌ Invalid range format: {book_range}. Use format like '200-250' or '200'") 544 | 545 | 546 | def _download_legacy_mode(table: int, item: int, single_book: Optional[int], 547 | download_limit: Optional[int], headers: Dict[str, str], work_path: str) -> None: 548 | """ 549 | Legacy download mode using catalog-based approach. 550 | 551 | This function maintains compatibility with the original script's functionality 552 | while adding enhanced error handling and CDN fallback. 553 | 554 | Args: 555 | table (int): Starting catalog index 556 | item (int): Starting item index within the catalog 557 | single_book (Optional[int]): Specific book number to download 558 | download_limit (Optional[int]): Maximum number of books to download 559 | headers (Dict[str, str]): HTTP headers for the request 560 | work_path (str): Directory where the PDFs should be saved 561 | """ 562 | print("📚 Using legacy download mode...") 563 | 564 | try: 565 | url = get_parts() 566 | except Exception as e: 567 | print(f"❌ Failed to get catalog URLs: {e}") 568 | return 569 | 570 | t = 0 + table 571 | total_processed = 0 572 | book_counter = item 573 | 574 | # Process each catalog 575 | for ref in url[table:]: 576 | print(f"正在下载目录{t+1}/{len(url)}中的电子教材") 577 | 578 | try: 579 | response = requests.get(ref, headers=headers, timeout=30) 580 | response.raise_for_status() 581 | info = json.loads(response.text) 582 | 583 | c = 0 + item 584 | for i in info[item:]: 585 | book_counter += 1 586 | 587 | # Skip if not the requested single book 588 | if single_book is not None and book_counter != single_book: 589 | c += 1 590 | continue 591 | 592 | # Check if we've reached the download limit 593 | if download_limit is not None and total_processed >= download_limit: 594 | print(f"已达到下载限制 ({download_limit} 本教材)") 595 | return 596 | 597 | try: 598 | book_id = i['id'] 599 | publisher = next((tag['tag_name'] for tag in i['tag_list'] if '版' in tag['tag_name']), '') 600 | book_name = f"{publisher}{i['title']}" 601 | 602 | print(f"📖 Processing: {book_name}") 603 | 604 | # Get the PDF URLs with CDN fallback 605 | pdf_urls = get_pdf_url(book_id) 606 | 607 | if pdf_urls: 608 | success = download_pdf_with_cdn_fallback(pdf_urls, book_name, headers, work_path) 609 | if success: 610 | print(f" ✅ Successfully downloaded: {book_name}") 611 | total_processed += 1 612 | else: 613 | print(f"❌ Failed to download: {book_name}") 614 | else: 615 | print(f"❌ Could not get PDF URLs for {book_name}") 616 | 617 | except Exception as e: 618 | print(f"❌ Error processing {book_name}: {str(e)}") 619 | 620 | # If we're downloading a single book and found it, we can return 621 | if single_book is not None and book_counter == single_book: 622 | return 623 | 624 | c += 1 625 | 626 | except requests.RequestException as e: 627 | print(f"❌ Network error processing catalog {t+1}: {e}") 628 | except json.JSONDecodeError as e: 629 | print(f"❌ Invalid JSON in catalog {t+1}: {e}") 630 | except Exception as e: 631 | print(f"❌ Unexpected error processing catalog {t+1}: {e}") 632 | 633 | t += 1 634 | item = 0 635 | 636 | print(f"\n🎉 Download complete! Processed {total_processed} textbooks") 637 | 638 | 639 | if __name__ == "__main__": 640 | # Set up command-line argument parser with comprehensive help 641 | parser = argparse.ArgumentParser( 642 | description=''' 643 | Enhanced Textbook Downloader from the National Smart Education Platform (国家中小学智慧教育平台) 644 | 645 | This enhanced script downloads complete PDF textbooks with CDN fallback logic, multiple download modes, 646 | and improved error handling. It now supports downloading by sequence number, book range, and book ID. 647 | 648 | The downloaded PDFs will be saved to: 649 | ~/Downloads/textbook_download/ 650 | 651 | FEATURES: 652 | - CDN Fallback: Automatically tries r1, r2, r3 endpoints if one fails 653 | - Multiple Download Modes: Flexible options for different use cases 654 | - Enhanced Error Handling: Detailed error messages and graceful fallbacks 655 | - Progress Tracking: Real-time download status and file size information 656 | - Robust Network Handling: Timeouts, retries, and connection error handling 657 | ''', 658 | formatter_class=argparse.RawDescriptionHelpFormatter, 659 | epilog=''' 660 | DOWNLOAD MODES: 661 | ============== 662 | 663 | 1. BY SEQUENCE NUMBER (--sequence): 664 | Downloads a specific book by its global sequence number across all catalogs. 665 | Example: --sequence 2548 (downloads the 2548th book across all catalogs) 666 | 667 | This is useful when you know the exact position of a book in the entire collection. 668 | The script automatically calculates which catalog and position contains the book. 669 | 670 | 2. BY BOOK RANGE (--range): 671 | Downloads multiple books within a specified range. 672 | Example: --range "200-250" (downloads books from sequence 200 to 250) 673 | 674 | Range format: "start-end" or just "start" for a single book. 675 | The script will process each book in the range and provide a summary report. 676 | 677 | 3. BY BOOK ID (--book-id): 678 | Downloads a specific book by its unique identifier (UUID). 679 | Example: --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e" 680 | 681 | This is useful when you have the exact book ID from the metadata. 682 | 683 | 4. LEGACY MODES: 684 | - --single N: Download only the Nth textbook from the catalog 685 | - --limit N: Download only N textbooks (starting from the beginning) 686 | - --table N: Start from catalog N (0-based indexing) 687 | - --item N: Start from item N within the catalog (0-based indexing) 688 | 689 | EXAMPLES: 690 | ========= 691 | 692 | # Download by sequence number 693 | python pdf_book_download_from_zxxeducn.py --sequence 2548 694 | 695 | # Download a range of books 696 | python pdf_book_download_from_zxxeducn.py --range "1-5" 697 | 698 | # Download by book ID 699 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e" 700 | 701 | # Legacy single book download 702 | python pdf_book_download_from_zxxeducn.py --single 1 703 | 704 | # Legacy limited download 705 | python pdf_book_download_from_zxxeducn.py --limit 10 706 | 707 | # Resume interrupted download 708 | python pdf_book_download_from_zxxeducn.py --table 1 --item 5 709 | 710 | TECHNICAL DETAILS: 711 | ================== 712 | 713 | CDN Endpoints: The script automatically tries r1-ndr-oversea, r2-ndr-oversea, and r3-ndr-oversea 714 | in sequence if one fails, ensuring reliable downloads. 715 | 716 | File Validation: Downloads are validated to ensure they are actual PDF files (>1MB) and not error pages. 717 | 718 | Error Handling: Comprehensive error handling with detailed messages and graceful fallbacks. 719 | 720 | Network Timeouts: 30-second timeout for all network requests to prevent hanging. 721 | 722 | Output Directory: All downloads are saved to ~/Downloads/textbook_download/ with descriptive filenames. 723 | ''' 724 | ) 725 | 726 | # New download modes 727 | parser.add_argument( 728 | '--sequence', 729 | type=int, 730 | help='Download by global sequence number (e.g., 2548 for the 2548th book across all catalogs)' 731 | ) 732 | parser.add_argument( 733 | '--range', 734 | type=str, 735 | help='Download by book range (e.g., "200-250" for books 200 to 250, or "200" for single book)' 736 | ) 737 | parser.add_argument( 738 | '--book-id', 739 | type=str, 740 | help='Download by specific book ID (UUID format, e.g., "bdc00134-465d-454b-a541-dcd0cec4d86e")' 741 | ) 742 | 743 | # Legacy modes 744 | parser.add_argument( 745 | '--single', 746 | type=int, 747 | help='Download only one specific book number from the catalog (legacy mode)' 748 | ) 749 | parser.add_argument( 750 | '--limit', 751 | type=int, 752 | help='Limit the number of books to download in this run (legacy mode)' 753 | ) 754 | parser.add_argument( 755 | '--table', 756 | type=int, 757 | default=0, 758 | help='Start from specific catalog index (0-based, legacy mode). Range: 0-3' 759 | ) 760 | parser.add_argument( 761 | '--item', 762 | type=int, 763 | default=0, 764 | help='Start from specific item index within the catalog (0-based, legacy mode). Range: 0-999' 765 | ) 766 | 767 | # Parse command-line arguments 768 | args = parser.parse_args() 769 | 770 | # Call the enhanced download function with all parsed arguments 771 | pdf_download( 772 | table=args.table, 773 | item=args.item, 774 | single_book=args.single, 775 | download_limit=args.limit, 776 | sequence_number=args.sequence, 777 | book_range=args.range, 778 | book_id=args.book_id 779 | ) --------------------------------------------------------------------------------