├── textbook_info.py
├── .gitignore
├── README.md
└── pdf_book_download_from_zxxeducn.py


/textbook_info.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import csv
 4 | from pathlib import Path
 5 | import os
 6 | 
 7 | def get_parts(return_type='json'):
 8 |     '''get urls return list'''
 9 |     url = 'https://s-file-1.ykt.cbern.com.cn/zxx/ndrs/resources/tch_material/version/data_version.json'
10 |     headers = {
11 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
12 |         "Referer": "https://basic.smartedu.cn/",
13 |         "Origin": "https://basic.smartedu.cn"
14 |     }
15 |     req = requests.get(url=url, headers=headers)
16 |     
17 |     if return_type == 'json':
18 |         data = json.loads(req.text)
19 |     else:
20 |         data = req.text
21 |     return data['urls'].split(',')
22 | 
def save_textbook_info():
    """
    Export the number, ID and display name of every listed textbook to a CSV.

    The catalog URLs come from get_parts(); each catalog is fetched and parsed
    as a JSON list of book records. One row is written per record:
    ``Number`` (a running counter starting at 1), ``Book ID`` (the record's
    'id') and ``Book Name`` (the first tag name containing '版', if any,
    followed by the record's 'title').

    The file is written to ``~/Downloads/textbook_info.csv`` with a UTF-8 BOM
    so spreadsheet applications detect the encoding of the Chinese titles.

    Raises:
        requests.RequestException: If a catalog cannot be fetched. The export
            stops rather than skipping the catalog, because every following
            row number would otherwise shift.
        json.JSONDecodeError: If a catalog response is not valid JSON.
        OSError: If the CSV file cannot be created or written.
    """
    urls = get_parts()

    headers = {
        'Referer': 'https://basic.smartedu.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Origin': 'https://basic.smartedu.cn',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }

    # Prepare the output location; create the folder when it is missing so
    # open() below does not fail on systems without a Downloads directory.
    home = str(Path.home())
    dir_path = os.path.join(home, "Downloads")
    os.makedirs(dir_path, exist_ok=True)
    csv_path = os.path.join(dir_path, "textbook_info.csv")

    book_number = 1  # Running row number across all catalogs

    # 'utf-8-sig' prepends a BOM so the Chinese text opens correctly in Excel
    with open(csv_path, 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Number', 'Book ID', 'Book Name'])

        for index, ref in enumerate(urls, 1):
            print(f"Processing directory {index}/{len(urls)}")
            # Bound the wait and reject HTTP error pages before parsing.
            response = requests.get(ref, headers=headers, timeout=30)
            response.raise_for_status()
            response.encoding = 'utf-8'  # Explicitly set response encoding
            info = json.loads(response.text)

            for book in info:
                # Only the record parsing is guarded: a malformed record is
                # reported and skipped, while write errors still propagate.
                # NOTE(review): a skipped record does not consume a number, so
                # later numbers no longer equal the catalog positions; confirm
                # this is acceptable for the downloader's --sequence option.
                try:
                    book_id = book['id']
                    publisher = next((tag['tag_name'] for tag in book['tag_list'] if '版' in tag['tag_name']), '')
                    book_name = f"{publisher}{book['title']}"
                except (KeyError, TypeError) as e:
                    print(f"Error processing book: {str(e)}")
                    continue

                writer.writerow([book_number, book_id, book_name])
                book_number += 1

    print(f"CSV file has been saved to: {csv_path}")
69 | 
70 | if __name__ == "__main__":
71 |     save_textbook_info()


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be added to the global gitignore or merged into this project gitignore.  For a PyCharm
158 | #  project, it is recommended to include the following files:
159 | #  .idea/
160 | #  *.iml
161 | #  *.ipr
162 | #  *.iws
163 | .idea/
164 | *.iml
165 | *.ipr
166 | *.iws
167 | 
168 | # VS Code
169 | .vscode/
170 | 
171 | # macOS
172 | .DS_Store
173 | .AppleDouble
174 | .LSOverride
175 | 
176 | # Windows
177 | Thumbs.db
178 | ehthumbs.db
179 | Desktop.ini
180 | 
181 | # Linux
182 | *~
183 | 
184 | # Project specific
185 | # Downloaded textbooks (these can be large and should not be in version control); note that git does not expand "~", so the first pattern below is matched literally
186 | ~/Downloads/textbook_download/
187 | Downloads/textbook_download/
188 | 
189 | # CSV files with textbook metadata (these can be regenerated)
190 | *.csv
191 | 
192 | # Log files
193 | *.log
194 | 
195 | # Temporary files
196 | *.tmp
197 | *.temp
198 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Enhanced Textbook Downloader from National Smart Education Platform
  2 | 
  3 | [English](#english) | [中文](#中文)
  4 | 
  5 | ---
  6 | 
  7 | ## English
  8 | 
  9 | ### 📚 Overview
 10 | 
 11 | This enhanced script downloads complete PDF textbooks from the Chinese National Smart Education Platform (国家中小学智慧教育平台) with multiple download modes, comprehensive error handling, and flexible download controls.
 12 | 
 13 | ### ✨ Features
 14 | 
 15 | - **Multiple Download Modes**: Sequence number, book range, book ID, and legacy catalog-based approaches
 16 | - **CDN Fallback Logic**: Automatically tries r1, r2, r3 endpoints if one fails
 17 | - **Enhanced Error Handling**: Detailed error messages and graceful fallbacks
 18 | - **Progress Tracking**: Real-time download status and file size information
 19 | - **Flexible Controls**: Download specific books, ranges, or use legacy catalog-based approach
 20 | - **Robust Network Handling**: Timeouts, retries, and connection error handling
 21 | 
 22 | ### 🚀 Download Modes
 23 | 
 24 | #### 1. **By Sequence Number** (`--sequence`)
 25 | Downloads a specific book by its global sequence number across all catalogs.
 26 | 
 27 | ```bash
 28 | python pdf_book_download_from_zxxeducn.py --sequence 2548
 29 | ```
 30 | 
 31 | #### 2. **By Book Range** (`--range`)
 32 | Downloads multiple books within a specified range.
 33 | 
 34 | ```bash
 35 | python pdf_book_download_from_zxxeducn.py --range "200-250"
 36 | python pdf_book_download_from_zxxeducn.py --range "200"  # Single book
 37 | ```
 38 | 
 39 | #### 3. **By Book ID** (`--book-id`)
 40 | Downloads a specific book by its unique identifier (UUID).
 41 | 
 42 | ```bash
 43 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e"
 44 | ```
 45 | 
 46 | #### 4. **Legacy Modes**
 47 | - `--single N`: Download only the Nth textbook from the catalog
 48 | - `--limit N`: Download only N textbooks (starting from the beginning)
 49 | - `--table N`: Start from catalog N (0-based indexing)
 50 | - `--item N`: Start from item N within the catalog (0-based indexing)
 51 | 
 52 | ### 📋 Requirements
 53 | 
 54 | - Python 3.6+
 55 | - `requests` library
 56 | - Internet connection
 57 | - Access to the National Smart Education Platform
 58 | 
 59 | ### 🛠️ Installation
 60 | 
 61 | 1. Clone or download the script
 62 | 2. Install required dependencies:
 63 | ```bash
 64 | pip install requests
 65 | ```
 66 | 
 67 | ### 📖 Usage Examples
 68 | 
 69 | ```bash
 70 | # Download by sequence number
 71 | python pdf_book_download_from_zxxeducn.py --sequence 2548
 72 | 
 73 | # Download a range of books
 74 | python pdf_book_download_from_zxxeducn.py --range "1-5"
 75 | 
 76 | # Download by book ID
 77 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e"
 78 | 
 79 | # Legacy single book download
 80 | python pdf_book_download_from_zxxeducn.py --single 1
 81 | 
 82 | # Legacy limited download
 83 | python pdf_book_download_from_zxxeducn.py --limit 10
 84 | 
 85 | # Resume interrupted download
 86 | python pdf_book_download_from_zxxeducn.py --table 1 --item 5
 87 | ```
 88 | 
 89 | ### 🔧 Technical Details
 90 | 
 91 | - **CDN Endpoints**: Automatically tries r1-ndr-oversea, r2-ndr-oversea, and r3-ndr-oversea in sequence
 92 | - **File Validation**: Downloads are validated to ensure they are actual PDF files (>1MB)
 93 | - **Network Timeouts**: 30-second timeout for all network requests
 94 | - **Output Directory**: All downloads are saved to `~/Downloads/textbook_download/`
 95 | 
 96 | ### 📁 Output
 97 | 
 98 | Downloaded PDFs are saved to:
 99 | ```
100 | ~/Downloads/textbook_download/
101 | ├── 统编版（根据2022年版课程标准修订）义务教育教科书·道德与法治一年级上册.pdf
102 | ├── 统编版（根据2022年版课程标准修订）义务教育教科书·道德与法治一年级下册.pdf
103 | └── ...
104 | ```
105 | 
106 | ### 📊 Companion Script: textbook_info.py
107 | 
108 | The `textbook_info.py` script is a companion tool that collects metadata for all available textbooks and exports it to a CSV file. This is useful for:
109 | 
110 | - **Finding specific textbooks**: Search through the CSV to locate books by title, publisher, or other criteria
111 | - **Planning downloads**: See the complete catalog before deciding what to download
112 | - **Resume functionality**: Use the sequence numbers to resume interrupted downloads
113 | 
114 | #### Usage:
115 | ```bash
116 | python textbook_info.py
117 | ```
118 | 
119 | #### Output:
120 | - Creates a CSV file in your Downloads folder
121 | - Contains three columns: Number (global sequence number), Book ID, and Book Name (edition tag followed by the title)
122 | - Useful for determining the correct parameters for the main download script
123 | 
124 | #### Example CSV structure:
125 | ```csv
126 | Number,Book ID,Book Name
127 | 1,bdc00134-465d-454b-a541-dcd0cec4d86e,统编版义务教育教科书·道德与法治一年级上册
128 | 2,bdc00135-465d-454b-a541-dcd0cec4d86e,统编版义务教育教科书·道德与法治一年级下册
129 | ...
130 | ```
131 | 
132 | ### 🐛 Troubleshooting
133 | 
134 | - **Network Errors**: Check your internet connection and firewall settings
135 | - **Permission Errors**: Ensure you have write access to the Downloads folder
136 | - **Timeout Errors**: The script will automatically retry with different CDN endpoints
137 | 
138 | ### 📝 License
139 | 
140 | Open source - feel free to use and modify as needed.
141 | 
142 | ---
143 | 
144 | ## 中文
145 | 
146 | ### 📚 概述
147 | 
148 | 这是一个增强版的脚本，用于从国家中小学智慧教育平台下载完整的PDF教材，支持多种下载模式、全面的错误处理和灵活的下载控制。
149 | 
150 | ### ✨ 功能特点
151 | 
152 | - **多种下载模式**: 序列号、书籍范围、书籍ID和传统目录方式
153 | - **CDN故障转移**: 自动尝试r1、r2、r3端点，如果一个失败则切换到下一个
154 | - **增强错误处理**: 详细的错误信息和优雅的故障转移
155 | - **进度跟踪**: 实时下载状态和文件大小信息
156 | - **灵活控制**: 下载特定书籍、范围或使用传统目录方式
157 | - **稳健网络处理**: 超时、重试和连接错误处理
158 | 
159 | ### 🚀 下载模式
160 | 
161 | #### 1. **按序列号下载** (`--sequence`)
162 | 通过全局序列号下载特定书籍（跨所有目录）。
163 | 
164 | ```bash
165 | python pdf_book_download_from_zxxeducn.py --sequence 2548
166 | ```
167 | 
168 | #### 2. **按书籍范围下载** (`--range`)
169 | 下载指定范围内的多本书籍。
170 | 
171 | ```bash
172 | python pdf_book_download_from_zxxeducn.py --range "200-250"
173 | python pdf_book_download_from_zxxeducn.py --range "200"  # 单本书
174 | ```
175 | 
176 | #### 3. **按书籍ID下载** (`--book-id`)
177 | 通过唯一标识符（UUID）下载特定书籍。
178 | 
179 | ```bash
180 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e"
181 | ```
182 | 
183 | #### 4. **传统模式**
184 | - `--single N`: 仅下载目录中的第N本教材
185 | - `--limit N`: 限制本次运行下载的书籍数量
186 | - `--table N`: 从目录N开始（基于0的索引）
187 | - `--item N`: 从目录中的项目N开始（基于0的索引）
188 | 
189 | ### 📋 系统要求
190 | 
191 | - Python 3.6+
192 | - `requests` 库
193 | - 网络连接
194 | - 访问国家中小学智慧教育平台的权限
195 | 
196 | ### 🛠️ 安装
197 | 
198 | 1. 克隆或下载脚本
199 | 2. 安装所需依赖：
200 | ```bash
201 | pip install requests
202 | ```
203 | 
204 | ### 📖 使用示例
205 | 
206 | ```bash
207 | # 按序列号下载
208 | python pdf_book_download_from_zxxeducn.py --sequence 2548
209 | 
210 | # 下载书籍范围
211 | python pdf_book_download_from_zxxeducn.py --range "1-5"
212 | 
213 | # 按书籍ID下载
214 | python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e"
215 | 
216 | # 传统单本书下载
217 | python pdf_book_download_from_zxxeducn.py --single 1
218 | 
219 | # 传统限制下载
220 | python pdf_book_download_from_zxxeducn.py --limit 10
221 | 
222 | # 恢复中断的下载
223 | python pdf_book_download_from_zxxeducn.py --table 1 --item 5
224 | ```
225 | 
226 | ### 🔧 技术细节
227 | 
228 | - **CDN端点**: 自动按顺序尝试r1-ndr-oversea、r2-ndr-oversea和r3-ndr-oversea
229 | - **文件验证**: 验证下载内容确保是实际的PDF文件（>1MB）
230 | - **网络超时**: 所有网络请求30秒超时
231 | - **输出目录**: 所有下载保存到`~/Downloads/textbook_download/`
232 | 
233 | ### 📁 输出
234 | 
235 | 下载的PDF文件保存到：
236 | ```
237 | ~/Downloads/textbook_download/
238 | ├── 统编版（根据2022年版课程标准修订）义务教育教科书·道德与法治一年级上册.pdf
239 | ├── 统编版（根据2022年版课程标准修订）义务教育教科书·道德与法治一年级下册.pdf
240 | └── ...
241 | ```
242 | 
243 | ### 📊 配套脚本：textbook_info.py
244 | 
245 | `textbook_info.py` 脚本是一个配套工具，用于收集所有可用教材的元数据并导出到CSV文件。这对于以下情况很有用：
246 | 
247 | - **查找特定教材**: 通过CSV搜索按标题、出版社或其他条件定位书籍
248 | - **规划下载**: 在决定下载内容之前查看完整目录
249 | - **恢复功能**: 使用序列号恢复中断的下载
250 | 
251 | #### 使用方法：
252 | ```bash
253 | python textbook_info.py
254 | ```
255 | 
256 | #### 输出：
257 | - 在Downloads文件夹中创建CSV文件
258 | - 包含三列：Number（全局序列号）、Book ID（书籍ID）和 Book Name（版本标签加标题）
259 | - 有助于确定主下载脚本的正确参数
260 | 
261 | #### CSV结构示例：
262 | ```csv
263 | Number,Book ID,Book Name
264 | 1,bdc00134-465d-454b-a541-dcd0cec4d86e,统编版义务教育教科书·道德与法治一年级上册
265 | 2,bdc00135-465d-454b-a541-dcd0cec4d86e,统编版义务教育教科书·道德与法治一年级下册
266 | ...
267 | ```
268 | 
269 | ### 🐛 故障排除
270 | 
271 | - **网络错误**: 检查网络连接和防火墙设置
272 | - **权限错误**: 确保对Downloads文件夹有写入权限
273 | - **超时错误**: 脚本将自动尝试不同的CDN端点
274 | 
275 | ### 📝 许可证
276 | 
277 | 开源 - 可自由使用和修改。
278 | 
279 | ---
280 | 
281 | ## 🔄 Version History
282 | 
283 | - **v3.0.0**: Modified the download path and added more download modes. Enhanced documentation, type hints, and modular architecture
284 | - **v2.0.0**: Modified the download path and added new download control
285 | - **v1.0.0**: Original script with basic functionality
286 | 
287 | ## 🤝 Contributing
288 | 
289 | Feel free to submit issues, feature requests, or pull requests to improve this script.
290 | 
291 | ## 📞 Support
292 | 
293 | If you encounter any issues or have questions, please check the troubleshooting section above or create an issue in the repository.
294 | 


--------------------------------------------------------------------------------
/pdf_book_download_from_zxxeducn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Enhanced Textbook Downloader from the National Smart Education Platform (国家中小学智慧教育平台)
  4 | 
  5 | This script provides multiple methods to download complete PDF textbooks from the Chinese national
  6 | education platform, comprehensive error handling, and flexible download controls.
  7 | 
  8 | License: Open source
  9 | Version: 3.0.0
 10 | 
 11 | Features:
 12 | - CDN fallback: Automatically tries r1, r2, r3 endpoints if one fails
 13 | - Multiple download modes: Sequence number, book range, book ID, and legacy modes
 14 | - Enhanced error handling: Detailed error messages and graceful fallbacks
 15 | - Progress tracking: Real-time download status and file size information
 16 | - Flexible controls: Download specific books, ranges, or use legacy catalog-based approach
 17 | """
 18 | 
 19 | import requests
 20 | import json
 21 | import os
 22 | from pathlib import Path
 23 | from urllib.parse import quote
 24 | import argparse
 25 | import time
 26 | from typing import List, Tuple, Optional, Dict, Any, Union
 27 | 
 28 | # Get user's home directory
 29 | home = str(Path.home())
 30 | 
 31 | # Construct the path using os.path.join for cross-platform compatibility
 32 | dir_path = os.path.join(home, "Downloads")
 33 | 
 34 | # Verify the directory exists
 35 | if not os.path.exists(dir_path):
 36 |     raise FileNotFoundError(f"Directory not found: {dir_path}")
 37 | 
 38 | 
 39 | def get_parts(return_type: str = 'json') -> List[str]:
 40 |     """
 41 |     Fetch the catalog URLs from the National Smart Education Platform.
 42 |     
 43 |     This function retrieves the list of catalog URLs that contain textbook metadata.
 44 |     There are typically 4 catalogs, with the first 3 containing up to 1000 books each.
 45 |     
 46 |     Args:
 47 |         return_type (str): Type of return value. 'json' returns parsed JSON, 
 48 |                           any other value returns raw text. Default: 'json'
 49 |     
 50 |     Returns:
 51 |         List[str]: List of catalog URLs for fetching textbook metadata
 52 |         
 53 |     Raises:
 54 |         requests.RequestException: If the HTTP request fails
 55 |         json.JSONDecodeError: If the response is not valid JSON (when return_type='json')
 56 |         
 57 |     Example:
 58 |         >>> urls = get_parts()
 59 |         >>> print(f"Found {len(urls)} catalogs")
 60 |         Found 4 catalogs
 61 |     """
 62 |     url = 'https://s-file-1.ykt.cbern.com.cn/zxx/ndrs/resources/tch_material/version/data_version.json'
 63 |     headers = {
 64 |         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
 65 |         "Referer": "https://basic.smartedu.cn/",
 66 |         "Origin": "https://basic.smartedu.cn"
 67 |     }
 68 |     
 69 |     try:
 70 |         req = requests.get(url=url, headers=headers, timeout=30)
 71 |         req.raise_for_status()  # Raise exception for bad status codes
 72 |         
 73 |         if return_type == 'json':
 74 |             data = json.loads(req.text)
 75 |             return data['urls'].split(',')
 76 |         else:
 77 |             return req.text
 78 |     except requests.RequestException as e:
 79 |         print(f"❌ Failed to fetch catalog URLs: {e}")
 80 |         raise
 81 |     except json.JSONDecodeError as e:
 82 |         print(f"❌ Invalid JSON response from catalog API: {e}")
 83 |         raise
 84 | 
 85 | 
 86 | def get_pdf_url(book_id: str) -> Optional[List[str]]:
 87 |     """
 88 |     Retrieve PDF download URLs for a specific textbook using its book ID.
 89 |     
 90 |     This function fetches the textbook metadata and extracts all available CDN endpoints
 91 |     for PDF downloads. It transforms the private URLs to oversea URLs for public access.
 92 |     
 93 |     Args:
 94 |         book_id (str): The unique identifier (UUID) of the textbook
 95 |         
 96 |     Returns:
 97 |         Optional[List[str]]: List of CDN URLs for PDF download, or None if failed
 98 |         
 99 |     Raises:
100 |         requests.RequestException: If the HTTP request fails
101 |         json.JSONDecodeError: If the metadata response is not valid JSON
102 |         KeyError: If the expected metadata structure is missing
103 |         
104 |     Example:
105 |         >>> urls = get_pdf_url("bdc00134-465d-454b-a541-dcd0cec4d86e")
106 |         >>> if urls:
107 |         ...     print(f"Found {len(urls)} CDN endpoints")
108 |         Found 3 CDN endpoints
109 |     """
110 |     try:
111 |         # Construct the metadata API URL
112 |         json_url = f"https://s-file-1.ykt.cbern.com.cn/zxx/ndrv2/resources/tch_material/details/{book_id}.json"
113 |         
114 |         # Fetch the textbook metadata
115 |         response = requests.get(json_url)
116 |         response.raise_for_status()
117 |         
118 |         # Parse the JSON response
119 |         data = response.json()
120 |         
121 |         # Search for the source item (contains PDF download links)
122 |         for item in data['ti_items']:
123 |             if item.get('ti_file_flag') == 'source':
124 |                 if 'ti_storages' in item and item['ti_storages']:
125 |                     # Transform all private URLs to oversea URLs
126 |                     pdf_urls = []
127 |                     for storage_url in item['ti_storages']:
128 |                         oversea_url = storage_url.replace('-private', '-oversea')
129 |                         pdf_urls.append(oversea_url)
130 |                     return pdf_urls
131 |         
132 |         # No source item found
133 |         print(f"⚠️ No PDF source found for book ID: {book_id}")
134 |         return None
135 |             
136 |     except requests.RequestException as e:
137 |         print(f"❌ Network error getting metadata for {book_id}: {e}")
138 |         return None
139 |     except json.JSONDecodeError as e:
140 |         print(f"❌ Invalid JSON response for {book_id}: {e}")
141 |         return None
142 |     except KeyError as e:
143 |         print(f"❌ Unexpected metadata structure for {book_id}: {e}")
144 |         return None
145 |     except Exception as e:
146 |         print(f"❌ Unexpected error getting metadata for {book_id}: {e}")
147 |         import traceback
148 |         print("Full traceback:")
149 |         print(traceback.format_exc())
150 |         return None
151 | 
152 | 
def download_pdf_with_cdn_fallback(pdf_urls: List[str], book_name: str,
                                  headers: Dict[str, str], work_path: str) -> bool:
    """
    Download a PDF textbook, trying each candidate URL until one succeeds.

    The URLs are requested in list order with a 30-second timeout. A response
    is accepted only when its status is 200, its Content-Type contains 'pdf'
    and its body is larger than 1,000,000 bytes; anything else is reported
    and the next URL is tried. The accepted body is written to
    ``<work_path>/<book_name>.pdf``.

    Args:
        pdf_urls (List[str]): Candidate URLs to try, in order
        book_name (str): Name of the textbook (used for the filename; path
                         separators are replaced with '_')
        headers (Dict[str, str]): HTTP headers for the download request
        work_path (str): Existing directory where the PDF should be saved

    Returns:
        bool: True if a PDF was saved, False if ``pdf_urls`` is empty or
        every URL failed.

    Raises:
        Nothing. Network and file-system errors are caught and reported on
        stdout, then the next URL is tried.

    Example:
        >>> success = download_pdf_with_cdn_fallback(urls, "Math Book", headers, "/downloads")
        >>> if success:
        ...     print("Download completed successfully")
    """
    if not pdf_urls:
        print(f"❌ No PDF URLs available for {book_name}")
        return False

    # The title comes from remote metadata. A path separator inside it would
    # make os.path.join point into another directory, so neutralise those
    # (plus the characters Windows forbids in filenames when running there).
    unsafe_chars = '/\\' + ('<>:"|?*' if os.name == 'nt' else '')
    safe_name = book_name.translate({ord(ch): '_' for ch in unsafe_chars})
    file_path = os.path.join(work_path, f"{safe_name}.pdf")

    # Try each CDN endpoint in sequence
    for i, pdf_url in enumerate(pdf_urls):
        # Label used in messages only; derived from the list position
        cdn_name = f"r{i+1}-ndr-oversea"

        try:
            # Attempt to download the PDF with timeout
            pdf_response = requests.get(pdf_url, headers=headers, timeout=30)

            if pdf_response.status_code != 200:
                print(f"    ❌ {cdn_name} failed: Status {pdf_response.status_code}")
                continue

            # Validate the downloaded content
            payload = pdf_response.content
            content_length = len(payload)
            content_type = pdf_response.headers.get('content-type', '')

            # Check if we got a valid PDF (not an error page)
            if 'pdf' not in content_type.lower() or content_length <= 1000000:  # <= 1MB
                print(f"    ⚠️ {cdn_name} returned invalid content: {content_type}, {content_length} bytes")
                continue

            # Save the PDF to disk
            try:
                with open(file_path, 'wb') as f:
                    f.write(payload)
            except OSError as e:
                print(f"    ❌ Failed to save file: {e}")
                continue

            print(f"    💾 Downloaded: {book_name}  {content_length / (1024*1024):.1f} MB")
            return True

        except requests.exceptions.Timeout:
            print(f"    ⏰ {cdn_name} timeout after 30 seconds")
        except requests.exceptions.RequestException as e:
            print(f"    ❌ {cdn_name} network error: {e}")
        except Exception as e:
            print(f"    ❌ {cdn_name} unexpected error: {e}")

    # All CDN endpoints failed
    print(f"❌ All CDN endpoints failed for {book_name}")
    return False
228 | 
229 | 
def get_book_by_sequence_number(catalog_urls: List[str], sequence_number: int) -> Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]:
    """
    Locate a textbook by its global sequence number across all catalogs.

    Catalogs are fetched in order and their lengths accumulated, so sequence
    number 1 is the first entry of the first catalog and numbering continues
    into each following catalog. Fetching stops at the catalog that contains
    the requested number.

    If a catalog that precedes (or contains) the target cannot be loaded, its
    length is unknown and every later position would be mis-numbered. The
    lookup therefore stops and reports "not found" instead of guessing.

    Args:
        catalog_urls (List[str]): List of catalog API URLs
        sequence_number (int): Global sequence number of the textbook (1-based)

    Returns:
        Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]:
            - book_info: The catalog entry for the textbook, or None if not found
            - catalog_index: 0-based index of the catalog, or None if not found
            - catalog_position: 0-based position within the catalog, or None if not found

    Raises:
        Nothing. Network and JSON errors are caught and reported on stdout,
        and the function returns (None, None, None).

    Example:
        >>> book_info, cat_idx, cat_pos = get_book_by_sequence_number(urls, 2548)
        >>> if book_info:
        ...     print(f"Found book {book_info['title']} in catalog {cat_idx + 1}")
    """
    not_found = (None, None, None)

    # Validate input
    if sequence_number < 1:
        print(f"❌ Invalid sequence number: {sequence_number} (must be >= 1)")
        return not_found

    first_in_catalog = 1  # Sequence number of the current catalog's first book

    for catalog_index, catalog_url in enumerate(catalog_urls):
        # Fetch catalog data; only the fetch/parse step can raise here
        try:
            response = requests.get(catalog_url, timeout=30)
            response.raise_for_status()
            info = json.loads(response.text)
        except requests.RequestException as e:
            print(f"❌ Network error processing catalog {catalog_index + 1}: {e}")
            return not_found
        except json.JSONDecodeError as e:
            print(f"❌ Invalid JSON in catalog {catalog_index + 1}: {e}")
            return not_found

        if not isinstance(info, list):
            # Positions are only meaningful for a JSON array of books
            print(f"❌ Unexpected error processing catalog {catalog_index + 1}: "
                  f"expected a list, got {type(info).__name__}")
            return not_found

        # Invariant: sequence_number >= first_in_catalog, so this is >= 0
        catalog_position = sequence_number - first_in_catalog
        if catalog_position < len(info):
            return info[catalog_position], catalog_index, catalog_position

        # Target lies beyond this catalog; advance to the next one's range
        first_in_catalog += len(info)

    # Sequence number not found in any catalog
    print(f"❌ Sequence number {sequence_number} not found in any catalog")
    return not_found
302 | 
303 | 
def pdf_download(table: int = 0, item: int = 0, single_book: Optional[int] = None,
                 download_limit: Optional[int] = None, sequence_number: Optional[int] = None,
                 book_range: Optional[str] = None, book_id: Optional[str] = None) -> None:
    """
    Run one textbook download mode, selected from the supplied arguments.

    The function makes sure the ``textbook_download`` folder exists under the
    module-level ``dir_path`` and then hands over to exactly one mode handler.
    Modes are checked in the following order and the first match wins:

    1. ``book_id``          -> ``_download_by_book_id``
    2. ``sequence_number``  -> ``_download_by_sequence_number``
    3. ``book_range``       -> ``_download_by_book_range``
    4. legacy (``single_book``, ``download_limit``, ``table > 0`` or
       ``item > 0``)        -> ``_download_legacy_mode``

    If none of the above applies, a hint pointing at ``--help`` is printed.

    Args:
        table (int): Starting catalog index (0-based). Default: 0
        item (int): Starting item index within the catalog (0-based). Default: 0
        single_book (Optional[int]): Download only one specific book number. Default: None
        download_limit (Optional[int]): Limit the number of books to download. Default: None
        sequence_number (Optional[int]): Download by global sequence number. Default: None
        book_range (Optional[str]): Download by book range (e.g., "200-250"). Default: None
        book_id (Optional[str]): Download by specific book ID (UUID). Default: None

    Returns:
        None. A failure to create the output directory is printed and ends the
        call early instead of being raised.

    Example:
        # Download by sequence number
        pdf_download(sequence_number=2548)

        # Download by range
        pdf_download(book_range="1-10")

        # Legacy mode - download first 5 books from catalog 0
        pdf_download(download_limit=5)
    """
    print("🚀 Starting textbook download...")

    # HTTP headers handed to every mode handler.
    headers = {
        'Referer': 'https://basic.smartedu.cn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Origin': 'https://basic.smartedu.cn',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }

    # Create the output directory. exist_ok avoids the check-then-create race
    # and also reports the case where the path exists but is not a directory.
    work_path = os.path.join(dir_path, "textbook_download")
    try:
        os.makedirs(work_path, exist_ok=True)
    except OSError as e:
        print(f"❌ Failed to create output directory: {e}")
        return

    # Mode 1: download by book ID (UUID)
    if book_id:
        _download_by_book_id(book_id, headers, work_path)
        return

    # Mode 2: download by sequence number.
    # Compared against None so that an explicit 0 is still routed to the
    # sequence handler instead of silently falling through to another mode.
    if sequence_number is not None:
        _download_by_sequence_number(sequence_number, headers, work_path)
        return

    # Mode 3: download by book range
    if book_range:
        _download_by_book_range(book_range, headers, work_path)
        return

    # Legacy modes (original functionality)
    if single_book or download_limit or table > 0 or item > 0:
        _download_legacy_mode(table, item, single_book, download_limit, headers, work_path)
        print("📁 Check your Downloads/textbook_download folder")
        return

    print("❌ No download mode specified. Use --help to see available options.")
386 | 
387 | 
def _download_by_book_id(book_id: str, headers: Dict[str, str], work_path: str) -> None:
    """
    Download a textbook by its unique book ID (UUID).

    The book's metadata JSON is consulted only to obtain a readable title for
    the filename. If that lookup fails (network error, non-OK status, invalid
    JSON, missing or empty title) the download still goes ahead under the
    fallback name ``Book_<book_id>``.

    All errors are reported on stdout; nothing is raised to the caller.

    Args:
        book_id (str): The unique identifier of the textbook
        headers (Dict[str, str]): HTTP headers passed to the PDF download
        work_path (str): Directory where the PDF should be saved
    """
    print(f"🔍 Downloading by book ID: {book_id}")

    try:
        # Get PDF URLs for this book
        pdf_urls = get_pdf_url(book_id)
        if not pdf_urls:
            print(f"❌ Could not get PDF URLs for book ID: {book_id}")
            return

        # Fetch book title from metadata for a better filename. This step is
        # cosmetic, so its failures must not cancel the download itself.
        book_title = f'Book_{book_id}'
        json_url = f"https://s-file-1.ykt.cbern.com.cn/zxx/ndrv2/resources/tch_material/details/{book_id}.json"
        try:
            response = requests.get(json_url, timeout=30)
            if response.ok:
                data = response.json()
                # Ignore non-object payloads and null/empty titles.
                if isinstance(data, dict) and data.get('title'):
                    book_title = data['title']
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decoding errors raised by response.json().
            print(f"    ⚠️ Could not read book metadata, using default filename: {e}")

        # Attempt download with CDN fallback
        if download_pdf_with_cdn_fallback(pdf_urls, book_title, headers, work_path):
            print(f"    ✅ Successfully downloaded book ID: {book_id}")
        else:
            print(f"❌ Failed to download book ID: {book_id}")
    except Exception as e:
        print(f"❌ Error processing book ID {book_id}: {str(e)}")
423 | 
424 | 
def _download_by_sequence_number(sequence_number: int, headers: Dict[str, str], work_path: str) -> None:
    """
    Download a textbook by its global sequence number.

    The catalog URL list is fetched with ``get_parts`` and the book is located
    with ``get_book_by_sequence_number``. The saved filename is the book title
    prefixed with the first tag whose name contains '版' (used here as the
    publisher label), or the bare title when no such tag exists.

    All errors are reported on stdout; nothing is raised to the caller.

    Args:
        sequence_number (int): Global sequence number of the textbook
        headers (Dict[str, str]): HTTP headers for the request
        work_path (str): Directory where the PDF should be saved
    """
    print(f"🔍 Downloading by sequence number: {sequence_number}")

    # Get textbook catalog
    print("📚 Getting textbook catalog...")
    try:
        catalog_urls = get_parts()
    except Exception as e:
        print(f"❌ Failed to get catalog: {e}")
        return

    # Find the book in the catalog
    book_info, catalog_index, catalog_position = get_book_by_sequence_number(catalog_urls, sequence_number)
    if not book_info:
        print(f"❌ Sequence number {sequence_number} not found in catalog")
        return

    print(f"📖 Found book: {book_info.get('title', 'Unknown')}")
    print(f"📍 Catalog: {catalog_index + 1}, Position: {catalog_position + 1}")

    # A malformed catalog entry or a failing helper is reported the same way
    # the other download modes report errors, instead of ending in a traceback.
    try:
        pdf_urls = get_pdf_url(book_info['id'])
        if not pdf_urls:
            print(f"❌ Could not get PDF URLs for sequence number: {sequence_number}")
            return

        # Extract publisher information for filename; tolerate entries whose
        # tag list or tag names are missing or null.
        tag_names = ((tag.get('tag_name') or '') for tag in (book_info.get('tag_list') or []))
        publisher = next((name for name in tag_names if '版' in name), '')
        book_name = f"{publisher}{book_info['title']}"

        if download_pdf_with_cdn_fallback(pdf_urls, book_name, headers, work_path):
            print(f"    ✅ Successfully downloaded sequence number: {sequence_number}")
        else:
            print(f"❌ Failed to download sequence number: {sequence_number}")
    except Exception as e:
        print(f"❌ Error processing sequence number {sequence_number}: {str(e)}")
468 | 
469 | 
def _download_by_book_range(book_range: str, headers: Dict[str, str], work_path: str) -> None:
    """
    Download multiple textbooks within a specified range of sequence numbers.

    ``book_range`` is either ``"start-end"`` or a single number; a reversed
    range (start > end) is swapped. Every sequence number in the inclusive
    range is processed independently: a failure on one book is recorded and
    the loop moves on to the next. A summary of successes and failures is
    printed at the end.

    Args:
        book_range (str): Range specification (e.g., "200-250" or "200")
        headers (Dict[str, str]): HTTP headers for the request
        work_path (str): Directory where the PDFs should be saved
    """
    print(f"🔍 Downloading by book range: {book_range}")

    # Parse the range specification. Only the parsing is guarded by the
    # ValueError handler, so that a ValueError raised later while downloading
    # (e.g. a JSON decoding error) is never misreported as a bad range.
    try:
        if '-' in book_range:
            start, end = map(int, book_range.split('-'))
            if start > end:
                start, end = end, start  # Swap if start > end
        else:
            start = end = int(book_range)  # Single book
    except ValueError:
        print(f"❌ Invalid range format: {book_range}. Use format like '200-250' or '200'")
        return

    print(f"📚 Downloading books from sequence {start} to {end}")

    # Get textbook catalog
    print("📚 Getting textbook catalog...")
    try:
        catalog_urls = get_parts()
    except Exception as e:
        print(f"❌ Failed to get catalog: {e}")
        return

    total_processed = 0
    failed_books = []  # (sequence number, title, reason) for the summary

    # Process each book in the range.
    # NOTE(review): the lookup helper is called once per book and appears to
    # fetch catalogs on every call - consider caching if ranges are large.
    for seq_num in range(start, end + 1):
        print(f"\n📖 Processing sequence number: {seq_num}")

        title = "Unknown"  # best label known so far, used in failure records
        try:
            book_info, catalog_index, catalog_position = get_book_by_sequence_number(catalog_urls, seq_num)
            if not book_info:
                print(f"⚠️ Sequence number {seq_num} not found")
                failed_books.append((seq_num, title, "Not found"))
            else:
                title = book_info.get('title', 'Unknown')
                print(f"    📍 Found in catalog {catalog_index + 1}, position {catalog_position + 1}")

                pdf_urls = get_pdf_url(book_info['id'])
                if not pdf_urls:
                    print(f"⚠️ Could not get PDF URLs for sequence number: {seq_num}")
                    failed_books.append((seq_num, title, "No PDF URLs"))
                else:
                    # Publisher prefix: first tag whose name contains '版';
                    # missing/null tag data simply yields no prefix.
                    tag_names = ((tag.get('tag_name') or '') for tag in (book_info.get('tag_list') or []))
                    publisher = next((name for name in tag_names if '版' in name), '')
                    book_name = f"{publisher}{book_info['title']}"
                    title = book_name

                    if download_pdf_with_cdn_fallback(pdf_urls, book_name, headers, work_path):
                        total_processed += 1
                        print(f"    ✅ Successfully downloaded sequence number: {seq_num}")
                    else:
                        failed_books.append((seq_num, book_name, "Download failed"))
        except Exception as e:
            # Keep going: one broken entry must not cancel the rest of the range.
            print(f"❌ Error processing sequence number {seq_num}: {str(e)}")
            failed_books.append((seq_num, title, f"Error: {e}"))

        # Small delay between books to be respectful to the server
        time.sleep(1)

    # Summary report
    print(f"\n🎉 Range download complete! Successfully processed {total_processed} textbooks")

    if failed_books:
        print(f"\n⚠️ Failed {len(failed_books)} books:")
        for seq_num, title, reason in failed_books:
            print(f"   • Sequence {seq_num}: {title} - {reason}")
544 | 
545 | 
def _download_legacy_mode(table: int, item: int, single_book: Optional[int],
                         download_limit: Optional[int], headers: Dict[str, str], work_path: str) -> None:
    """
    Legacy download mode: walk the catalogs in order and download their books.

    Catalogs are processed starting at index ``table``. Within the first
    processed catalog, iteration starts at ``item``; every following catalog
    is processed from its first entry. A running book counter starts at
    ``item`` and is incremented for every entry visited; ``single_book`` is
    matched against that counter.

    Args:
        table (int): Starting catalog index
        item (int): Starting item index within the first processed catalog
        single_book (Optional[int]): Counter value of the only book to
            download; the function returns right after handling it
        download_limit (Optional[int]): Maximum number of successful
            downloads; only successful downloads count towards it
        headers (Dict[str, str]): HTTP headers for the request
        work_path (str): Directory where the PDFs should be saved
    """
    print("📚 Using legacy download mode...")

    try:
        catalog_urls = get_parts()
    except Exception as e:
        print(f"❌ Failed to get catalog URLs: {e}")
        return

    total_processed = 0
    book_counter = item

    # Process each catalog; t is the absolute catalog index used in messages.
    for t, ref in enumerate(catalog_urls[table:], start=table):
        print(f"正在下载目录{t+1}/{len(catalog_urls)}中的电子教材")

        try:
            response = requests.get(ref, headers=headers, timeout=30)
            response.raise_for_status()
            info = json.loads(response.text)

            for book in info[item:]:
                book_counter += 1

                # Skip if not the requested single book
                if single_book is not None and book_counter != single_book:
                    continue

                # Check if we've reached the download limit
                if download_limit is not None and total_processed >= download_limit:
                    print(f"已达到下载限制 ({download_limit} 本教材)")
                    return

                # Bound before the try so the error message below never hits
                # an unbound name or reuses the previous book's name.
                book_name = "Unknown"
                try:
                    book_id = book['id']
                    publisher = next((tag['tag_name'] for tag in book['tag_list'] if '版' in tag['tag_name']), '')
                    book_name = f"{publisher}{book['title']}"

                    print(f"📖 Processing: {book_name}")

                    # Get the PDF URLs with CDN fallback
                    pdf_urls = get_pdf_url(book_id)

                    if pdf_urls:
                        if download_pdf_with_cdn_fallback(pdf_urls, book_name, headers, work_path):
                            print(f"    ✅ Successfully downloaded: {book_name}")
                            total_processed += 1
                        else:
                            print(f"❌ Failed to download: {book_name}")
                    else:
                        print(f"❌ Could not get PDF URLs for {book_name}")

                except Exception as e:
                    print(f"❌ Error processing {book_name}: {str(e)}")

                # Reaching this point in single-book mode means the requested
                # book was just handled, so there is nothing left to do.
                if single_book is not None:
                    return

        except requests.RequestException as e:
            print(f"❌ Network error processing catalog {t+1}: {e}")
        except json.JSONDecodeError as e:
            print(f"❌ Invalid JSON in catalog {t+1}: {e}")
        except Exception as e:
            print(f"❌ Unexpected error processing catalog {t+1}: {e}")

        # The item offset applies to the first processed catalog only.
        item = 0

    print(f"\n🎉 Download complete! Processed {total_processed} textbooks")
637 | 
638 | 
if __name__ == "__main__":
    # Text shown at the top of --help, before the option list.
    cli_description = '''
Enhanced Textbook Downloader from the National Smart Education Platform (国家中小学智慧教育平台)

This enhanced script downloads complete PDF textbooks with CDN fallback logic, multiple download modes,
and improved error handling. It now supports downloading by sequence number, book range, and book ID.

The downloaded PDFs will be saved to:
~/Downloads/textbook_download/

FEATURES:
- CDN Fallback: Automatically tries r1, r2, r3 endpoints if one fails
- Multiple Download Modes: Flexible options for different use cases
- Enhanced Error Handling: Detailed error messages and graceful fallbacks
- Progress Tracking: Real-time download status and file size information
- Robust Network Handling: Timeouts, retries, and connection error handling
        '''

    # Text shown at the bottom of --help, after the option list.
    cli_epilog = '''
DOWNLOAD MODES:
==============

1. BY SEQUENCE NUMBER (--sequence):
   Downloads a specific book by its global sequence number across all catalogs.
   Example: --sequence 2548 (downloads the 2548th book across all catalogs)
   
   This is useful when you know the exact position of a book in the entire collection.
   The script automatically calculates which catalog and position contains the book.

2. BY BOOK RANGE (--range):
   Downloads multiple books within a specified range.
   Example: --range "200-250" (downloads books from sequence 200 to 250)
   
   Range format: "start-end" or just "start" for a single book.
   The script will process each book in the range and provide a summary report.

3. BY BOOK ID (--book-id):
   Downloads a specific book by its unique identifier (UUID).
   Example: --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e"
   
   This is useful when you have the exact book ID from the metadata.

4. LEGACY MODES:
   - --single N: Download only the Nth textbook from the catalog
   - --limit N: Download only N textbooks (starting from the beginning)
   - --table N: Start from catalog N (0-based indexing)
   - --item N: Start from item N within the catalog (0-based indexing)

EXAMPLES:
=========

# Download by sequence number
python pdf_book_download_from_zxxeducn.py --sequence 2548

# Download a range of books
python pdf_book_download_from_zxxeducn.py --range "1-5"

# Download by book ID
python pdf_book_download_from_zxxeducn.py --book-id "bdc00134-465d-454b-a541-dcd0cec4d86e"

# Legacy single book download
python pdf_book_download_from_zxxeducn.py --single 1

# Legacy limited download
python pdf_book_download_from_zxxeducn.py --limit 10

# Resume interrupted download
python pdf_book_download_from_zxxeducn.py --table 1 --item 5

TECHNICAL DETAILS:
==================

CDN Endpoints: The script automatically tries r1-ndr-oversea, r2-ndr-oversea, and r3-ndr-oversea
               in sequence if one fails, ensuring reliable downloads.

File Validation: Downloads are validated to ensure they are actual PDF files (>1MB) and not error pages.

Error Handling: Comprehensive error handling with detailed messages and graceful fallbacks.

Network Timeouts: 30-second timeout for all network requests to prevent hanging.

Output Directory: All downloads are saved to ~/Downloads/textbook_download/ with descriptive filenames.
        '''

    # RawDescriptionHelpFormatter keeps the line breaks of the texts above.
    parser = argparse.ArgumentParser(
        epilog=cli_epilog,
        description=cli_description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Option table: (flag, add_argument keyword options). The order here is
    # the order in which the options are registered and listed by --help.
    cli_options = [
        # New download modes
        ('--sequence', {
            'type': int,
            'help': 'Download by global sequence number (e.g., 2548 for the 2548th book across all catalogs)',
        }),
        ('--range', {
            'type': str,
            'help': 'Download by book range (e.g., "200-250" for books 200 to 250, or "200" for single book)',
        }),
        ('--book-id', {
            'type': str,
            'help': 'Download by specific book ID (UUID format, e.g., "bdc00134-465d-454b-a541-dcd0cec4d86e")',
        }),
        # Legacy modes
        ('--single', {
            'type': int,
            'help': 'Download only one specific book number from the catalog (legacy mode)',
        }),
        ('--limit', {
            'type': int,
            'help': 'Limit the number of books to download in this run (legacy mode)',
        }),
        ('--table', {
            'type': int,
            'default': 0,
            'help': 'Start from specific catalog index (0-based, legacy mode). Range: 0-3',
        }),
        ('--item', {
            'type': int,
            'default': 0,
            'help': 'Start from specific item index within the catalog (0-based, legacy mode). Range: 0-999',
        }),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    # Parse the command line and forward every option to the downloader.
    args = parser.parse_args()
    pdf_download(
        book_id=args.book_id,
        sequence_number=args.sequence,
        book_range=args.range,
        single_book=args.single,
        download_limit=args.limit,
        table=args.table,
        item=args.item,
    )


--------------------------------------------------------------------------------