├── .gitignore ├── README.md ├── bookid.csv ├── getbookid.py ├── main.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | .vscode 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MarxismCrawler 2 | 从中国共产党思想理论资源数据库 http://data.lilun.cn/ 爬取马克思主义电子书籍 3 | 4 | # 安装依赖 5 | 6 | python版本3.7 7 | 8 | ```python 9 | pip install -r requirements.txt 10 | ``` 11 | 12 | # 使用示例 13 | 14 | 爬取bookid分别为B_01019144_001、B_01019282_001、B_01019283_001的三本书,会在当前目录创建三个文件夹,文件夹里是每一页的jpg图片 15 | 16 | 并且会在下载图片完毕后,自动在当前目录生成由jpg图片合成的pdf文档,pdf文档由bookid命名 17 | 18 | ```python 19 | python main.py B_01019144_001 B_01019282_001 B_01019283_001 20 | ``` 21 | 22 | # 使用建议 23 | 24 | ## 获取bookid 25 | 26 | 打开 http://data.lilun.cn/ 首页,搜索想要的书籍,搜到后,点击“在线阅读”,页面会显示未登录状态,但网页URL中会有bookid信息,示例如下 27 | 28 | ``` 29 | http://data.lilun.cn/Service/?logic=PDFReaderController&call=readPDF&bookid=B_01018144_001&page=1&html=selectText_NOINC&from=online&searchChar=undefined 30 | ``` 31 | 32 | 目前爬取了一部分书名与bookid的对应关系,保存在了bookid.csv中,有需要可自行检索或批量下载 33 | 34 | -------------------------------------------------------------------------------- /getbookid.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import os 4 | import time 5 | 6 | archiveMap = { 7 | '习近平新时代中国特色社会主义思想库': {'id':769,'size':185}, 8 | '马克思主义著作库': {'id':313,'size':1834}, 9 | '党和国家主要领导人著作库': {'id':314,'size':778}, 10 | '党和国家重要文献库': 315, 11 | '经典著作选编和重要论述摘编库': 327, 12 | '党的思想理论研究成果库': 415, 13 | '中国特色社会主义建设库': 416, 14 | '法律法规库': 419, 15 | '中共党史库': 422, 16 | '党的建设库': 423, 17 | '革命时期出版图书库': {'id': 424, 'size': 1651}, 18 | '国际共运资料库': {'id': 425, 'size': 575}, 19 | '历史知识库': {'id': 426, 'size': 3836}, 20 | '哲学知识库': {'id':427, 'size': 1716}, 21 | } 22 | 23 | def getBookInfo(archiveID: int, size: int) -> list: 24 | response = requests.post(url='http://data.lilun.cn/Service/?logic=bookController&call=getBookByColumn', 25 | data='specialid={}&columnId={}&page=0&pageSize={}'.format(archiveID,archiveID,size), 26 | headers={'Content-Type': 'application/x-www-form-urlencoded'}) 27 | bookInfoList = json.loads(response.content).get('result').get('bookinfo',[]) 28 | return bookInfoList 29 | 30 | if __name__ == "__main__": 31 | bookInfoList = [] 32 | bookInfoList.extend(getBookInfo(archiveMap['国际共运资料库']['id'],archiveMap['国际共运资料库']['size'])) 33 | bookInfoList.extend(getBookInfo(archiveMap['哲学知识库']['id'],archiveMap['哲学知识库']['size'])) 34 | bookInfoList.extend(getBookInfo(archiveMap['历史知识库']['id'],archiveMap['历史知识库']['size'])) 35 | bookInfoList.extend(getBookInfo(archiveMap['马克思主义著作库']['id'],archiveMap['马克思主义著作库']['size'])) 36 | with open('bookid.csv','a',encoding='utf-8') as fp: 37 | for info in bookInfoList: 38 | record = '{},{},{},{},{},{}\n'.format(info.get('isbn','unknown'),info['bookid'],info['bookname'],info.get('author','unknown'),info.get('publish','unknown'),info.get('releaseDate','unknown')) 39 | fp.write(record) 40 | fp.close() -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from requests.utils import quote 4 | import threading 5 | import os 6 | from retrying import retry 7 | import argparse 8 | 9 | import glob 10 | import re 11 | 12 | from reportlab.lib import utils 13 | from reportlab.pdfgen import canvas 14 | 15 | #---------------------------------------------------------------------- 16 | def sorted_nicely( l ): 17 | """ 18 | # http://stackoverflow.com/questions/2669059/how-to-sort-alpha-numeric-set-in-python 19 | 20 | Sort the given iterable in the way that humans expect. 21 | """ 22 | convert = lambda text: int(text) if text.isdigit() else text 23 | alphanum_key = lambda key: [ convert(c) for c in re.split('([0-9]+)', key) ] 24 | return sorted(l, key = alphanum_key) 25 | 26 | #---------------------------------------------------------------------- 27 | def unite_pictures_into_pdf(outputPdfName, pathToSavePdfTo, pathToPictures, splitType, numberOfEntitiesInOnePdf, listWithImagesExtensions, picturesAreInRootFolder, nameOfPart): 28 | 29 | if numberOfEntitiesInOnePdf < 1: 30 | print("Wrong value of numberOfEntitiesInOnePdf.") 31 | return 32 | if len(listWithImagesExtensions) == 0: 33 | print("listWithImagesExtensions is empty.") 34 | return 35 | 36 | if picturesAreInRootFolder == False: 37 | foldersInsideFolderWithPictures = sorted_nicely(glob.glob(pathToPictures + "\\*\\")) 38 | # print(foldersInsideFolderWithPictures) 39 | if len(foldersInsideFolderWithPictures) != 0: 40 | picturesPathsForEachFolder = [] 41 | for iFolder in foldersInsideFolderWithPictures: 42 | picturePathsInFolder = [] 43 | for jExtension in listWithImagesExtensions: 44 | picturePathsInFolder.extend(glob.glob(iFolder + "*." + jExtension)) 45 | picturesPathsForEachFolder.append(sorted_nicely(picturePathsInFolder)) 46 | if splitType == "folder": 47 | numberOfFoldersAdded = 0; 48 | for iFolder in picturesPathsForEachFolder: 49 | if (numberOfFoldersAdded % numberOfEntitiesInOnePdf) == 0: 50 | endNumber = numberOfFoldersAdded + numberOfEntitiesInOnePdf 51 | if endNumber > len(picturesPathsForEachFolder): 52 | endNumber = len(picturesPathsForEachFolder) 53 | filename = [] 54 | if numberOfEntitiesInOnePdf > 1: 55 | filename = os.path.join(pathToSavePdfTo, outputPdfName + "_" + nameOfPart + "_" + str(numberOfFoldersAdded + 1) + '-' + str(endNumber) + "_of_" + str(len(picturesPathsForEachFolder)) + ".pdf") 56 | elif numberOfEntitiesInOnePdf == 1: 57 | filename = os.path.join(pathToSavePdfTo, outputPdfName + "_" + nameOfPart + "_" + str(numberOfFoldersAdded + 1) + "_of_" + str(len(picturesPathsForEachFolder)) + ".pdf") 58 | c = canvas.Canvas(filename) 59 | for jPicture in iFolder: 60 | img = utils.ImageReader(jPicture) 61 | imagesize = img.getSize() 62 | c.setPageSize(imagesize) 63 | c.drawImage(jPicture, 0, 0) 64 | c.showPage() 65 | numberOfFoldersAdded += 1 66 | if (numberOfFoldersAdded % numberOfEntitiesInOnePdf) == 0: 67 | c.save() 68 | print("created", filename) 69 | if (numberOfFoldersAdded % numberOfEntitiesInOnePdf) != 0: 70 | c.save() 71 | print("created", filename) 72 | elif splitType == "picture": 73 | numberOfPicturesAdded = 0; 74 | totalNumberOfPictures = 0; 75 | for iFolder in picturesPathsForEachFolder: 76 | totalNumberOfPictures += len(iFolder) 77 | for iFolder in picturesPathsForEachFolder: 78 | for jPicture in iFolder: 79 | if (numberOfPicturesAdded % numberOfEntitiesInOnePdf) == 0: 80 | endNumber = numberOfPicturesAdded + numberOfEntitiesInOnePdf 81 | if endNumber > totalNumberOfPictures: 82 | endNumber = totalNumberOfPictures 83 | filename = [] 84 | if numberOfEntitiesInOnePdf > 1: 85 | filename = os.path.join(pathToSavePdfTo, outputPdfName + "_" + nameOfPart + "_" + str(numberOfPicturesAdded + 1) + '-' + str(endNumber) + "_of_" + str(totalNumberOfPictures) + ".pdf") 86 | elif numberOfEntitiesInOnePdf == 1: 87 | filename = os.path.join(pathToSavePdfTo, outputPdfName + "_" + nameOfPart + "_" + str(numberOfPicturesAdded + 1) + "_of_" + str(totalNumberOfPictures) + ".pdf") 88 | c = canvas.Canvas(filename) 89 | img = utils.ImageReader(jPicture) 90 | imagesize = img.getSize() 91 | c.setPageSize(imagesize) 92 | c.drawImage(jPicture, 0, 0) 93 | c.showPage() 94 | numberOfPicturesAdded += 1 95 | if (numberOfPicturesAdded % numberOfEntitiesInOnePdf) == 0: 96 | c.save() 97 | print("created", filename) 98 | if (numberOfPicturesAdded % numberOfEntitiesInOnePdf) != 0: 99 | c.save() 100 | print("created", filename) 101 | elif splitType == "none": 102 | filename = os.path.join(pathToSavePdfTo, outputPdfName + ".pdf") 103 | c = canvas.Canvas(filename) 104 | for iFolder in picturesPathsForEachFolder: 105 | for jPicture in iFolder: 106 | img = utils.ImageReader(jPicture) 107 | imagesize = img.getSize() 108 | c.setPageSize(imagesize) 109 | c.drawImage(jPicture, 0, 0) 110 | c.showPage() 111 | c.save() 112 | print("created", filename) 113 | else: 114 | print("Wrong splitType value") 115 | else: 116 | print("No pictures found.") 117 | return 118 | 119 | if picturesAreInRootFolder == True: 120 | picturesInsideFolderWithPictures = [] 121 | for iExtension in listWithImagesExtensions: 122 | picturesInsideFolderWithPictures.extend(glob.glob(pathToPictures + "\\*." + iExtension)) 123 | picturesInsideFolderWithPictures = sorted_nicely(picturesInsideFolderWithPictures) 124 | if len(picturesInsideFolderWithPictures) != 0: 125 | if splitType == "picture": 126 | numberOfPicturesAdded = 0 127 | totalNumberOfPictures = len(picturesInsideFolderWithPictures) 128 | for iPicture in picturesInsideFolderWithPictures: 129 | if (numberOfPicturesAdded % numberOfEntitiesInOnePdf) == 0: 130 | endNumber = numberOfPicturesAdded + numberOfEntitiesInOnePdf 131 | if endNumber > totalNumberOfPictures: 132 | endNumber = totalNumberOfPictures 133 | filename = [] 134 | if numberOfEntitiesInOnePdf > 1: 135 | filename = os.path.join(pathToSavePdfTo, outputPdfName + "_" + nameOfPart + "_" + str(numberOfPicturesAdded + 1) + '-' + str(endNumber) + "_of_" + str(totalNumberOfPictures) + ".pdf") 136 | elif numberOfEntitiesInOnePdf == 1: 137 | filename = os.path.join(pathToSavePdfTo, outputPdfName + "_" + nameOfPart + "_" + str(numberOfPicturesAdded + 1) + "_of_" + str(totalNumberOfPictures) + ".pdf") 138 | c = canvas.Canvas(filename) 139 | img = utils.ImageReader(iPicture) 140 | imagesize = img.getSize() 141 | c.setPageSize(imagesize) 142 | c.drawImage(iPicture, 0, 0) 143 | c.showPage() 144 | numberOfPicturesAdded += 1 145 | if (numberOfPicturesAdded % numberOfEntitiesInOnePdf) == 0: 146 | c.save() 147 | print("created", filename) 148 | if (numberOfPicturesAdded % numberOfEntitiesInOnePdf) != 0: 149 | c.save() 150 | print("created", filename) 151 | elif splitType == "none": 152 | filename = os.path.join(pathToSavePdfTo, outputPdfName + ".pdf") 153 | c = canvas.Canvas(filename) 154 | for iPicture in picturesInsideFolderWithPictures: 155 | try: 156 | img = utils.ImageReader(iPicture) 157 | 158 | imagesize = img.getSize() 159 | c.setPageSize(imagesize) 160 | c.drawImage(iPicture, 0, 0) 161 | c.showPage() 162 | except Exception as e: 163 | print(e) 164 | c.save() 165 | print("created", filename) 166 | else: 167 | print("Wrong splitType value") 168 | else: 169 | print("No pictures found.") 170 | return 171 | 172 | 173 | 174 | h = { 175 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", 176 | "Accept-Encoding": "gzip, deflate", 177 | "Accept-Language": "zh-CN,zh;q=0.9", 178 | "Cache-Control": "max-age=0", 179 | "Connection": "keep-alive", 180 | "Cookie": "bagid=undefined; isLogin=true; JSESSIONID=rj0vxa4cg5pc1utzs6xzfy1d1", 181 | "Host": "data.lilun.cn", 182 | "Referer": "http://data.lilun.cn/", 183 | "Upgrade-Insecure-Requests": "1", 184 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36" 185 | } 186 | 187 | @retry 188 | def getImgUrl(bookid: str, page: int) -> str : 189 | #print("getting ",bookid,page) 190 | url = 'http://data.lilun.cn/Service/?logic=PDFReaderController&call=createEncryptFileUrl&bookid={}&page={}' 191 | response = requests.get(url.format(bookid,page), headers = h) 192 | #print(response.json()) 193 | return response.json().get('result',{'imgurl': 'undefine'}).get('imgurl') 194 | 195 | @retry 196 | def getImgBuf(bookid: str, page: int) -> bytearray : 197 | imgurl = getImgUrl(bookid,page) 198 | #print(imgurl) 199 | url = 'http://data.lilun.cn/Service/?logic=PDFReaderController&call=ReadImg&imgurl={}'.format(quote(imgurl)) 200 | #print("imgurl:",url) 201 | #url = 'http://data.lilun.cn/Service/?logic=PDFReaderController&call=ReadImg&imgurl=Ql8wMTAyMDEyMF8wMDEoMSkuanBn' 202 | response = requests.get(url, headers = h) 203 | #print(response.content) 204 | return response.content 205 | 206 | def saveImgAs(imgBuf: bytearray, filename: str): 207 | with open(filename,"wb") as fb: 208 | #print("saving img",filename) 209 | fb.write(imgBuf) 210 | fb.close() 211 | 212 | def getBook(bookid: str) -> dict: 213 | print('========正在爬取{}========'.format(bookid)) 214 | if not os.path.exists(bookid): 215 | os.makedirs(bookid) 216 | for page in range(1,3000): 217 | try: 218 | if os.path.exists("{}/{}.jpg".format(bookid,str(page).zfill(3))): 219 | print('第{}页已存在'.format(page)) 220 | continue 221 | imgBuf = getImgBuf(bookid,page) 222 | if len(imgBuf) == 0: 223 | print("break out!") 224 | break 225 | print('下载第{}页'.format(page)) 226 | saveImgAs(imgBuf,"{}/{}.jpg".format(bookid,str(page).zfill(3))) 227 | except Exception as e: 228 | print(e) 229 | return {'status': False,'bookid': bookid,'failedAt': page} 230 | return {'status': True, 'bookid': bookid} 231 | 232 | def convert_to_pdf(bookid: str): 233 | print('========开始合成pdf========') 234 | outputPdfName = "{}".format(bookid) 235 | pathToSavePdfTo = "." 236 | pathToPictures = "{}".format(bookid) 237 | splitType = "none" 238 | numberOfEntitiesInOnePdf = 1 239 | listWithImagesExtensions = ["jpg"] 240 | picturesAreInRootFolder = True 241 | nameOfPart = "volume" 242 | unite_pictures_into_pdf(outputPdfName, pathToSavePdfTo, pathToPictures, splitType, numberOfEntitiesInOnePdf, listWithImagesExtensions, picturesAreInRootFolder, nameOfPart) 243 | print('pdf合成完毕!') 244 | 245 | if __name__ == "__main__": 246 | # bookidgen = lambda n: 'B_01018{}_001'.format(136 + n) # 列宁全集 第 n 卷 247 | # # bookid = 'B_01018865_001' # 马克思主义政治经济学人物谱系 248 | # for n in range(3,60): 249 | # getBook(bookidgen(n)) 250 | ''' 251 | bookid=B_01019144_001 252 | bookid=B_01019282_001 253 | bookid=B_01019283_001 254 | ''' 255 | parser = argparse.ArgumentParser(description='输入若干bookid,从中国共产党思想理论资源数据库 http://data.lilun.cn/ 爬取马克思主义电子书籍') 256 | parser.add_argument('booklist', type=str, nargs='+', 257 | help='bookid的列表') 258 | args = parser.parse_args() 259 | print('bookid的列表为:',args.booklist) 260 | resultlist = map(getBook,args.booklist) 261 | for res in resultlist: 262 | if res['status']: 263 | print('爬取{}成功!'.format(res['bookid'])) 264 | convert_to_pdf(res['bookid']) 265 | else: 266 | print('爬取{}的第{}页时出错!中断爬取'.format(res['bookid'],res['failedAt'])) 267 | 268 | 269 | 270 | 271 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.21.0 2 | retrying==1.3.3 3 | reportlab==3.5.23 4 | --------------------------------------------------------------------------------