├── .gitignore ├── EopCrawler.py ├── EopScorePageItem.py ├── LICENSE └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | /.idea 103 | /__pycache__ 104 | -------------------------------------------------------------------------------- /EopCrawler.py: 
import codecs
import gzip
import http.cookiejar
import os.path
import time
import urllib.parse
import urllib.request


class EopCrawler(object):
    """Crawler for the score listings on everyonepiano.cn.

    Typical use: fetch an index page with ``getHtml``, turn it into
    ``EopPageItem`` objects with ``getPageItems``, resolve the image URLs of
    each item with ``getImgUrls`` and store them with ``doDownLoadImgs``.
    """

    # Paginated index sorted by id, descending; the page number is appended.
    UrlPage = "http://www.everyonepiano.cn/Music.html?canshu=id&paixu=desc&p="
    # Prefix put in front of the site-relative links found in the pages.
    UrlHome = "http://www.everyonepiano.cn"
    # Opener shared by all requests; built on first use.
    op = None

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Only gzip is advertised because ungzip() is the only decoder here.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        # Was 'accounts.pixiv.net', a host unrelated to the crawled site.
        'Host': 'www.everyonepiano.cn',
        'Referer': 'http://www.everyonepiano.cn/Music.html?paixu=desc',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/50.0.2661.102 Safari/537.36'
    }

    def ungzip(self, data):
        """Return *data* gunzipped, or unchanged when it is not gzip data.

        Payloads that carry the gzip magic number but fail to decompress are
        reported on stdout and returned as received (best effort).
        """
        if data[:2] != b'\x1f\x8b':
            return data
        try:
            return gzip.decompress(data)
        except Exception as e:
            print(e)
            return data

    def getopener(self, header):
        """Build a cookie-aware opener that sends *header* with each request.

        :param header: mapping of header name to header value.
        :return: a ``urllib.request.OpenerDirector``.
        """
        cookies = http.cookiejar.CookieJar()
        opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(cookies))
        opener.addheaders = list(header.items())
        return opener

    def getHtml(self, url):
        """Fetch *url* and return its body decoded as UTF-8.

        The page is requested once. ``None`` is returned when the response
        status is not 200; exceptions raised by the opener propagate.
        """
        if self.op is None:
            self.op = self.getopener(self.headers)
        with self.op.open(url) as f:
            if f.status != 200:
                return None
            data = f.read()
        return self.ungzip(data).decode('utf-8')

    def _soup(self, html):
        """Parse *html* with BeautifulSoup using the lxml parser."""
        # Imported at call time so the networking helpers of this module
        # stay importable when bs4 is not installed.
        from bs4 import BeautifulSoup
        return BeautifulSoup(html, 'lxml')

    def getPageItems(self, html):
        """Extract one ``EopPageItem`` per ``div.MusicIndexBox`` in *html*."""
        from EopScorePageItem import EopPageItem

        items = []
        for box in self._soup(html).select('div.MusicIndexBox'):
            links = box.select('div.MITitle > a')
            strid = str(box.select('div.MITitle > div')[0].string)
            author = str(links[1].string)
            # The link text ends with "-<author>"; keep only the title part.
            title = str(links[0].string).replace("-" + author, '')
            url = self.UrlHome + links[0]['href']
            date = box.select('div.MIMusicUpdate')[0].string
            items.append(EopPageItem(strid, url, date, title, author))
        return items

    def _scrapeImgUrls(self, pageUrl):
        """Return the absolute image URLs listed under ``div.PngDiv`` on a page."""
        html = self.getHtml(pageUrl)
        if html is None:
            return []
        return [self.UrlHome + li.select('img')[0]['src']
                for li in self._soup(html).select('div.PngDiv > ul > li')]

    def getImgUrls(self, item):
        """Fill ``item.staveImgs`` and ``item.numberImgs`` with image URLs.

        The staff-notation page (``item.staveUrl``) and the numbered-notation
        page (``item.numberUrl``) are scraped in that order.

        :return: the same *item*, for chaining.
        """
        item.staveImgs.extend(self._scrapeImgUrls(item.staveUrl))
        item.numberImgs.extend(self._scrapeImgUrls(item.numberUrl))
        return item

    def _logError(self, logDir, err, context):
        """Append the arguments of *err* followed by *context* to log.txt in *logDir*."""
        # str() on every argument: exception args may be numbers, nested
        # exceptions or None, none of which can be concatenated to a str.
        errinfo = ''.join(' ' + str(arg) for arg in err.args) + context
        with codecs.open(os.path.join(logDir, "log.txt"), "a", "UTF-8") as f:
            f.write(errinfo + "\r\n")

    def _downloadImgs(self, urls, path, prefix, parentdir, tag, delay=0.0):
        """Save every URL in *urls* as ``<prefix><NNN>.jpg`` inside *path*.

        Responses whose status is not 200 are skipped. A failure is logged
        with *tag* and the target path, and the remaining URLs are still
        processed. *delay* seconds are slept after each successful download.
        """
        for i, url in enumerate(urls, 1):
            # Computed before the request so the log entry can always name it.
            imgPath = os.path.join(path, prefix + str(i).zfill(3) + ".jpg")
            try:
                with self.op.open(url) as f:
                    if f.status != 200:
                        continue
                    payload = f.read()
                with open(imgPath, 'wb') as o:
                    o.write(payload)
                print('成功下载 -> %s' % imgPath)
                if delay:
                    # Pause: crawling too fast is easy to detect.
                    time.sleep(delay)
            except Exception as e:
                self._logError(parentdir, e, tag + imgPath)

    def doDownLoadImgs(self, item, parentdir):
        """Download all images of *item* and write its info file.

        Files go to ``item.getSavePath(parentdir)``, which is created when
        missing. Errors are appended to ``log.txt`` in *parentdir* instead of
        being raised.
        """
        path = item.getSavePath(parentdir)
        os.makedirs(path, exist_ok=True)
        if self.op is None:
            self.op = self.getopener(self.headers)

        baseName = item.rep(item.title)
        # Staff notation.
        self._downloadImgs(item.staveImgs, path, baseName + "_stave_",
                           parentdir, ' stavD=> ')
        # Numbered notation.
        self._downloadImgs(item.numberImgs, path, baseName + "_number_",
                           parentdir, ' numbD=> ', delay=0.5)

        # Info file describing the item.
        infoPath = os.path.join(path, baseName + ".txt")
        try:
            # newline="" keeps the explicit "\r\n" from becoming "\r\r\n"
            # on platforms that translate "\n" when writing text.
            with open(infoPath, "w", newline="") as f:
                f.write(" Form:\t\t" + item.url + "\r\n")
                f.write(" ID:\t\t" + item.strid + "\r\n")
                f.write("UpdateDate:\t\t" + item.date + "\r\n")
                f.write(" Sorting:\t\tShawn\r\n")
        except Exception as e:
            self._logError(parentdir, e, ' writeInfo=> ' + infoPath)
        return

    def isNum(self, value):
        """Return True when ``value + 1`` does not raise ``TypeError``."""
        try:
            value + 1
        except TypeError:
            return False
        else:
            return True
import os.path


class EopPageItem(object):
    """One score listed on everyonepiano.cn.

    Holds the metadata scraped from the index page, the URLs of the
    staff-notation and numbered-notation pages derived from the id, and the
    image URL lists that are filled in later.
    """

    # Characters dropped from names used on disk: the ones Windows forbids
    # in file names (including the double quote) plus CR and LF.
    _STRIP_TABLE = str.maketrans('', '', '|:<>?*/\\"\r\n')

    def __init__(self, strid, url, date, title, author):
        """Store the scraped fields.

        :param strid: the score id as text; must be parseable by ``int``.
        :param url: address of the score page.
        :param date: update date as shown on the index page.
        :param title: title of the score.
        :param author: author shown next to the title.
        :raises ValueError: when *strid* is not an integer literal.
        """
        self.strid = str(strid)
        self.id = int(strid)
        self.url = str(url)
        self.date = str(date)
        self.title = str(title)
        self.author = str(author)
        # Staff-notation page and the image URLs found on it.
        self.staveUrl = 'http://www.everyonepiano.cn/Stave-' + str(self.id) + '.html'
        self.staveImgs = []
        # Numbered-notation page and the image URLs found on it.
        self.numberUrl = 'http://www.everyonepiano.cn/Number-' + str(self.id) + '.html'
        self.numberImgs = []

    def print_attrs(self):
        """Print id, url, date, title and author on one line."""
        print(
            'id:', self.id, ',',
            'url:', self.url, ',',
            'date:', self.date, ',',
            'title:', self.title, ',',
            'author:', self.author, ',',
        )

    def getSavePath(self, parentdir):
        """Return the folder for this score inside *parentdir*.

        The folder name combines title, author and id so that two scores
        are unlikely to end up in the same folder.
        """
        folder = self.rep(self.title) + " (" + self.rep(self.author) + ")_" + self.strid
        return os.path.join(parentdir, folder)

    def rep(self, val):
        """Return *val* without the characters in ``_STRIP_TABLE``, stripped of outer whitespace."""
        return val.translate(self._STRIP_TABLE).strip()
import codecs
import os.path
import shutil

# Work modes of the crawler.
MODE_OVERWRITE = 0         # scan every page, re-download what already exists
MODE_SKIP_EXISTING = 1     # scan every page, skip what already exists
MODE_STOP_AT_EXISTING = 2  # crawl in descending id order until an existing item is met


def _logError(logDir, err, context):
    """Append the arguments of *err* followed by *context* to log.txt in *logDir*."""
    # str() on every argument: exception args are not guaranteed to be text.
    errinfo = ''.join(' ' + str(arg) for arg in err.args) + context
    with codecs.open(os.path.join(logDir, "log.txt"), "a", "UTF-8") as f:
        f.write(errinfo + "\r\n")


def crawl(ec, path, start, end, workMode=MODE_SKIP_EXISTING):
    """Download the scores listed on index pages *start* to *end* (inclusive).

    :param ec: crawler providing ``UrlPage``, ``getHtml``, ``getPageItems``,
        ``getImgUrls`` and ``doDownLoadImgs``.
    :param path: folder the scores are saved under; created when missing.
    :param start: first index page to process.
    :param end: last index page to process.
    :param workMode: one of the ``MODE_*`` constants.

    Whether an item was downloaded before is decided only by the existence
    of its save folder. Errors raised while handling an item are appended
    to ``log.txt`` in *path* and the crawl continues with the next item.
    """
    os.makedirs(path, exist_ok=True)
    for page in range(start, end + 1):
        pageUrl = ec.UrlPage + str(page)
        html = ec.getHtml(pageUrl)
        if html is None:
            continue
        print("loaded =>" + pageUrl)
        items = ec.getPageItems(html)
        for j, item in enumerate(items, 1):
            print(str(page) + ":" + str(j) + " / " + str(len(items)))
            savePath = item.getSavePath(path)
            try:
                if os.path.exists(savePath):
                    if workMode == MODE_SKIP_EXISTING:
                        continue
                    if workMode == MODE_STOP_AT_EXISTING:
                        # Stop the whole crawl, not just the current page.
                        return
                    # Overwrite mode: remove the old folder and download again.
                    shutil.rmtree(savePath)
                item = ec.getImgUrls(item)
                ec.doDownLoadImgs(item, path)
            except Exception as e:
                _logError(path, e, ' main=> ' + savePath)


if __name__ == '__main__':
    # Only needed when run as a script.
    from EopCrawler import EopCrawler

    # Raw string: "\P" is not a valid escape sequence.
    crawl(EopCrawler(), r"D:\Piano", start=109, end=109,
          workMode=MODE_SKIP_EXISTING)