├── .gitignore
├── 302count.txt
├── README.md
├── bdmms
│   ├── __init__.py
│   ├── items.py
│   ├── models.py
│   ├── pipelines.py
│   ├── rotate_useragent.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── bdmmspider.py
├── clear_stats.py
├── dog.py
└── scrapy.cfg

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
.ropeproject
*.db
bdmms.log

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/302count.txt:
--------------------------------------------------------------------------------
61

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Baidu Music Spider
==================

Start the spider with:

    $ python dog.py

No download delay is configured, so Baidu blocks the spider (in testing, even a
10-second delay still got blocked). A block happens roughly every 30 minutes and
lasts about 20 minutes. When a block hits (Baidu 302-redirects to a captcha
page), dog.py stops the crawl automatically, waits 10 minutes, and then tries to
restart the spider; if it is still getting 302s it waits another 10 minutes, and
once the ban is lifted crawling resumes.

Redis (via scrapy_redis) is used so an interrupted crawl can be resumed.

--------------------------------------------------------------------------------
/bdmms/__init__.py:
--------------------------------------------------------------------------------
__doc__ = '**bdmms** means: **Baidu Mp3 Metadata Spider**'

--------------------------------------------------------------------------------
/bdmms/items.py:
--------------------------------------------------------------------------------
# coding: u8

from scrapy.item import Item, Field


class BdmmsItem(Item):
    # Song title
    song_name = Field()
    # URL of the song on Baidu MP3
    song_link = Field()

    # Singer name
    singer = Field()
    # Singer cover image URL
    singer_face = Field()

    # Album the song belongs to
    album_name = Field()
    album_link = Field()
    # Album release date
    release_date = Field()
    # Record company
    company = Field()
    # Album cover image URL
    album_cover = Field()
    # Album introduction
    album_intro = Field()

    # Tags
    tags = Field()

    # Lyrics
    lrc = Field()

    def copy(self):
        '''The docs say Item supports copy(), but it raises an error here,
        so implement it manually.'''
        return BdmmsItem(dict(self))
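A minimal sketch of why that manual `copy()` matters: the spider reuses one singer-level item for every song of that singer, so each song needs its own copy before the per-song fields are filled in. This mirrors what `parse_song_page` does further down; the singer name, URLs and song titles here are made up for illustration.

    # Python 2 sketch of the copy-per-song pattern used by the spider.
    from bdmms.items import BdmmsItem

    base = BdmmsItem(singer=u'some singer',
                     singer_face=u'http://example.com/face.jpg')

    per_song = []
    for name, link in [(u'song a', u'/song/1'), (u'song b', u'/song/2')]:
        item = base.copy()   # without the copy, both entries would end up identical
        item['song_name'] = name
        item['song_link'] = link
        per_song.append(item)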
--------------------------------------------------------------------------------
/bdmms/models.py:
--------------------------------------------------------------------------------
#coding:utf-8

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String, Integer, Text, Date
from sqlalchemy.schema import PrimaryKeyConstraint


Base = declarative_base()


class Singer(Base):
    '''Singer table'''
    __tablename__ = 'singer'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    # Singer name
    name = Column(String(50), index=True)
    # Cover image URL
    face = Column(String(200))


class Tag(Base):
    '''Tag table'''
    __tablename__ = 'tag'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(20), nullable=True, index=True)


class Album(Base):
    '''Album table'''
    __tablename__ = 'album'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    # Album name
    name = Column(String(100), nullable=True, index=True)
    # Introduction
    intro = Column(Text)
    # Release date
    rdt = Column(Date)
    # Record company
    corp = Column(String(50))
    # Cover image URL
    cover = Column(String(200))


class Song(Base):
    '''Song table'''
    __tablename__ = 'song'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    # Song title
    name = Column(String(100), nullable=True, index=True)
    # Singer id
    singer = Column(Integer, nullable=True, index=True)
    # Album id
    album = Column(Integer)
    # Lyrics
    lrc = Column(Text)


class SongTag(Base):
    '''Song-tag relation table'''
    __tablename__ = 'song_and_tag'
    __table_args__ = (PrimaryKeyConstraint('sid', 'tid', name='sid_tid_pkc'),)

    # Song id
    sid = Column(Integer, nullable=True, index=True)
    # Tag id
    tid = Column(Integer, nullable=True, index=True)


def init_db():
    # Create all the tables defined above
    import settings
    metadata = Base.metadata
    metadata.create_all(settings.engine)


if __name__ == '__main__':
    init_db()
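A minimal sketch of exercising these models outside the crawl, assuming the MySQL database configured in bdmms/settings.py below is reachable; the singer name and face URL are invented for illustration.

    # Python 2 sketch; run from inside the bdmms/ package directory,
    # matching the bare imports used by pipelines.py.
    from models import Singer, init_db
    from settings import db

    init_db()  # create the singer/tag/album/song/song_and_tag tables

    singer = Singer(name=u'test singer', face=u'http://example.com/face.jpg')
    db.add(singer)
    db.commit()

    print db.query(Singer).filter_by(name=u'test singer').count()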
--------------------------------------------------------------------------------
/bdmms/pipelines.py:
--------------------------------------------------------------------------------
# coding: u8

from scrapy.http import Request
from scrapy.exceptions import DropItem

from settings import db
from models import Singer, Album, Tag, Song, SongTag


class Empty(object):
    def __getattr__(self, k):
        return None


class BdmmsPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        if item.get('song_name') is None:
            # The AJAX pagination for this singer is exhausted
            raise DropItem('ajax page over.')
        singer = db.query(
            Singer.pk).filter_by(face=item['singer_face']).first()
        if singer is None:
            singer = Singer(name=item['singer'], face=item['singer_face'])
            db.add(singer)

        album_name = item.get('album_name')
        if album_name is not None:
            cover = item.get('album_cover')
            album = db.query(Album.pk).filter_by(cover=cover).first()
            if album is None:
                album = Album(
                    name=album_name,
                    intro=item.get('album_intro'),
                    rdt=item['release_date'],
                    cover=cover)
                db.add(album)
        else:
            album = Empty()

        db.commit()

        lrc = item.get('lrc')
        song = db.query(Song).filter_by(
            name=item['song_name'], singer=singer.pk).first()
        if song is None:
            song = Song(
                name=item['song_name'],
                singer=singer.pk,
                album=album.pk,
                lrc=lrc)
            db.add(song)
            db.commit()
        elif None not in (lrc, song.lrc):
            # The song already exists; refresh its lyrics when both the
            # stored and the newly scraped lyrics are present.
            song.lrc = lrc

        tag_objs = []
        for tag in item['tags']:
            t = db.query(Tag.pk).filter_by(name=tag).first()
            if t is None:
                t = Tag(name=tag)
                db.add(t)
            tag_objs.append(t)
        db.commit()

        for tag in tag_objs:
            db.merge(SongTag(sid=song.pk, tid=tag.pk))
        db.commit()

        return item
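For a quick manual check of the de-duplication logic above, one might feed a single hand-built item through the pipeline outside of Scrapy. This is only a sketch: it needs the same MySQL database, and every field value here is invented.

    # Python 2 sketch; run from inside the bdmms/ package directory.
    from items import BdmmsItem
    from pipelines import BdmmsPipeline

    item = BdmmsItem(
        song_name=u'test song',
        song_link=u'/song/123',
        singer=u'test singer',
        singer_face=u'http://example.com/face.jpg',
        tags=[u'pop'],
    )
    # Running this twice should still leave a single 'test song' row,
    # because the pipeline looks the song up by name and singer first.
    BdmmsPipeline().process_item(item, spider=None)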
--------------------------------------------------------------------------------
/bdmms/rotate_useragent.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
#-*-coding:utf-8-*-

import random

from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    """A User-Agent middleware that rotates the User-Agent header while
    crawling: every request gets a random entry from the user_agent_list
    defined below.
    """

    # The default user_agent_list covers Chrome, IE, Firefox, Mozilla,
    # Opera and Netscape.
    # More user-agent strings can be found at
    # http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17',

        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
        'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
        'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',

        'Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2',

        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13; ) Gecko/20101203',

        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52',

        'Mozilla/5.0 (Windows; U; Win 9x 4.90; SG; rv:1.9.2.4) Gecko/20101104 Netscape/9.1.0285',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.1.7pre) Gecko/20070815 Firefox/2.0.0.6 Navigator/9.0b3',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    ]

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        request.headers.setdefault('User-Agent', ua)

--------------------------------------------------------------------------------
/bdmms/settings.py:
--------------------------------------------------------------------------------
# coding: u8

BOT_NAME = 'bdmms'

SPIDER_MODULES = ['bdmms.spiders']
NEWSPIDER_MODULE = 'bdmms.spiders'

# Download delay in seconds (on top of DOWNLOAD_DELAY, Scrapy by default
# multiplies in a random factor between 0.5 and 1.5).
# Baidu answers with a captcha page when it blocks us; a longer delay keeps
# the spider from being forced to stop.
#DOWNLOAD_DELAY = 2

# Disable cookies
#COOKIES_ENABLED = False

#LOG_LEVEL = 'WARNING'

ITEM_PIPELINES = [
    'bdmms.pipelines.BdmmsPipeline',
    'scrapy_redis.pipelines.RedisPipeline',
]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'bdmms.rotate_useragent.RotateUserAgentMiddleware': 400,
}

# Database settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session

DB_NAME = 'baidu_music_metadata'
DB_USER = 'root'
DB_PASS = '111'
DB_HOST_M = '127.0.0.1'
DB_PORT = 3306

engine = create_engine(
    'mysql://%s:%s@%s:%s/%s?charset=utf8' %
    (DB_USER, DB_PASS, DB_HOST_M, DB_PORT, DB_NAME),
    encoding='utf8',
    echo=False,
)
db = scoped_session(sessionmaker(bind=engine))


# scrapy_redis: persistent scheduler so an interrupted crawl can be resumed
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

--------------------------------------------------------------------------------
/bdmms/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/bdmms/spiders/bdmmspider.py:
--------------------------------------------------------------------------------
# coding: u8

import json
from itertools import izip

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc

from bdmms.items import BdmmsItem


import logging
from scrapy.log import ScrapyFileLogObserver

logfile = open('bdmms.log', 'a')
log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
log_observer.start()


class BdmmSpider(BaseSpider):
    # Scrapy built-in attributes
    name = 'bdmms'
    allowed_domains = ['music.baidu.com']
    start_urls = ['http://music.baidu.com/artist']

    # Custom attributes
    host = 'http://music.baidu.com'

    def parse(self, response):
        '''Start crawling from the entry URL (the artist list page).'''

        a = '/html/body/div[3]/div/div/div[3]/ul/li[position()>1]/ul/li/a/'
        singer_names = self._query(a + 'text()', response)
        singer_links = self._query(a + '@href', response)

        # Crawl each singer's own page
        for name, link in izip(singer_names, singer_links):
            yield Request(
                url=self.host + link,
                meta={'item': BdmmsItem(singer=name)},
                callback=self.parse_single_singer)

    def parse_single_singer(self, response):
        '''Scrape the singer's info and song list from the singer's page.'''
        # The singer's id
        artist_id = response.url.strip('/').rsplit('/', 1)[1]

        item = response.meta['item']
        item['singer_face'] = self._get0(self._query(
            '//*[@id="baseInfo"]//span[@class="cover"]/img/@src', response))

        # AJAX URL for the paginated song list
        plink = self.host + '/data/user/getsongs?start={0}&ting_uid={1}'
        plink += '&order=hot&.r={2}'
        start = 0
        step = 20
        page_nums = self._query(
            '//div[@id="songList"]//*[contains(@class, "navigator")]/text()',
            response,
            False).re('\d+')
        stop = (int(page_nums[-1]) - 1) * 20 if page_nums else 20
        stop += 1
        # Walk through every page (the AJAX endpoint returns JSON)
        while start < stop:
            yield Request(
                url=plink.format(start, artist_id, self._r()),
                meta={'item': item},
                callback=self.parse_song_page)
            start += step

    def parse_single_song(self, response):
        '''Parse a single song page.'''
        item = response.meta['item']
        base_info = '//ul[contains(@class, "base-info")]/li/'

        a = base_info + 'a[contains(@href, "/album/")]/'
        album = self._query(a + 'text()', response)
        album_name = self._get0(album)
        if album_name is not None:
            album_name = album_name.strip(u'《》').strip()
        item['album_name'] = album_name
        item['album_link'] = self._get0(self._query(a + '@href', response))

        item['release_date'] = self._get0(self._query(
            base_info + 'text()', response, False).re('\d{4}-\d{2}-\d{2}'))
        item['tags'] = self._query(
            base_info + '/a[@class="tag-list"]/text()', response)

        lrc_link = self._get0(self._query(
            '//a[@data-lyricdata]/@data-lyricdata', response))
        if lrc_link:
            lrc_link = self.host + json.loads(lrc_link)['href']
            return Request(
                url=lrc_link,
                meta={'item': item},
                callback=self.parse_lrc)
        elif item['album_link']:
            return self._request_get_album(item)
        else:
            return item

    def _request_get_album(self, item):
        return Request(
            url=self.host + item['album_link'],
            meta={'item': item},
            callback=self.parse_album)

    def parse_lrc(self, response):
        '''Fetch the lyrics.'''
        item = response.meta['item']
        item['lrc'] = response.body
        if item['album_link']:
            return self._request_get_album(item)
        else:
            return item

    def parse_album(self, response):
        '''Fetch the album info.'''
        item = response.meta['item']
        item['album_cover'] = self._get0(self._query(
            '//div[@class="album-info"]//span[@class="cover"]/img/@src',
            response))
        item['album_intro'] = self._get0(self._query(
            '//span[@class="description-all"]/text()', response))
        return item

    @staticmethod
    def _get0(x):
        return x[0].strip() if x else None

    def parse_song_page(self, response):
        '''Parse the data returned by the song-list pagination AJAX request.'''
        item = response.meta['item']
        html = response.body
        if 'title' not in html:
            yield item
        else:
            html = json.loads(html)['data']['html']
            response = response.replace(body=html)
            a = '//span[contains(@class, "song-title")]/a/'
            song_names = self._query(a + '@title', response)
            song_links = self._query(a + '@href', response)
            for name, link in izip(song_names, song_links):
                # Copy the item: the fields below differ per song, otherwise
                # later songs would overwrite earlier ones.
                item = item.copy()
                item['song_name'] = name.strip()
                item['song_link'] = link
                yield Request(
                    url=self.host + link,
                    meta={'item': item},
                    callback=self.parse_single_song)

    @staticmethod
    def _query(xpath, response, extract=True):
        ret = HtmlXPathSelector(response).select(xpath)
        return ret.extract() if extract else ret

    @staticmethod
    def _r():
        import random
        return str(random.random())

--------------------------------------------------------------------------------
/clear_stats.py:
--------------------------------------------------------------------------------
# coding: u8


import redis


REDIS_HOST = 'localhost'
REDIS_PORT = 6379


def clear_stats():
    # Truncate the log and drop the scrapy_redis state so the next crawl
    # starts from scratch.
    open('./bdmms.log', 'w').write('')
    server = redis.Redis(REDIS_HOST, REDIS_PORT)
    for key in ["bdmms:requests", "bdmms:items", "bdmms:dupefilter"]:
        server.delete(key)


if __name__ == "__main__":
    clear_stats()
--------------------------------------------------------------------------------
/dog.py:
--------------------------------------------------------------------------------
# coding: u8


# Watch the log file; whenever new 302s show up, stop the spider for a while
# and then start it again.

import commands
import os
import time


os.system('killall scrapy')
os.system('scrapy crawl bdmms&')

while 1:
    time.sleep(0.1)

    # Count the 302s in the log
    cnt_302 = int(commands.getstatusoutput('cat bdmms.log | grep -n "Redirecting (302)" | wc -l')[1])
    # Read the 302 count from last time
    last_cnt = int(open('./302count.txt').read().strip())

    if cnt_302 != last_cnt:
        # Write the new 302 count back to the file
        open('./302count.txt', 'w').write(str(cnt_302))

        # Kill the spider
        print 'killing...'
        os.system('killall scrapy')

        # Pause
        N = 10
        print 'sleeping %s minutes...' % N
        time.sleep(60 * N)

        # Restart the spider
        print 'starting scrapy...'
        # Just to be safe, kill it again
        os.system('killall scrapy')
        os.system('scrapy crawl bdmms&')

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = bdmms.settings

[deploy]
#url = http://localhost:6800/
project = bdmms

--------------------------------------------------------------------------------