├── .gitignore
├── 302count.txt
├── README.md
├── bdmms
│   ├── __init__.py
│   ├── items.py
│   ├── models.py
│   ├── pipelines.py
│   ├── rotate_useragent.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── bdmmspider.py
├── clear_stats.py
├── dog.py
└── scrapy.cfg

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
.ropeproject
*.db
bdmms.log

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/302count.txt:
--------------------------------------------------------------------------------
61

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Baidu Music Spider
==================

Start the spider with:

    $ python dog.py

No download delay is configured, so Baidu blocks the spider (in testing, even a
10-second delay still got blocked). A block happens roughly every 30 minutes and
lasts about 20 minutes. When a block hits (Baidu 302-redirects to a captcha
page), dog.py stops the crawl automatically, waits 10 minutes, and then tries to
restart the spider; if it is still getting 302s it waits another 10 minutes, and
once the ban is lifted crawling resumes.

Redis (via scrapy_redis) is used so an interrupted crawl can be resumed.

--------------------------------------------------------------------------------
/bdmms/__init__.py:
--------------------------------------------------------------------------------
__doc__ = '**bdmms** means: **Baidu Mp3 Metadata Spider**'

--------------------------------------------------------------------------------
/bdmms/items.py:
--------------------------------------------------------------------------------
# coding: u8

from scrapy.item import Item, Field


class BdmmsItem(Item):
    # Song title
    song_name = Field()
    # URL of the song on Baidu MP3
    song_link = Field()

    # Singer name
    singer = Field()
    # Singer cover image URL
    singer_face = Field()

    # Album the song belongs to
    album_name = Field()
    album_link = Field()
    # Album release date
    release_date = Field()
    # Record company
    company = Field()
    # Album cover image URL
    album_cover = Field()
    # Album introduction
    album_intro = Field()

    # Tags
    tags = Field()

    # Lyrics
    lrc = Field()

    def copy(self):
        '''The docs say Item supports copy(), but it raises an error here,
        so implement it manually.'''
        return BdmmsItem(dict(self))
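A minimal sketch of why that manual `copy()` matters: the spider reuses one singer-level item for every song of that singer, so each song needs its own copy before the per-song fields are filled in. This mirrors what `parse_song_page` does further down; the singer name, URLs and song titles here are made up for illustration.

    # Python 2 sketch of the copy-per-song pattern used by the spider.
    from bdmms.items import BdmmsItem

    base = BdmmsItem(singer=u'some singer',
                     singer_face=u'http://example.com/face.jpg')

    per_song = []
    for name, link in [(u'song a', u'/song/1'), (u'song b', u'/song/2')]:
        item = base.copy()   # without the copy, both entries would end up identical
        item['song_name'] = name
        item['song_link'] = link
        per_song.append(item)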
--------------------------------------------------------------------------------
/bdmms/models.py:
--------------------------------------------------------------------------------
#coding:utf-8

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String, Integer, Text, Date
from sqlalchemy.schema import PrimaryKeyConstraint


Base = declarative_base()


class Singer(Base):
    '''Singer table'''
    __tablename__ = 'singer'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    # Singer name
    name = Column(String(50), index=True)
    # Cover image URL
    face = Column(String(200))


class Tag(Base):
    '''Tag table'''
    __tablename__ = 'tag'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(20), nullable=True, index=True)


class Album(Base):
    '''Album table'''
    __tablename__ = 'album'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    # Album name
    name = Column(String(100), nullable=True, index=True)
    # Introduction
    intro = Column(Text)
    # Release date
    rdt = Column(Date)
    # Record company
    corp = Column(String(50))
    # Cover image URL
    cover = Column(String(200))


class Song(Base):
    '''Song table'''
    __tablename__ = 'song'

    pk = Column(Integer, primary_key=True, autoincrement=True)
    # Song title
    name = Column(String(100), nullable=True, index=True)
    # Singer id
    singer = Column(Integer, nullable=True, index=True)
    # Album id
    album = Column(Integer)
    # Lyrics
    lrc = Column(Text)


class SongTag(Base):
    '''Song-tag relation table'''
    __tablename__ = 'song_and_tag'
    __table_args__ = (PrimaryKeyConstraint('sid', 'tid', name='sid_tid_pkc'),)

    # Song id
    sid = Column(Integer, nullable=True, index=True)
    # Tag id
    tid = Column(Integer, nullable=True, index=True)


def init_db():
    # Create all the tables defined above
    import settings
    metadata = Base.metadata
    metadata.create_all(settings.engine)


if __name__ == '__main__':
    init_db()
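A minimal sketch of exercising these models outside the crawl, assuming the MySQL database configured in bdmms/settings.py below is reachable; the singer name and face URL are invented for illustration.

    # Python 2 sketch; run from inside the bdmms/ package directory,
    # matching the bare imports used by pipelines.py.
    from models import Singer, init_db
    from settings import db

    init_db()  # create the singer/tag/album/song/song_and_tag tables

    singer = Singer(name=u'test singer', face=u'http://example.com/face.jpg')
    db.add(singer)
    db.commit()

    print db.query(Singer).filter_by(name=u'test singer').count()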
--------------------------------------------------------------------------------
/bdmms/pipelines.py:
--------------------------------------------------------------------------------
# coding: u8

from scrapy.http import Request
from scrapy.exceptions import DropItem

from settings import db
from models import Singer, Album, Tag, Song, SongTag


class Empty(object):
    def __getattr__(self, k):
        return None


class BdmmsPipeline(object):
    def __init__(self):
        pass

    def process_item(self, item, spider):
        if item.get('song_name') is None:
            # The AJAX pagination for this singer is exhausted
            raise DropItem('ajax page over.')
        singer = db.query(
            Singer.pk).filter_by(face=item['singer_face']).first()
        if singer is None:
            singer = Singer(name=item['singer'], face=item['singer_face'])
            db.add(singer)

        album_name = item.get('album_name')
        if album_name is not None:
            cover = item.get('album_cover')
            album = db.query(Album.pk).filter_by(cover=cover).first()
            if album is None:
                album = Album(
                    name=album_name,
                    intro=item.get('album_intro'),
                    rdt=item['release_date'],
                    cover=cover)
                db.add(album)
        else:
            album = Empty()

        db.commit()

        lrc = item.get('lrc')
        song = db.query(Song).filter_by(
            name=item['song_name'], singer=singer.pk).first()
        if song is None:
            song = Song(
                name=item['song_name'],
                singer=singer.pk,
                album=album.pk,
                lrc=lrc)
            db.add(song)
            db.commit()
        elif None not in (lrc, song.lrc):
            # The song already exists; refresh its lyrics when both the
            # stored and the newly scraped lyrics are present.
            song.lrc = lrc

        tag_objs = []
        for tag in item['tags']:
            t = db.query(Tag.pk).filter_by(name=tag).first()
            if t is None:
                t = Tag(name=tag)
                db.add(t)
            tag_objs.append(t)
        db.commit()

        for tag in tag_objs:
            db.merge(SongTag(sid=song.pk, tid=tag.pk))
        db.commit()

        return item
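For a quick manual check of the de-duplication logic above, one might feed a single hand-built item through the pipeline outside of Scrapy. This is only a sketch: it needs the same MySQL database, and every field value here is invented.

    # Python 2 sketch; run from inside the bdmms/ package directory.
    from items import BdmmsItem
    from pipelines import BdmmsPipeline

    item = BdmmsItem(
        song_name=u'test song',
        song_link=u'/song/123',
        singer=u'test singer',
        singer_face=u'http://example.com/face.jpg',
        tags=[u'pop'],
    )
    # Running this twice should still leave a single 'test song' row,
    # because the pipeline looks the song up by name and singer first.
    BdmmsPipeline().process_item(item, spider=None)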
--------------------------------------------------------------------------------
/bdmms/rotate_useragent.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
#-*-coding:utf-8-*-

import random

from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware


class RotateUserAgentMiddleware(UserAgentMiddleware):
    """A User-Agent middleware that rotates the User-Agent header while
    crawling: every request gets a random entry from the user_agent_list
    defined below.
    """

    # The default user_agent_list covers Chrome, IE, Firefox, Mozilla,
    # Opera and Netscape.
    # More user-agent strings can be found at
    # http://www.useragentstring.com/pages/useragentstring.php
    user_agent_list = [
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17',

        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.2; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
        'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
        'Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',

        'Mozilla/6.0 (Windows NT 6.2; WOW64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:15.0) Gecko/20120910144328 Firefox/15.0.2',

        'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9a3pre) Gecko/20070330',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13; ) Gecko/20101203',

        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        'Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52',

        'Mozilla/5.0 (Windows; U; Win 9x 4.90; SG; rv:1.9.2.4) Gecko/20101104 Netscape/9.1.0285',
        'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.8.1.7pre) Gecko/20070815 Firefox/2.0.0.6 Navigator/9.0b3',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
    ]

    def process_request(self, request, spider):
        ua = random.choice(self.user_agent_list)
        request.headers.setdefault('User-Agent', ua)

--------------------------------------------------------------------------------
/bdmms/settings.py:
--------------------------------------------------------------------------------
# coding: u8

BOT_NAME = 'bdmms'

SPIDER_MODULES = ['bdmms.spiders']
NEWSPIDER_MODULE = 'bdmms.spiders'

# Download delay in seconds (on top of DOWNLOAD_DELAY, Scrapy by default
# multiplies in a random factor between 0.5 and 1.5).
# Baidu answers with a captcha page when it blocks us; a longer delay keeps
# the spider from being forced to stop.
#DOWNLOAD_DELAY = 2

# Disable cookies
#COOKIES_ENABLED = False

#LOG_LEVEL = 'WARNING'

ITEM_PIPELINES = [
    'bdmms.pipelines.BdmmsPipeline',
    'scrapy_redis.pipelines.RedisPipeline',
]

DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'bdmms.rotate_useragent.RotateUserAgentMiddleware': 400,
}

# Database settings
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, scoped_session

DB_NAME = 'baidu_music_metadata'
DB_USER = 'root'
DB_PASS = '111'
DB_HOST_M = '127.0.0.1'
DB_PORT = 3306

engine = create_engine(
    'mysql://%s:%s@%s:%s/%s?charset=utf8' %
    (DB_USER, DB_PASS, DB_HOST_M, DB_PORT, DB_NAME),
    encoding='utf8',
    echo=False,
)
db = scoped_session(sessionmaker(bind=engine))


# scrapy_redis: persistent scheduler so an interrupted crawl can be resumed
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True

--------------------------------------------------------------------------------
/bdmms/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/bdmms/spiders/bdmmspider.py:
--------------------------------------------------------------------------------
# coding: u8

import json
from itertools import izip

from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc

from bdmms.items import BdmmsItem


import logging
from scrapy.log import ScrapyFileLogObserver

logfile = open('bdmms.log', 'a')
log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
log_observer.start()


class BdmmSpider(BaseSpider):
    # Scrapy built-in attributes
    name = 'bdmms'
    allowed_domains = ['music.baidu.com']
    start_urls = ['http://music.baidu.com/artist']

    # Custom attributes
    host = 'http://music.baidu.com'

    def parse(self, response):
        '''Start crawling from the entry URL (the artist list page).'''

        a = '/html/body/div[3]/div/div/div[3]/ul/li[position()>1]/ul/li/a/'
        singer_names = self._query(a + 'text()', response)
        singer_links = self._query(a + '@href', response)

        # Crawl each singer's own page
        for name, link in izip(singer_names, singer_links):
            yield Request(
                url=self.host + link,
                meta={'item': BdmmsItem(singer=name)},
                callback=self.parse_single_singer)

    def parse_single_singer(self, response):
        '''Scrape the singer's info and song list from the singer's page.'''
        # The singer's id
        artist_id = response.url.strip('/').rsplit('/', 1)[1]

        item = response.meta['item']
        item['singer_face'] = self._get0(self._query(
            '//*[@id="baseInfo"]//span[@class="cover"]/img/@src', response))

        # AJAX URL for the paginated song list
        plink = self.host + '/data/user/getsongs?start={0}&ting_uid={1}'
        plink += '&order=hot&.r={2}'
        start = 0
        step = 20
        page_nums = self._query(
            '//div[@id="songList"]//*[contains(@class, "navigator")]/text()',
            response,
            False).re('\d+')
        stop = (int(page_nums[-1]) - 1) * 20 if page_nums else 20
        stop += 1
        # Walk through every page (the AJAX endpoint returns JSON)
        while start < stop:
            yield Request(
                url=plink.format(start, artist_id, self._r()),
                meta={'item': item},
                callback=self.parse_song_page)
            start += step

    def parse_single_song(self, response):
        '''Parse a single song page.'''
        item = response.meta['item']
        base_info = '//ul[contains(@class, "base-info")]/li/'

        a = base_info + 'a[contains(@href, "/album/")]/'
        album = self._query(a + 'text()', response)
        album_name = self._get0(album)
        if album_name is not None:
            album_name = album_name.strip(u'《》').strip()
        item['album_name'] = album_name
        item['album_link'] = self._get0(self._query(a + '@href', response))

        item['release_date'] = self._get0(self._query(
            base_info + 'text()', response, False).re('\d{4}-\d{2}-\d{2}'))
        item['tags'] = self._query(
            base_info + '/a[@class="tag-list"]/text()', response)

        lrc_link = self._get0(self._query(
            '//a[@data-lyricdata]/@data-lyricdata', response))
        if lrc_link:
            lrc_link = self.host + json.loads(lrc_link)['href']
            return Request(
                url=lrc_link,
                meta={'item': item},
                callback=self.parse_lrc)
        elif item['album_link']:
            return self._request_get_album(item)
        else:
            return item

    def _request_get_album(self, item):
        return Request(
            url=self.host + item['album_link'],
            meta={'item': item},
            callback=self.parse_album)

    def parse_lrc(self, response):
        '''Fetch the lyrics.'''
        item = response.meta['item']
        item['lrc'] = response.body
        if item['album_link']:
            return self._request_get_album(item)
        else:
            return item

    def parse_album(self, response):
        '''Fetch the album info.'''
        item = response.meta['item']
        item['album_cover'] = self._get0(self._query(
            '//div[@class="album-info"]//span[@class="cover"]/img/@src',
            response))
        item['album_intro'] = self._get0(self._query(
            '//span[@class="description-all"]/text()', response))
        return item

    @staticmethod
    def _get0(x):
        return x[0].strip() if x else None

    def parse_song_page(self, response):
        '''Parse the data returned by the song-list pagination AJAX request.'''
        item = response.meta['item']
        html = response.body
        if 'title' not in html:
            yield item
        else:
            html = json.loads(html)['data']['html']
            response = response.replace(body=html)
            a = '//span[contains(@class, "song-title")]/a/'
            song_names = self._query(a + '@title', response)
            song_links = self._query(a + '@href', response)
            for name, link in izip(song_names, song_links):
                # Copy the item: the fields below differ per song, otherwise
                # later songs would overwrite earlier ones.
                item = item.copy()
                item['song_name'] = name.strip()
                item['song_link'] = link
                yield Request(
                    url=self.host + link,
                    meta={'item': item},
                    callback=self.parse_single_song)

    @staticmethod
    def _query(xpath, response, extract=True):
        ret = HtmlXPathSelector(response).select(xpath)
        return ret.extract() if extract else ret

    @staticmethod
    def _r():
        import random
        return str(random.random())

--------------------------------------------------------------------------------
/clear_stats.py:
--------------------------------------------------------------------------------
# coding: u8


import redis


REDIS_HOST = 'localhost'
REDIS_PORT = 6379


def clear_stats():
    # Truncate the log and drop the scrapy_redis state so the next crawl
    # starts from scratch.
    open('./bdmms.log', 'w').write('')
    server = redis.Redis(REDIS_HOST, REDIS_PORT)
    for key in ["bdmms:requests", "bdmms:items", "bdmms:dupefilter"]:
        server.delete(key)


if __name__ == "__main__":
    clear_stats()
--------------------------------------------------------------------------------
/dog.py:
--------------------------------------------------------------------------------
# coding: u8


# Watch the log file; whenever new 302s show up, stop the spider for a while
# and then start it again.

import commands
import os
import time


os.system('killall scrapy')
os.system('scrapy crawl bdmms&')

while 1:
    time.sleep(0.1)

    # Count the 302s in the log
    cnt_302 = int(commands.getstatusoutput('cat bdmms.log | grep -n "Redirecting (302)" | wc -l')[1])
    # Read the 302 count from last time
    last_cnt = int(open('./302count.txt').read().strip())

    if cnt_302 != last_cnt:
        # Write the new 302 count back to the file
        open('./302count.txt', 'w').write(str(cnt_302))

        # Kill the spider
        print 'killing...'
        os.system('killall scrapy')

        # Pause
        N = 10
        print 'sleeping %s minutes...' % N
        time.sleep(60 * N)

        # Restart the spider
        print 'starting scrapy...'
        # Just to be safe, kill it again
        os.system('killall scrapy')
        os.system('scrapy crawl bdmms&')

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = bdmms.settings

[deploy]
#url = http://localhost:6800/
project = bdmms

--------------------------------------------------------------------------------