├── QQMusicSpider ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── music_spider.py ├── README.md ├── image └── jay.png ├── requirements.txt └── scrapy.cfg /QQMusicSpider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangjianxin1/QQMusicSpider/71617400ee3c4a4361c225917f18bfc8a1c32561/QQMusicSpider/__init__.py -------------------------------------------------------------------------------- /QQMusicSpider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QqmusicspiderItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | 16 | class MusicItem(scrapy.Item): 17 | # define the fields for your item here like: 18 | # name = scrapy.Field() 19 | singer_id = scrapy.Field() 20 | singer_mid = scrapy.Field() 21 | singer_name = scrapy.Field() 22 | subtitle = scrapy.Field() 23 | song_id = scrapy.Field() 24 | song_mid = scrapy.Field() 25 | song_name = scrapy.Field() 26 | song_time_public = scrapy.Field() 27 | lyric = scrapy.Field() 28 | album_name = scrapy.Field() 29 | language = scrapy.Field() 30 | song_type = scrapy.Field() 31 | hot_comments = scrapy.Field() 32 | song_url = scrapy.Field() 33 | # company = scrapy.Field() -------------------------------------------------------------------------------- /QQMusicSpider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import 
signals 9 | import random 10 | 11 | 12 | class MyUseragent(object): 13 | def process_request(self, request, spider): 14 | user_agent_list = [ 15 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 16 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 17 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 18 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 19 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 20 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 21 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 22 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 23 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 24 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 25 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 27 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 28 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 29 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 30 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 31 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, 
like Gecko) Chrome/19.0.1055.1 Safari/535.24", 32 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 33 | ] 34 | user_agent = random.choice(user_agent_list) 35 | request.headers['User_Agent'] = user_agent 36 | 37 | 38 | class QqmusicspiderSpiderMiddleware(object): 39 | # Not all methods need to be defined. If a method is not defined, 40 | # scrapy acts as if the spider middleware does not modify the 41 | # passed objects. 42 | 43 | @classmethod 44 | def from_crawler(cls, crawler): 45 | # This method is used by Scrapy to create your spiders. 46 | s = cls() 47 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 48 | return s 49 | 50 | def process_spider_input(self, response, spider): 51 | # Called for each response that goes through the spider 52 | # middleware and into the spider. 53 | 54 | # Should return None or raise an exception. 55 | return None 56 | 57 | def process_spider_output(self, response, result, spider): 58 | # Called with the results returned from the Spider, after 59 | # it has processed the response. 60 | 61 | # Must return an iterable of Request, dict or Item objects. 62 | for i in result: 63 | yield i 64 | 65 | def process_spider_exception(self, response, exception, spider): 66 | # Called when a spider or process_spider_input() method 67 | # (from other spider middleware) raises an exception. 68 | 69 | # Should return either None or an iterable of Response, dict 70 | # or Item objects. 71 | pass 72 | 73 | def process_start_requests(self, start_requests, spider): 74 | # Called with the start requests of the spider, and works 75 | # similarly to the process_spider_output() method, except 76 | # that it doesn’t have a response associated. 77 | 78 | # Must return only requests (not items). 
79 | for r in start_requests: 80 | yield r 81 | 82 | def spider_opened(self, spider): 83 | spider.logger.info('Spider opened: %s' % spider.name) 84 | 85 | 86 | class QqmusicspiderDownloaderMiddleware(object): 87 | # Not all methods need to be defined. If a method is not defined, 88 | # scrapy acts as if the downloader middleware does not modify the 89 | # passed objects. 90 | 91 | @classmethod 92 | def from_crawler(cls, crawler): 93 | # This method is used by Scrapy to create your spiders. 94 | s = cls() 95 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 96 | return s 97 | 98 | def process_request(self, request, spider): 99 | # Called for each request that goes through the downloader 100 | # middleware. 101 | 102 | # Must either: 103 | # - return None: continue processing this request 104 | # - or return a Response object 105 | # - or return a Request object 106 | # - or raise IgnoreRequest: process_exception() methods of 107 | # installed downloader middleware will be called 108 | return None 109 | 110 | def process_response(self, request, response, spider): 111 | # Called with the response returned from the downloader. 112 | 113 | # Must either; 114 | # - return a Response object 115 | # - return a Request object 116 | # - or raise IgnoreRequest 117 | return response 118 | 119 | def process_exception(self, request, exception, spider): 120 | # Called when a download handler or a process_request() 121 | # (from other downloader middleware) raises an exception. 
122 | 123 | # Must either: 124 | # - return None: continue processing this exception 125 | # - return a Response object: stops process_exception() chain 126 | # - return a Request object: stops process_exception() chain 127 | pass 128 | 129 | def spider_opened(self, spider): 130 | spider.logger.info('Spider opened: %s' % spider.name) 131 | -------------------------------------------------------------------------------- /QQMusicSpider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from QQMusicSpider.items import MusicItem 8 | import json 9 | from scrapy.exceptions import DropItem 10 | 11 | 12 | class DuplicatesPipeline(object): 13 | """ 14 | 根据音乐的song_id,对爬取过的音乐进行去重 15 | """ 16 | 17 | def __init__(self): 18 | self.song_ids = set() 19 | 20 | def process_item(self, item, spider): 21 | if isinstance(item, MusicItem): 22 | if item['song_id'] in self.song_ids: 23 | raise DropItem("Duplicate item found: %s" % item) 24 | else: 25 | self.song_ids.add(item['song_id']) 26 | return item 27 | 28 | 29 | class QqmusicspiderPipeline(object): 30 | def __init__(self): 31 | music_path = "music" 32 | self.file = open(music_path, "w", encoding="utf8") 33 | 34 | def process_item(self, item, spider): 35 | if isinstance(item, MusicItem): 36 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 37 | self.file.write(line) 38 | return item 39 | 40 | def close_spider(self, spider): 41 | self.file.close() 42 | -------------------------------------------------------------------------------- /QQMusicSpider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for QQMusicSpider project 4 | # 5 | # For simplicity, this file contains only 
settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'QQMusicSpider' 13 | 14 | SPIDER_MODULES = ['QQMusicSpider.spiders'] 15 | NEWSPIDER_MODULE = 'QQMusicSpider.spiders' 16 | 17 | # SINGER_PAGE_NUM = 9809 # 歌手列表的页数 18 | SINGER_PAGE_NUM = 40 # 歌手列表的页数 19 | SINGER_PAGE_SIZE = 80 # 歌手列表中,每页歌手的数量 20 | SONG_PAGE_NUM = 15 # 每个歌手的歌曲爬取的最大页数 21 | SONG_PAGE_SIZE = 100 # 每个歌手每页爬取多少条歌曲 22 | # SINGER_PAGE_NUM = 3 # 歌手列表的页数 23 | # SINGER_PAGE_SIZE = 80 # 歌手列表中,每页歌手的户数量 24 | # SONG_PAGE_NUM = 3 # 每个歌手的歌曲爬取的最大页数 25 | # SONG_PAGE_SIZE = 10 # 每个歌手每页爬取多少条歌曲 26 | 27 | # SONGER_NUM = 1000 # 每个歌手爬取多少条歌曲,最多只能设置1000 28 | 29 | 30 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 31 | # USER_AGENT = 'QQMusicSpider (+http://www.yourdomain.com)' 32 | 33 | # Obey robots.txt rules 34 | ROBOTSTXT_OBEY = False 35 | 36 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 37 | # CONCURRENT_REQUESTS = 32 38 | 39 | # Configure a delay for requests for the same website (default: 0) 40 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 41 | # See also autothrottle settings and docs 42 | # DOWNLOAD_DELAY = 1 43 | # The download delay setting will honor only one of: 44 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 45 | # CONCURRENT_REQUESTS_PER_IP = 16 46 | 47 | # Disable cookies (enabled by default) 48 | # COOKIES_ENABLED = False 49 | 50 | # Disable Telnet Console (enabled by default) 51 | # TELNETCONSOLE_ENABLED = False 52 | 53 | # Override the default request headers: 54 | # DEFAULT_REQUEST_HEADERS = { 55 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 56 | # 'Accept-Language': 'en', 57 | # } 58 | 59 | # Enable 
or disable spider middlewares 60 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 61 | # SPIDER_MIDDLEWARES = { 62 | # 'QQMusicSpider.middlewares.QqmusicspiderSpiderMiddleware': 543, 63 | # } 64 | 65 | # Enable or disable downloader middlewares 66 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 67 | # DOWNLOADER_MIDDLEWARES = { 68 | # 'QQMusicSpider.middlewares.QqmusicspiderDownloaderMiddleware': 543, 69 | # } 70 | DOWNLOADER_MIDDLEWARES = { 71 | 'QQMusicSpider.middlewares.MyUseragent': 543, 72 | } 73 | 74 | # Enable or disable extensions 75 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 76 | # EXTENSIONS = { 77 | # 'scrapy.extensions.telnet.TelnetConsole': None, 78 | # } 79 | 80 | # Configure item pipelines 81 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 82 | # ITEM_PIPELINES = { 83 | # 'QQMusicSpider.pipelines.QqmusicspiderPipeline': 300, 84 | # } 85 | ITEM_PIPELINES = { 86 | 'QQMusicSpider.pipelines.DuplicatesPipeline': 300, 87 | 'QQMusicSpider.pipelines.QqmusicspiderPipeline': 800, 88 | } 89 | 90 | # Enable and configure the AutoThrottle extension (disabled by default) 91 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 92 | # AUTOTHROTTLE_ENABLED = True 93 | # The initial download delay 94 | # AUTOTHROTTLE_START_DELAY = 5 95 | # The maximum download delay to be set in case of high latencies 96 | # AUTOTHROTTLE_MAX_DELAY = 60 97 | # The average number of requests Scrapy should be sending in parallel to 98 | # each remote server 99 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 100 | # Enable showing throttling stats for every response received: 101 | # AUTOTHROTTLE_DEBUG = False 102 | 103 | # Enable and configure HTTP caching (disabled by default) 104 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 105 | # HTTPCACHE_ENABLED = True 106 | # HTTPCACHE_EXPIRATION_SECS = 0 107 | # HTTPCACHE_DIR = 'httpcache' 108 
| # HTTPCACHE_IGNORE_HTTP_CODES = [] 109 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 110 | -------------------------------------------------------------------------------- /QQMusicSpider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /QQMusicSpider/spiders/music_spider.py: -------------------------------------------------------------------------------- 1 | from QQMusicSpider.items import MusicItem 2 | from scrapy import Request 3 | import json 4 | import re 5 | from scrapy.spiders import Spider 6 | 7 | 8 | # DmozSpider 9 | 10 | class QQMusicSpider(Spider): 11 | # 根据地区area参数筛选歌手,-100:全部,200:内地,2:港台,5:欧美,4:日本,3:韩国,6:其他 12 | name = "qqmusic" 13 | start_urls = [ 14 | 'https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22singerList%22%3A%7B%22module%22%3A%22Music.SingerListServer' \ 15 | '%22%2C%22method%22%3A%22get_singer_list%22%2C%22param%22%3A%7B%22area%22%3A{area}%2C%22sex%22%3A-100%2C%22genr' \ 16 | 'e%22%3A-100%2C%22index%22%3A-100%2C%22sin%22%3A{index}%2C%22cur_page%22%3A{cur_page}%7D%7D%7D' 17 | ] 18 | 19 | # 根据singerid获取歌曲num首歌曲 20 | song_list_url = "https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerSongList%22%3A%7B%22method%22%3A%22GetSingerSongList%22%2C%22param%22%3A%7B%22order%22%3A1%2C%22singerMid%22%3A%22{singer_mid}%22%2C%22begin%22%3A{begin}%2C%22num%22%3A{num}%7D%2C%22module%22%3A%22musichall.song_list_server%22%7D%7D" 21 | # 获取歌词,需要指定song_id 22 | lyric_url = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?nobase64=1&musicid={song_id}&format=json" 23 | # 获取歌词时,必须带上该referer header,需要指定song_mid 24 | referer = 
"https://y.qq.com/n/yqq/song/{song_mid}.html" 25 | # 歌曲评论,需要song_id 26 | comment_url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?biztype=1&topid={song_id}&cmd=8&pagenum={pagenum}&pagesize={pagesize}' 27 | # 歌曲的url,需要指定song_mid 28 | song_url = "https://y.qq.com/n/yqq/song/{song_mid}.html" 29 | # 记录爬虫当前爬取的歌曲数量 30 | music_num = 0 31 | 32 | def start_requests(self): 33 | for i in range(1, self.settings.get('SINGER_PAGE_NUM') + 1): # 在配置信息里获取歌手页数 34 | # 港台歌手 35 | request = Request( 36 | self.start_urls[0].format(index=self.settings.get('SINGER_PAGE_SIZE') * (i - 1), cur_page=i, 37 | area=2), 38 | callback=self.parse_singer) 39 | yield request 40 | # 内地歌手 41 | request = Request( 42 | self.start_urls[0].format(index=self.settings.get('SINGER_PAGE_SIZE') * (i - 1), cur_page=i, 43 | area=200), 44 | callback=self.parse_singer) 45 | yield request 46 | 47 | def parse_singer(self, response): 48 | """ 49 | 爬取歌手 50 | """ 51 | singer_list = json.loads(response.text).get('singerList').get('data').get('singerlist') # 获取歌手列表 52 | if singer_list: 53 | for singer in singer_list: 54 | singer_id = singer.get('singer_id') # 歌手id 55 | singer_mid = singer.get('singer_mid') # 歌手mid 56 | singer_name = singer.get('singer_name') # 歌手名字 57 | singer_pic = singer.get('singer_pic') # 歌手照片 58 | for page in range(0, self.settings.get("SONG_PAGE_NUM")): 59 | # 爬取歌手的歌曲 60 | request = Request( 61 | self.song_list_url.format(singer_mid=singer_mid, 62 | begin=page * self.settings.get('SONG_PAGE_SIZE'), 63 | num=self.settings.get('SONG_PAGE_SIZE')), 64 | callback=self.parse_song) 65 | yield request 66 | 67 | def parse_song(self, response): 68 | """ 69 | 爬取歌手的歌曲 70 | """ 71 | song_list = json.loads(response.text).get('singerSongList').get('data').get("songList") 72 | if song_list: 73 | for song in song_list: 74 | music_item = MusicItem() 75 | songInfo = song.get('songInfo') 76 | singer_name = [] # 歌手名字 77 | singer_id = [] 78 | singer_mid = [] 79 | for singer in songInfo.get('singer'): 80 | 
singer_name.append(singer.get("name")) 81 | singer_id.append(singer.get("id")) 82 | singer_mid.append(singer.get("mid")) 83 | music_item["singer_name"] = singer_name # 歌手名 84 | music_item["song_name"] = songInfo.get('title') # 歌曲名字 85 | music_item["subtitle"] = songInfo.get('subtitle') # 歌曲子标题,即歌曲名称后面括号的部分 86 | music_item["album_name"] = songInfo.get('album').get('name') # 专辑名字 87 | music_item["singer_id"] = singer_id 88 | music_item["singer_mid"] = singer_mid 89 | music_item["song_time_public"] = songInfo.get('time_public') # 歌曲发布时间 90 | music_item["song_type"] = songInfo.get('type') # 歌曲风格,是个数字 91 | music_item["language"] = songInfo.get('language') # 歌曲语种,是个数字 92 | music_item["song_id"] = songInfo.get('id') 93 | music_item["song_mid"] = songInfo.get('mid') 94 | music_item["song_url"] = self.song_url.format(song_mid=songInfo.get('mid')) 95 | request = Request(url=self.comment_url.format(song_id=music_item["song_id"], pagenum=0, pagesize=20), 96 | callback=self.parse_comments, 97 | meta={'music_item': music_item}) 98 | yield request 99 | 100 | def parse_comments(self, response): 101 | """ 102 | 爬取歌曲的评论 103 | """ 104 | music_item = response.meta.get('music_item') 105 | hot_comments = json.loads(response.text).get('hot_comment').get('commentlist') # 精彩评论 106 | if hot_comments: 107 | hot_comments = [{'comment_name': comment.get('nick'), 'comment_text': comment.get('rootcommentcontent')} for 108 | comment in hot_comments] 109 | else: 110 | hot_comments = 'null' 111 | music_item['hot_comments'] = hot_comments 112 | # 请求歌词需要加上referer,否则无法返回结果 113 | request = Request(url=self.lyric_url.format(song_id=music_item["song_id"]), 114 | callback=self.parse_lyric, 115 | meta={'music_item': music_item}) 116 | request.headers['referer'] = self.referer.format(song_mid=music_item["song_mid"]) 117 | yield request 118 | 119 | def process_lyric(self, lyric): 120 | """ 121 | 处理歌词信息 122 | :param lyric: 123 | :return: 124 | """ 125 | # 返回的单词格式有两种 126 | re_lyric = 
re.findall(r'[[0-9]+&#[0-9]+;[0-9]+&#[0-9]+;[0-9]+].*', lyric) 127 | if re_lyric: # 以 128 | lyric = re_lyric[0] 129 | lyric = lyric.replace(" ", " ") 130 | lyric = lyric.replace("(", "(") 131 | lyric = lyric.replace(")", ")") 132 | lyric = lyric.replace("-", "-") 133 | lyric = lyric.replace(" ", "") 134 | lyric = lyric.replace("'", "'") 135 | result = [] 136 | for sentence in re.split(u"[[0-9]+&#[0-9]+;[0-9]+&#[0-9]+;[0-9]+]", lyric): 137 | if sentence.strip() != "": 138 | result.append(sentence) 139 | return "\\n".join(result) 140 | else: 141 | lyric = lyric.replace(" ", " ") 142 | lyric = lyric.replace("(", "(") 143 | lyric = lyric.replace(")", ")") 144 | lyric = lyric.replace("-", "-") 145 | lyric = lyric.replace(" ", "\\n") 146 | lyric = lyric.replace("'", "'") 147 | return lyric 148 | 149 | def parse_lyric(self, response): 150 | """ 151 | 爬取歌曲的歌词 152 | :param response: 153 | :return: 154 | """ 155 | music_item = response.meta.get('music_item') 156 | response_dict = json.loads(response.text) 157 | if response_dict["retcode"] == 0: # 有歌词 158 | raw_lyric = response_dict["lyric"] 159 | lyric = self.process_lyric(raw_lyric) 160 | music_item["lyric"] = lyric 161 | self.music_num += 1 162 | print("成功爬取第{}条歌曲".format(self.music_num)) 163 | yield music_item 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QQ音乐爬虫(with scrapy)/QQ Music Spider 2 | 3 | ## UPDATE 2019.12.23 4 | 已实现对QQ音乐文件的下载,出于版权考虑,不对此部分代码进行公开。**此项目仅作为学习交流使用,支持正版,人人有责** 5 | 6 | ![jaychou](image/jay.png) 7 | 8 | ## 项目介绍 9 | - 在写一个项目的时候需要用到一些音乐的信息,但是在网上找了许久也没找到满意的音乐语料,于是便用scrapy写了一个QQ音乐的爬虫 10 | - 由于本人只需要用到中文歌曲,所以仅使用该爬虫爬取了QQ音乐中排名前6400名的内地和港台歌手的49万+的歌曲信息,该资源也分享到了百度云(**该资源仅用于学习交流,请勿用于商业用途,如有侵权,请联系删除**) 11 | - QQ音乐的歌曲信息是使用js动态填充的,虽然QQ音乐的歌手和歌曲信息是使用GET进行明文请求,但是开发人员却在url请求参数上添加了一些冗余信息,对参数进行了一些加密,因此大部分精力还是花在了解析url上 12 | 13 | ## 运行环境 14 | scrapy==1.5.1 15 | 16 | ## 使用方法 17 | 
进入项目根目录,运行如下命令即可:scrapy crawl qqmusic 18 | 19 | ## 歌曲格式 20 | 爬取到的歌曲信息保存在根目录下的music文件中,每一行表示一条歌曲,存储格式为json 21 | 22 | 歌曲的字段说明: 23 | - singer_name:歌手名称,数组形式,因为一首歌可能由多名歌手合唱 24 | - song_name:歌曲名称 25 | - subtitle:歌曲的子标题 26 | - album_name:专辑名称 27 | - singer_id:歌手id,数组形式 28 | - singer_mid:歌手的mid,数组形式 29 | - song_time_public:歌曲发行时间 30 | - song_type:歌曲类型 31 | - language:歌曲语种 32 | - song_id:歌曲id 33 | - song_mid:歌曲mid 34 | - song_url:歌曲播放的url 35 | - lyric:歌词 36 | - hot_comments:歌曲的精彩评论(此处只爬取了歌曲的精彩评论,部分比较冷门的歌曲有最新评论,但是没有精彩评论),数组形式。若无精彩评论,置为"null" 37 | - comment_name:评论者的昵称 38 | - comment_text:评论内容 39 | 40 | 爬取[说好不哭(with 五月天阿信)](https://y.qq.com/n/yqq/song/001qvvgF38HVc4.html)样例: 41 | ``` 42 | { 43 | 'singer_name': ['周杰伦'], 44 | 'song_name': '说好不哭(with 五月天阿信)', 45 | 'subtitle': '', 46 | 'album_name': '说好不哭(with 五月天阿信)', 47 | 'singer_id': [4558], 48 | 'singer_mid': ['0025NhlN2yWrP4'], 49 | 'song_time_public': '2019-09-16', 50 | 'song_type': 0, 51 | 'language': 0, 52 | 'song_id': 237773700, 53 | 'song_mid': '001qvvgF38HVc4', 54 | 'song_url': 'https://y.qq.com/n/yqq/song/001qvvgF38HVc4.html', 55 | 'hot_comments': [{ 56 | 'comment_name': 'Cohen', 57 | 'comment_text': '《说好不哭》是听众很期待的一个作品,前奏钢琴曲的渐入和后续的弦乐给整首歌奠定了温暖的基调。不知道大家有没有好好去看了这首歌的背景介绍“这是一首关于「约定」和「成全」的情歌,整首歌以钢琴为主要故事线,弦乐编织出抒情场景,营造出一种爱情电影的氛围”。依然的周式情歌,杰伦最总拿捏住每一首歌的感情基调,总能用最精妙的词曲和编曲打动他的听众。\\n\\n这首歌当然在合乎听众期待的同时,也给予了我们些许特别,第二段阿信的加入相信会令不少听众感到惊喜。对于我来说,我最喜欢整首歌的钢琴曲,温暖纯净感动,或许初遍听并非能完全理解杰伦想要告诉我们怎样的一个爱情故事,但是我们也能从简单的歌词和干净的旋律里感受到情感的部分。' 58 | }, { 59 | 'comment_name': '野渡无人', 60 | 'comment_text': '“我坚持我的音乐,我喜欢我的音乐,谁叫我是周杰伦。”或许喜欢他,正是因为他对音乐独有的专注认真,才令人对这个音乐才子着迷。\\n我相信很多人的青春里都有他,从曾经除了才华一无所有,无人知晓的少年,到如今的现代流行天王,周杰伦用一首首歌陪伴了我们整个青春,说生如逆旅,多少迷茫和气馁,却总能在他的歌声里找回自己。今岁不复去岁,人生这条人行道上,兜兜转转,还是一个周杰伦。\\n而这首《说好不哭》更像是一个约定,“你会微笑放手 说好不哭让我走” ,而我们不远万里赴约,这就是周杰伦,即青春。\\n' 61 | }, { 62 | 'comment_name': '是硬糖啊', 63 | 'comment_text': 
'对于很多80后90后来说,青春或许就这样悄然慢慢过去了,我依稀还记得那个夏天里七里香的香味,还有那个唱着《简单爱》,表情酷酷的男孩。\\n静静听了一遍,还是那个熟悉的风格,那个熟悉的Jay。也许杰伦的嗓音已经有了变化,但是在歌里他依旧延续着那份感动。\\n喜欢Jay的歌,忘不了有多少个夜晚的歌单循环,忘不了有多少次的辗转反侧。他的歌曲,经常会在某个时间点引发你的共鸣。\\n说好不哭,可是有多少人还是会泪流满面,可能是因为40岁的杰伦终于发了新歌,也可能是因为有些过往,有些感动,真的再也回不去了……' 64 | }, { 65 | 'comment_name': '墨染栀', 66 | 'comment_text': 'Jay式情歌重磅回归,期待了已久的歌。方文山再次作词走心之作,前奏已沉醉。最惊喜的是阿信献声。一个弹琴一个弹吉他,简直是神仙合作。1分40秒左右旋律是五月天的“突然好想你,你会在哪里?”。1分47秒的旋律是《说好的幸福呢》,然后进入阿信的part。杰伦歌里总能藏着这些细腻。我们的青春圆满了。缓缓的钢琴声诉说着凄美的爱情。有一种爱叫默默奉献,无需言语。有一种爱叫放手,你过得幸福就好。距上一次发新歌时隔一年了,40岁的周董是多少人的青春年华,熟悉的旋律是不是依旧耳边回荡。从第一张个人专辑《Jay》到如今,我们热情不减,周董惊喜不断。周杰伦——一个代表时代歌手拥有的魅力。说好不哭,你哭了吗?' 67 | }, { 68 | 'comment_name': '森岛帆高', 69 | 'comment_text': '初中时的哥哥买的一盒盒堆积成山周杰伦封面的磁带。\\n磁带放入收音机,磁带转动起,每一个停顿瞬间,每一首风格独特,每一句声声入耳。从懵懂到成熟,都是喜欢周杰伦的年轮。\\n在这磁带转动的年轮里,歌曲播放的列表里,生活情绪的喜悲里,一尘不变的还是周杰伦的歌。\\n他的歌包揽了我的一整个年少,度过了我的大半个学生时代,陪伴了我的无数个孤独夜晚。\\n百听不厌可能是对他的歌最完美的诠释吧。\\n周杰伦是我的信念,是我的力量。\\n如今哥哥也已经参加工作,我的学生时代也转瞬即逝。\\n对周杰伦的喜欢怕是听着他音乐的旋律融到了心里,伴随而去了吧。' 70 | }, { 71 | 'comment_name': '玫瑰少年', 72 | 'comment_text': '今年,杰伦40岁了,但我脑海中却还是那个充满个性,说着“可不可以多唱歌少讲话”的男孩子....《说好不哭》其实可能也是杰伦的心里话,可能以后出歌不会像以前那样快了,也在担心自己歌曲是否还能符合粉丝的要求,但说好不哭哦。\\n我相信对于很多粉丝来说,杰伦已经给予了一个完整的音乐海洋,无论伤心快乐,他好像总有为你匹配的歌曲,时时刻刻呵护着自己的情感,即使在各类歌曲百花齐放的今天,每一个夜晚依旧喜欢在他的歌声中入睡。\\n不会哭的,无论过去还是未来,我们还是愿意让你的歌声陪伴着我们,陪伴着整个青春,陪伴一辈子。' 73 | }, { 74 | 'comment_name': '蜗牛..', 75 | 'comment_text': '我想,你的青春里一定有一首歌是属于Jay的吧。躺在心里,偶而翻出,正好遇上那个情绪,便产生某些新的情感和新的认知。一场青春,一首周杰伦,沮丧时听听,不论是温柔的还是奋进的,总能唱到元气满满。好的音乐,是会说话的。\n仿佛每次在挣扎,迷茫和无助的时候,都能在他的音乐世界里慢慢治愈,而音乐和周杰伦本就是一次相互成全的相遇。\n青春难以留住,夏天已然散场。人生的道路也许各不相同,但只要在他需要我们的时候为他加油喝彩,就足够了,那才叫青春。\n不管什么时候,什么地点,我希望所有歌迷回过头来看到的还是同一个周杰伦。' 76 | }, { 77 | 'comment_name': '此用户已被封', 78 | 'comment_text': '周杰伦三个字是一代青春,还记得08年晚会上听的那首《青花瓷》顿时茅塞顿开,世间静会有如此好听的歌曲。之后迷上他的所有歌曲,自己买磁带上学的时候在宿舍晚上听,陪我渡过了5年学习生涯。这次我们的周青春出新歌,前些天一直在等待,熬夜等待但我觉得这些所有的等待都是值得的。' 79 | }, { 80 | 'comment_name': '黄子韬TAO', 81 | 'comment_text': '人气top周杰伦❗💎💖✨🌈\\n实力歌手周杰伦❗💎💖✨🌈\\n音乐鬼才周杰伦❗💎💖✨🌈\\n亚洲天王周杰伦❗💎💖✨🌈\\n家庭美满周杰伦❗💎💖✨🌈\\n魔术大师周杰伦❗️💎💖✨🌈\\n华语天王周杰伦❗️💎💖✨🌈\\n无与伦比周杰伦❗️💎💖✨🌈\\n奶茶仙子周杰伦❗️💎💖✨🌈\\n守护全世界最好的周杰伦💖💖💖' 
82 | }, { 83 | 'comment_name': '指法芬芳张大仙z', 84 | 'comment_text': '曾经傲娇的少年,\\n也已到了不惑之年,\\n从一个总是喜欢将脸庞隐藏在鸭舌帽下的腼腆男孩,\\n到如今撒娇卖萌幽默语句频出还时时撒狗粮的小公举,\\n新专辑如约而至,\\n能在这个时代遇到杰伦哥是人生最大幸运,\\n他的作品照亮你的路,陪伴你度过漫长的深夜,\\n你的支持和包容,也让他从默默无闻变成万众瞩目,\\n也许有一天 你忙于生活 他归于沉寂 渐渐不再会有交集\xa0,\\n希望在慢慢老去 还能回忆起青春时 曾为一个人疯狂追逐过,\\n他的名字就是\\n周!杰!伦!一生所爱 JAY\\n感谢你的音乐\\n让我有了一直学习吉他的动力,\\n感谢你的音乐\\n陪我度过的每一个夜晚,\\n感谢你的音乐\\n陪伴我在篮球场上的每一天,\\n青春有你,如此甚好,JAY !' 85 | }, { 86 | 'comment_name': '指法芬芳张大仙z', 87 | 'comment_text': '40岁的杰伦就坐在那里,\\n深情的目光望过去,\\n满眼都是自己当年22岁横空出世范特西少年身影......\\n作为华语乐坛最成功最具有影响力的歌手音乐人,\\n15座金曲奖、八届大中华区销量冠军、4届世界音乐奖WMA、全球25大创意人物、《Fast Company》全球百大创意人物,2010年歌曲下载量世界第三,历史上第一首好莱坞中文主题曲,亚洲天王,世界十大鬼才音乐人之一,\\n这就是周杰伦,是信仰,是天才,是一个时代,是传奇,是华人之光,是80后的回忆,90后的青春,他还见证着00后的成长;\\n感谢我伦,\\n你走过的轨迹是青春的记忆,\\n致敬,周杰伦!\\n永远的小公举,永远的热爱!' 88 | }, { 89 | 'comment_name': '指法芬芳张大仙z', 90 | 'comment_text': '难以忘记初次遇见你,是在《红尘客栈》的《大笨钟》下,你笑得《甜甜的》像是天边的一道《彩虹》色的《麦芽糖》;他们都说爱情来的太快像是《龙卷风》,你和我《一点点》靠近,相识,相恋;我经常会《安静》的看着你《傻笑》,表白《说好不哭》陪你到永远;在每个《晴天》里,我都会在学校的篮球场上《等你下课》;我的女孩,你就像是我《不能说的秘密》,我想给你《告白气球》般的浪漫,我想和你许下《蒲公英的约定》,然后牵起你的手,用吉他弹起那《手写的从前》,就这样一直牵着你去看《最长的电影》,在每个夜晚里去看最美的《星晴》,直到永远;我的女孩,你就像是海里的《美人鱼》公主,我愿做你的王子,一直爱你❤一直守护在你身边。😘' 91 | }, { 92 | 'comment_name': '黄子韬TAO', 93 | 'comment_text': '手牵手,一步两步三步四步望着天\\n看星星,一颗两颗三颗四颗连成线\\n周杰伦“说”三部曲将会在2019年9月16日正式完结\\n千万别错过,这一错就是一辈子!' 
94 | }, { 95 | 'comment_name': '\u2062', 96 | 'comment_text': '周杰伦\\n我QQ音乐里唯一的主角\\n你说要听妈妈的话喝爷爷泡的茶\\n你说最美的是与你躲过雨的屋檐下\\n我提一笔想用几行字形容你是我的谁\\n礼物用香榭的落叶就不会觉得有点难追\\n时间是解药也是我现在服下的这一剂毒药\\n我的十八年青春因为有你的出现而引以为傲\\n只要你还在为我们唱还在带着笑面对逆境环绕\\n风就不会把距离吹得好远我的青春也不会老\\n有人说你江郎才尽才华不复当年模样\\n那是他们嫉妒你绝世的巅峰和辉煌\\n今晚说好不哭\\n谢谢你在一直陪伴着我的青春\\n❤️❤️❤️' 97 | }, { 98 | 'comment_name': '指法芬芳张大仙z', 99 | 'comment_text': '青春不散!东方之殿开轩窗,绘梦之卷迷迭香,献世青花瓷世无双,情书却渐黄,手书兰亭序,孤饮女儿红,乱舞春秋夜未央,园游会下灿烂七里香,遥思娘子在西厢,抚断了的弦,奏夜的第七章,同一种调调徒感伤。枫叶落光,花海安静夜曲入乡,忍者星晴夜望,摩羯座非寻常,梯田闻稻香,甜甜的心雨洒晴天,将军屋顶发呆,懦夫将被淘汰,我不配周大侠龙战骑士那神态,没借口不退后,以父之名远走,拜别霍元甲,身披黄金甲,持双截棍耍帅,一路向北过千山万水漂移而来,赴千里之外菊花台, 龙卷风兼蓝色风暴雨,四面楚歌响起,止战之殇的记忆、逆鳞的轨迹,乘坐时光机,世界末日又回到过去,简单爱在爱的飞行日记里一直搁浅下去。' 100 | }], 101 | 'lyric': '说好不哭(with 五月天阿信) - 周杰伦 (Jay Chou)\\n词:方文山\\n曲:周杰伦\\n周杰伦:\\n没有了联络 后来的生活\\n我都是听别人说\\n说你怎么了 说你怎么过\\n放不下的人是我\\n人多的时候 就待在角落\\n就怕别人问起我\\n你们怎么了 你低着头\\n护着我连抱怨都没有\\n电话开始躲 从不对我说\\n不习惯一个人生活\\n离开我以后 要我好好过\\n怕打扰想自由的我\\n都这个时候 你还在意着\\n别人是怎么怎么看我的\\n拼命解释着 不是我的错 是你要走\\n眼看着你难过 挽留的话却没有说\\n你会微笑放手 说好不哭让我走\\n阿信:\\n电话开始躲 从不对我说\\n不习惯一个人生活\\n离开我以后 要我好好过\\n怕打扰想自由的我\\n都这个时候 你还在意着\\n别人是怎么怎么看我的\\n拼命解释着 不是我的错 是你要走\\n合:\\n眼看着你难过 挽留的话却没有说\\n你会微笑放手 说好不哭让我走\\n周杰伦:\\n你什么都没有 却还为我的梦加油\\n阿信:\\n心疼过了多久\\n周杰伦:\\n过了多久\\n合:\\n还在找理由等我' 102 | } 103 | ``` 104 | 105 | ## 爬虫的大致逻辑 106 | - 先爬取指定数量的歌手 107 | - 根据歌手的id,获取每一位歌手的歌曲列表(歌曲列表中包含歌曲的一些信息,但不包括歌词和评论信息) 108 | - 根据歌曲id,获取歌曲的歌词信息 109 | - 根据歌曲id,获取歌曲的评论信息 110 | - 将歌曲写入文件 111 | 112 | ## 语料分享 113 | **该资源仅用于学习交流,请勿用于商业用途,如有侵权,请联系删除。** 114 | 115 | |语料名称 | 语料地址 |语料描述| 116 | |---------|--------|--------| 117 | |49万港台内地的歌曲信息|[百度网盘【提取码:vyuz】](https://pan.baidu.com/s/18QmZQDGpzVX-1W51p-sqiQ?pwd=vyuz)|包含QQ音乐中排名前6400名的内地和港台歌手的49万+的歌曲信息| 118 | 119 | 120 | ## 解析QQ音乐的url 121 | QQ音乐的歌手和歌曲信息,都是使用js进行动态填充的,所以不可能通过爬取html网页,然后解析网页内容来获取歌曲信息。既然是通过js进行动态填充,那就需要对请求的url的格式进行解析 122 | 123 | ### 歌手列表url解析 124 | 打开QQ音乐的 [歌手页面](https://y.qq.com/portal/singer_list.html) ,用开发者工具查看,找到请求歌手列表的url如下: 125 | ``` 126 | 
https://u.y.qq.com/cgi-bin/musicu.fcg?-=getUCGI9574303950614538&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerList%22%3A%7B%22module%22%3A%22Music.SingerListServer%22%2C%22method%22%3A%22get_singer_list%22%2C%22param%22%3A%7B%22area%22%3A-100%2C%22sex%22%3A-100%2C%22genre%22%3A-100%2C%22index%22%3A-100%2C%22sin%22%3A0%2C%22cur_page%22%3A1%7D%7D%7D 127 | ``` 128 | 可以看到url中携带了很多参数,包括:g_tk、loginUin、hostUin、format、inCharset、outCharset、notice、platform、needNewCode、data 129 | 130 | 将该url放到postman中,尝试一个接一个地取消参数,找到有用的参数。最终可以知道,其实只有data这个参数对请求有实际的作用,把其他参数去掉,得到简化后的url如下: 131 | ``` 132 | https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerList%22%3A%7B%22module%22%3A%22Music.SingerListServer%22%2C%22method%22%3A%22get_singer_list%22%2C%22param%22%3A%7B%22area%22%3A-100%2C%22sex%22%3A-100%2C%22genre%22%3A-100%2C%22index%22%3A-100%2C%22sin%22%3A0%2C%22cur_page%22%3A1%7D%7D%7D 133 | ``` 134 | 结合歌手页面,仔细分析一下上述简化后的url,会发现data参数中隐含地携带了很多实际的请求参数: 135 | - area:歌手的地域(内地、港台、欧美等)。-100:全部、200:内地、2:港台、5:欧美、4:日本、3:韩国、6:其他 136 | - genre:歌手风格(流行、嘻哈等)。-100:全部、1:流行、6:嘻哈、2:摇滚、4:电子、3:民谣、8:R&B、10:民歌、9:轻音乐、5:爵士、14:古典、25:乡村、20:蓝调 137 | - cur_page:当前歌手列表的页码 138 | - index:cur_page*page_size(index表示当前页的起始index,page_size表示每一页歌手的数量) 139 | 140 | 141 | 使用控制变量法,固定area和genre变量,比较下列请求第一、二、三页歌手的url,可以发现其中index和cur_page中存在一些潜在规律 142 | 143 | 在下列三个url中(*是人为添加的,方便描述),index后面跟着的用**标记的数字就是变量index,cur_page后面用 144 | **标记的数字就是变量cur_page。在[歌手页面](https://y.qq.com/portal/singer_list.html),可以看到每一页有80个歌手。很显然,当要请求第n页歌手的时候,cur_page=n,index=80(n-1)。 145 | ``` 146 | 
https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerList%22%3A%7B%22module%22%3A%22Music.SingerListServer%22%2C%22method%22%3A%22get_singer_list%22%2C%22param%22%3A%7B%22area%22%3A-100%2C%22sex%22%3A-100%2C%22genre%22%3A-100%2C%22index%22%3A-100%2C%22sin%22%3A**0**%2C%22cur_page%22%3A**1**%7D%7D%7D 147 | 148 | https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerList%22%3A%7B%22module%22%3A%22Music.SingerListServer%22%2C%22method%22%3A%22get_singer_list%22%2C%22param%22%3A%7B%22area%22%3A-100%2C%22sex%22%3A-100%2C%22genre%22%3A-100%2C%22index%22%3A-100%2C%22sin%22%3A**80**%2C%22cur_page%22%3A**2**%7D%7D%7D 149 | 150 | https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerList%22%3A%7B%22module%22%3A%22Music.SingerListServer%22%2C%22method%22%3A%22get_singer_list%22%2C%22param%22%3A%7B%22area%22%3A-100%2C%22sex%22%3A-100%2C%22genre%22%3A-100%2C%22index%22%3A-100%2C%22sin%22%3A**160**%2C%22cur_page%22%3A**3**%7D%7D%7D 151 | ``` 152 | 通过以上分析,可以得到请求歌手列表的url格式如下: 153 | ``` 154 | singer_list_url = "https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerList%22%3A%7B%22module%22%3A%22Music.SingerListServer%22%2C%22method%22%3A%22get_singer_list%22%2C%22param%22%3A%7B%22area%22%3A{area}%2C%22sex%22%3A-100%2C%22genre%22%3A{genre}%2C%22index%22%3A-100%2C%22sin%22%3A{index}%2C%22cur_page%22%3A{cur_page}%7D%7D%7D" 155 | ``` 156 | 157 | ### 歌曲列表url解析 158 | 同理找到请求歌曲列表的url 159 | ``` 160 | 
https://u.y.qq.com/cgi-bin/musicu.fcg?-=getSingerSong8235365887193979&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerSongList%22%3A%7B%22method%22%3A%22GetSingerSongList%22%2C%22param%22%3A%7B%22order%22%3A1%2C%22singerMid%22%3A%22004Be55m1SJaLk%22%2C%22begin%22%3A0%2C%22num%22%3A10%7D%2C%22module%22%3A%22musichall.song_list_server%22%7D%7D 161 | ``` 162 | 过滤掉无用的参数,得到简化后的url: 163 | ``` 164 | https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerSongList%22%3A%7B%22method%22%3A%22GetSingerSongList%22%2C%22param%22%3A%7B%22order%22%3A1%2C%22singerMid%22%3A%22004Be55m1SJaLk%22%2C%22begin%22%3A0%2C%22num%22%3A10%7D%2C%22module%22%3A%22musichall.song_list_server%22%7D%7D 165 | ``` 166 | data中隐藏的参数: 167 | - singerMid:歌手的mid 168 | - num:相当于page_size,表示每一页歌曲的数量 169 | - begin:page*page_size(begin表示当前页的起始index) 170 | 171 | 通过以上分析,可以得到请求歌曲列表的url格式如下: 172 | ``` 173 | song_list_url = "https://u.y.qq.com/cgi-bin/musicu.fcg?data=%7B%22comm%22%3A%7B%22ct%22%3A24%2C%22cv%22%3A0%7D%2C%22singerSongList%22%3A%7B%22method%22%3A%22GetSingerSongList%22%2C%22param%22%3A%7B%22order%22%3A1%2C%22singerMid%22%3A%22{singer_mid}%22%2C%22begin%22%3A{begin}%2C%22num%22%3A{num}%7D%2C%22module%22%3A%22musichall.song_list_server%22%7D%7D" 174 | ``` 175 | 176 | ### 请求歌词url解析 177 | 通过请求歌曲列表的url发现,并没有返回歌曲的歌词信息,因此肯定是通过额外的url请求获得歌词,找到请求歌词的url如下: 178 | ``` 179 | https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?nobase64=1&musicid=105648715&-=jsonp1&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8¬ice=0&platform=yqq.json&needNewCode=0 180 | ``` 181 | 将以上url放到postman中发送请求,却得不到正确的回复,经过一番研究,发现该url需要加上referer这个header才可以正常运行 182 | ``` 183 | referer:https://y.qq.com/n/yqq/song/004RDW5Q2ol2jj.html 184 | ``` 185 | 请求歌词的url经过简化参数后得到: 186 | ``` 187 | 
https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?musicid=105648715&format=json 188 | ``` 189 | 容易看出 190 | - musicid:song_id 191 | - referer中的"004RDW5Q2ol2jj"表示song_mid 192 | 193 | 最终得到请求歌词的url如下,lyric_url需要带上referer这个header: 194 | ``` 195 | lyric_url = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?nobase64=1&musicid={song_id}&format=json" 196 | 197 | referer = "https://y.qq.com/n/yqq/song/{song_mid}.html" 198 | ``` 199 | ### 歌曲评论url解析 200 | 找到歌曲评论的url如下: 201 | ``` 202 | https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=GB2312&notice=0&platform=yqq.json&needNewCode=0&cid=205360772&reqtype=2&biztype=1&topid=105648715&cmd=8&needmusiccrit=0&pagenum=0&pagesize=25&lasthotcommentid=&domain=qq.com&ct=24&cv=10101010 203 | ``` 204 | 205 | 经过参数简化后,得到如下url: 206 | ``` 207 | https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?biztype=1&topid=105648715&cmd=8&pagenum=0&pagesize=25 208 | ``` 209 | 参数说明: 210 | - topid:歌曲的song_id 211 | - pagenum:"最新评论"的页数 212 | - pagesize:每页"最新评论"的评论数量 213 | 214 | 注意:此处的pagenum和pagesize影响的是"最新评论"的返回结果,而不影响"精彩评论",该url中没有参数可以控制"精彩评论"的返回结果 215 | 216 | 请求歌曲评论的url格式如下: 217 | ``` 218 | comment_url = 'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?biztype=1&topid={song_id}&cmd=8&pagenum={pagenum}&pagesize={pagesize}' 219 | ``` 220 | 221 | ### 歌曲url 222 | 歌曲的url格式如下: 223 | ``` 224 | https://y.qq.com/n/yqq/song/{song_mid}.html 225 | ``` 226 | ## 歌词解析 227 | 以周杰伦的[说好不哭(with 五月天阿信)](https://y.qq.com/n/yqq/song/001qvvgF38HVc4.html)为例 228 | 229 | 通过lyric_url请求获得的歌词格式如下,格式看起来还是比较杂乱的,包含各种字符 230 | ``` 231 | [ti:说好不哭(With 五月天阿信)] [ar:周杰伦] [al:说好不哭(With 五月天阿信)] 232 | [by:] [offset:0] [00:00.00]说好不哭(with 五月天阿信) - 周杰伦 (Jay Chou) 233 | [00:14.94]词:方文山 [00:19.09]曲:周杰伦 [00:23.24]周杰伦: 234 | [00:26.51]没有了联络 后来的生活 [00:29.76]我都是听别人说 [00:32.85]说你怎么了 说你怎么过 235 | [00:36.02]放不下的人是我 [00:39.20]人多的时候 就待在角落 [00:42.34]就怕别人问起我 236 | [00:45.37]你们怎么了 你低着头 [00:48.30]护着我连抱怨都没有 
[00:51.82]电话开始躲 从不对我说 237 | [00:54.98]不习惯一个人生活 [00:58.11]离开我以后 要我好好过 [01:01.32]怕打扰想自由的我 238 | [01:04.39]都这个时候 你还在意着 [01:07.56]别人是怎么怎么看我的 [01:10.77]拼命解释着 不是我的错 是你要走 239 | [01:15.55]眼看着你难过 挽留的话却没有说 [01:28.14]你会微笑放手 说好不哭让我走 [01:52.13]阿信: 240 | [01:54.95]电话开始躲 从不对我说 [01:58.17]不习惯一个人生活 [02:01.26]离开我以后 要我好好过 241 | [02:04.41]怕打扰想自由的我 [02:07.62]都这个时候 你还在意着 [02:10.62]别人是怎么怎么看我的 242 | [02:13.90]拼命解释着 不是我的错 是你要走 [02:18.51]合: [02:18.71]眼看着你难过 挽留的话却没有说 243 | [02:31.28]你会微笑放手 说好不哭让我走 [02:50.54]周杰伦: [02:53.38]你什么都没有 却还为我的梦加油 244 | [03:04.99]阿信: [03:05.92]心疼过了多久 [03:09.83]周杰伦: [03:10.02]过了多久 245 | [03:12.58]合: [03:12.77]还在找理由等我 246 | ``` 247 | 通过正则表达式得到比较工整的歌词(每句歌词之间用\\\\n间隔开): 248 | ``` 249 | 说好不哭(with 五月天阿信) - 周杰伦 (Jay Chou)\\n 250 | 词:方文山\\n 251 | 曲:周杰伦\\n 252 | 周杰伦:\\n 253 | 没有了联络 后来的生活\\n 254 | 我都是听别人说\\n 255 | 说你怎么了 说你怎么过\\n 256 | 放不下的人是我\\n 257 | 人多的时候 就待在角落\\n 258 | 就怕别人问起我\\n 259 | 你们怎么了 你低着头\\n 260 | 护着我连抱怨都没有\\n 261 | 电话开始躲 从不对我说\\n 262 | 不习惯一个人生活\\n 263 | 离开我以后 要我好好过\\n 264 | 怕打扰想自由的我\\n 265 | 都这个时候 你还在意着\\n 266 | 别人是怎么怎么看我的\\n 267 | 拼命解释着 不是我的错 是你要走\\n 268 | 眼看着你难过 挽留的话却没有说\\n 269 | 你会微笑放手 说好不哭让我走\\n 270 | 阿信:\\n 271 | 电话开始躲 从不对我说\\n 272 | 不习惯一个人生活\\n 273 | 离开我以后 要我好好过\\n 274 | 怕打扰想自由的我\\n 275 | 都这个时候 你还在意着\\n 276 | 别人是怎么怎么看我的\\n 277 | 拼命解释着 不是我的错 是你要走\\n 278 | 合:\\n 279 | 眼看着你难过 挽留的话却没有说\\n 280 | 你会微笑放手 说好不哭让我走\\n 281 | 周杰伦:\\n 282 | 你什么都没有 却还为我的梦加油\\n 283 | 阿信:\\n 284 | 心疼过了多久\\n 285 | 周杰伦:\\n 286 | 过了多久\\n 287 | 合:\\n 288 | 还在找理由等我 289 | ``` 290 | 291 | ## settings.py的参数说明 292 | - DOWNLOAD_DELAY:每个request请求的间隔时间 293 | - ROBOTSTXT_OBEY:是否遵守网站的爬虫协议 294 | - SINGER_PAGE_NUM:爬取的歌手页数 295 | - SINGER_PAGE_SIZE:每页歌手的数量 296 | - SONG_PAGE_NUM:每个歌手的歌曲爬取的页数 297 | - SONG_PAGE_SIZE:每个歌手每页歌曲的数量 298 | 299 | ## 避免爬虫被ban 300 | - 使用user agent池,轮流选择其中一个作为user agent 301 | - 设置下载延迟DOWNLOAD_DELAY,设为1或者更大。(一开始怕QQ音乐官方会有一些反爬虫的检测,于是把DOWNLOAD_DELAY设为1,也就是每隔一秒钟才发送一次请求,发觉爬取速度过于慢,又改成了0.1。最终索性设为0,发现原来QQ音乐对于歌词信息的url并没有反爬虫措施) 302 | - 使用IP池,动态变更IP地址 303 | 304 | user agent池如下: 305 | ``` 306 | 
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 307 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 308 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 309 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 310 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 311 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 312 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 313 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 314 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 315 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 316 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 317 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 318 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 319 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 320 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 321 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 322 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 323 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) 
Chrome/19.0.1055.1 Safari/535.24" 324 | ``` 325 | 326 | ## Future Work 327 | - 解析QQ音乐的歌曲文件的请求方式 328 | - 韩文和英文歌词中包含一些特殊的符号,部分特殊字符还没能较好地进行转义 329 | - 完善setting,使用户能更灵活地爬取不同地域、风格的歌曲 330 | 331 | ## 未解决问题 332 | 程序可以正常运行,但是当爬取结束时(日志信息写着'finish_reason':'finished',表明爬虫爬取任务完成),报了一个错,虽然不影响结果,但报错原因暂时不明 333 | ``` 334 | 2019-12-21 15:08:53 [scrapy.core.engine] INFO: Closing spider (finished) 335 | 2019-12-21 15:08:53 [scrapy.statscollectors] INFO: Dumping Scrapy stats: 336 | {'downloader/exception_count': 8, 337 | 'downloader/exception_type_count/twisted.internet.error.TimeoutError': 8, 338 | 'downloader/request_bytes': 682269284, 339 | 'downloader/request_count': 1067470, 340 | 'downloader/request_method_count/GET': 1067470, 341 | 'downloader/response_bytes': 1563129445, 342 | 'downloader/response_count': 1067462, 343 | 'downloader/response_status_count/200': 1067460, 344 | 'downloader/response_status_count/404': 2, 345 | 'dupefilter/filtered': 30476, 346 | 'finish_reason': 'finished', 347 | 'finish_time': datetime.datetime(2019, 12, 21, 7, 8, 53, 816618), 348 | 'item_scraped_count': 489860, 349 | 'log_count/DEBUG': 1557332, 350 | 'log_count/ERROR': 1, 351 | 'log_count/INFO': 229, 352 | 'request_depth_max': 3, 353 | 'response_received_count': 1067462, 354 | 'retry/count': 8, 355 | 'retry/reason_count/twisted.internet.error.TimeoutError': 8, 356 | 'scheduler/dequeued': 1067468, 357 | 'scheduler/dequeued/memory': 1067468, 358 | 'scheduler/enqueued': 1067468, 359 | 'scheduler/enqueued/memory': 1067468, 360 | 'start_time': datetime.datetime(2019, 12, 20, 14, 59, 43, 749919)} 361 | 2019-12-21 15:08:53 [scrapy.core.engine] INFO: Spider closed (finished) 362 | 2019-12-21 15:08:53 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method MemoryUsage.engine_stopped of <scrapy.extensions.memusage.MemoryUsage object at 0x...>> 364 | Traceback (most recent call last): 365 | File "c:\users\administrator\.conda\envs\ppy36\lib\site-packages\twisted\internet\defer.py", line 151, in maybeDeferred 366 | result = f(*args, **kw) 367 | File 
"c:\users\administrator\.conda\envs\ppy36\lib\site-packages\pydispatch\robustapply.py", line 54, in robustApply 368 | return receiver(*arguments, **named) 369 | File "c:\users\administrator\.conda\envs\ppy36\lib\site-packages\scrapy\extensions\memusage.py", line 70, in engine_stopped 370 | for tsk in self.tasks: 371 | AttributeError: 'MemoryUsage' object has no attribute 'tasks' 372 | 373 | ``` 374 | -------------------------------------------------------------------------------- /image/jay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yangjianxin1/QQMusicSpider/71617400ee3c4a4361c225917f18bfc8a1c32561/image/jay.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy==1.5.1 -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = QQMusicSpider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = QQMusicSpider 12 | --------------------------------------------------------------------------------