├── MusicSpider
│   ├── __init__.py
│   ├── validate
│   │   ├── __init__.py
│   │   ├── __init__.pyc
│   │   ├── validate.pyc
│   │   └── validate.py
│   ├── items.pyc
│   ├── __init__.pyc
│   ├── settings.pyc
│   ├── pipelines.pyc
│   ├── spiders
│   │   ├── music.pyc
│   │   ├── person.pyc
│   │   ├── __init__.pyc
│   │   ├── __init__.py
│   │   ├── music.py
│   │   └── person.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   └── settings.py
├── README.md
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   ├── MusicSpider.iml
│   ├── mongoSettings.xml
│   └── workspace.xml
└── scrapy.cfg

--------------------------------------------------------------------------------
/MusicSpider/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/MusicSpider/validate/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MusicSpider (NetEase Cloud Music crawler)
### Environment
scrapy + redis + mongodb, distributed
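
The README stops at the stack, so here is a minimal launch sketch. It assumes a locally reachable Redis and scrapy-redis installed; the queue keys 'music:urls' and 'person:urls' come from the spiders' redis_key attributes below, and the seed URL is the one music.py already uses. The host and port are assumptions, not part of this repo.

# Seed the scrapy-redis queue, then run `scrapy crawl music` (and
# `scrapy crawl person`) on each worker machine.
import redis

r = redis.Redis(host='localhost', port=6379)  # assumption: local Redis
r.lpush('music:urls', 'http://music.163.com/discover/playlist')
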
--------------------------------------------------------------------------------
/MusicSpider/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/items.pyc
--------------------------------------------------------------------------------
/MusicSpider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/__init__.pyc
--------------------------------------------------------------------------------
/MusicSpider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/settings.pyc
--------------------------------------------------------------------------------
/MusicSpider/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/pipelines.pyc
--------------------------------------------------------------------------------
/MusicSpider/spiders/music.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/spiders/music.pyc
--------------------------------------------------------------------------------
/MusicSpider/spiders/person.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/spiders/person.pyc
--------------------------------------------------------------------------------
/MusicSpider/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/spiders/__init__.pyc
--------------------------------------------------------------------------------
/MusicSpider/validate/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/validate/__init__.pyc
--------------------------------------------------------------------------------
/MusicSpider/validate/validate.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UaHaLiubolun/MusicSpider/HEAD/MusicSpider/validate/validate.pyc
--------------------------------------------------------------------------------
/MusicSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = MusicSpider.settings

[deploy]
#url = http://localhost:6800/
project = MusicSpider
--------------------------------------------------------------------------------
/.idea/MusicSpider.iml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/.idea/mongoSettings.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)
--------------------------------------------------------------------------------
/MusicSpider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MusicspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class typeItem(scrapy.Item):
    _id = scrapy.Field()
    type = scrapy.Field()
    url = scrapy.Field()


class playListItem(scrapy.Item):
    _id = scrapy.Field()
    list_id = scrapy.Field()
    list_name = scrapy.Field()
    list_play = scrapy.Field()
    # list_comment = scrapy.Field()
    list_collection = scrapy.Field()
    list_creator = scrapy.Field()
    list_creator_id = scrapy.Field()
    list_tag = scrapy.Field()
    type = scrapy.Field()


class detailItem(scrapy.Item):
    _id = scrapy.Field()
    music_id = scrapy.Field()
    music_name = scrapy.Field()
    music_album = scrapy.Field()
    music_artist = scrapy.Field()
    music_comment_num = scrapy.Field()
    music_comment = scrapy.Field()


class personItem(scrapy.Item):
    _id = scrapy.Field()
    person_id = scrapy.Field()
    person_name = scrapy.Field()
    person_fan = scrapy.Field()
    person_follow = scrapy.Field()
    person_music_play = scrapy.Field()
    person_age = scrapy.Field()
    person_address = scrapy.Field()
    person_event = scrapy.Field()
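
For orientation, a hypothetical example of how one of these items is filled. The field names come from items.py above; every value is invented.

# Hypothetical example of populating playListItem; values are invented.
from MusicSpider.items import playListItem

item = playListItem()
item['list_id'] = '12345'          # playlist id from the page URL
item['list_name'] = 'sample list'  # playlist title
item['list_play'] = 1000           # play count
item['type'] = 'rock'              # category the list was found under
print(dict(item))                  # scrapy Items behave like dicts
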
--------------------------------------------------------------------------------
/MusicSpider/validate/validate.py:
--------------------------------------------------------------------------------
import requests
import os
import json
from Crypto.Cipher import AES
import base64


class Validate:
    """Fetches the comment JSON for a song via NetEase's encrypted weapi."""

    def __init__(self, Id):
        self.id = Id

    def get_music_json(self):
        return test(self.id)


def aesEncrypt(text, secKey):
    # Pad the plaintext to a multiple of 16 bytes, AES-CBC encrypt it with
    # a fixed IV, and base64-encode the ciphertext.
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    encryptor = AES.new(secKey, AES.MODE_CBC, '0102030405060708')
    ciphertext = encryptor.encrypt(text)
    ciphertext = base64.b64encode(ciphertext)
    return ciphertext


def createSecretKey(size):
    # Random 16-character hex string used as the per-request AES key.
    return (''.join(map(lambda xx: (hex(ord(xx))[2:]), os.urandom(size))))[0:16]


def rsaEncrypt(text, pubKey, modulus):
    # Textbook RSA on the reversed key string (Python 2 only: str.encode('hex')).
    text = text[::-1]
    rs = int(text.encode('hex'), 16) ** int(pubKey, 16) % int(modulus, 16)
    return format(rs, 'x').zfill(256)


def test(id):
    # POST to the comments endpoint: the payload is AES-encrypted twice
    # (fixed nonce, then the random key), and the random key itself is
    # RSA-encrypted and sent alongside.
    url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + id + '/?csrf_token='
    headers = {'Cookie': 'appver=1.5.0.75771', 'Referer': 'http://music.163.com/'}
    # Request payload (credentials hard-coded as in the original source).
    text = {'username': '13308172964', 'password': 'liubolun', 'rememberLogin': 'true'}
    modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
    nonce = '0CoJUm6Qyw8W8jud'
    pubKey = '010001'
    text = json.dumps(text)
    secKey = createSecretKey(16)
    encText = aesEncrypt(aesEncrypt(text, nonce), secKey)
    encSecKey = rsaEncrypt(secKey, pubKey, modulus)
    data = {'params': encText, 'encSecKey': encSecKey}
    req = requests.post(url, headers=headers, data=data)
    return req.text
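
A minimal usage sketch of the class above. The song id is invented, and demjson.decode mirrors how the commented-out detail_parse in music.py consumes this result.

# Hypothetical usage of Validate; '186016' is an invented song id.
import demjson
from MusicSpider.validate.validate import Validate

info = demjson.decode(Validate('186016').get_music_json())
print info['total']   # total number of comments (Python 2 print, as in this repo)
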
--------------------------------------------------------------------------------
/MusicSpider/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class MusicspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/MusicSpider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.conf import settings  # pre-1.x Scrapy settings access
from items import typeItem, playListItem, detailItem, personItem
from pymongo import MongoClient


class MusicspiderPipeline(object):

    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DB']
        client = MongoClient(host, port)
        self.tdb = client[db_name]  # was hard-coded to "music"; use the setting
        # self.post = tdb['demoOne']

    def process_item(self, item, spider):
        if isinstance(item, typeItem):
            try:
                type_info = dict(item)
                post = self.tdb['demo']
                if post.find_one({'type': item['type']}):
                    pass
                else:
                    if post.insert(type_info):
                        print 'inserted type %s' % item['type']
            except Exception:
                print 'failed to insert type item'
        elif isinstance(item, playListItem):
            try:
                list_info = dict(item)
                post = self.tdb['play_list']
                list_id = post.find_one({'list_id': item['list_id']})
                if list_id:
                    post.update({"list_id": list_id['list_id']}, list_info)
                else:
                    post.insert(list_info)
            except Exception:
                pass
        elif isinstance(item, detailItem):
            try:
                music_info = dict(item)
                post = self.tdb['music_detail']
                music_id = post.find_one({'music_id': item['music_id']})
                if music_id:
                    post.update({"music_id": music_id['music_id']}, music_info)
                else:
                    post.insert(music_info)
            except Exception:
                pass
        elif isinstance(item, personItem):
            try:
                person_info = dict(item)
                post = self.tdb['person']
                person_id = post.find_one({'person_id': item['person_id']})
                if person_id:
                    try:
                        post.update({"person_id": person_id['person_id']}, person_info)
                    except Exception:
                        pass
                else:
                    post.insert(person_info)
            except Exception:
                pass
        return item
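
The find_one/insert/update pattern above is a manual upsert against the legacy pymongo 2.x API. On pymongo 3+ the same effect is a single call; this is a sketch of the equivalent, not how the repo does it, and the host, port, and values are assumptions.

# Equivalent upsert with the pymongo 3+ API (sketch only).
from pymongo import MongoClient

client = MongoClient('localhost', 27017)                  # host/port assumed
collection = client['music']['play_list']                 # names from pipelines.py
list_info = {'list_id': '12345', 'list_name': 'sample'}   # invented values
collection.update_one({'list_id': list_info['list_id']},
                      {'$set': list_info}, upsert=True)   # insert or replace in one call
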
--------------------------------------------------------------------------------
/MusicSpider/spiders/music.py:
--------------------------------------------------------------------------------
#coding=utf-8
import scrapy
import demjson  # used by the commented-out detail_parse below
from ..items import playListItem, detailItem
from scrapy.selector import Selector
from scrapy_redis.spiders import RedisSpider
#from ..validate import validate


class MusicSpider(RedisSpider):
    name = 'music'
    start_urls = 'http://music.163.com/discover/playlist'  # single seed URL (a string, not Scrapy's usual list)
    redis_key = 'music:urls'
    allowed_domains = ["music.163.com"]

    def start_requests(self):
        yield scrapy.Request(url=self.start_urls, method='GET', callback=self.parse)

    def parse(self, response):
        # Collect the playlist category names and request each category page.
        body = response.body
        type_list = Selector(text=body).xpath("//a[@class='s-fc1 ']/text()").extract()
        url = 'http://music.163.com/discover/playlist/?cat='
        for tmp in type_list:
            try:
                true_url = url + tmp
                yield scrapy.Request(url=true_url, method="GET",
                                     callback=self.list_parse, meta={"cat": tmp})
            except Exception:
                pass

    def test_parse(self, response):
        print response.body

    def list_parse(self, response):
        selector = Selector(text=response.body)
        titles = selector.xpath("//li//a[@class='msk']/@title")
        urls = selector.xpath("//a[@class='zpgi']/@href").extract()
        start_url = "http://music.163.com"
        # Follow the pagination links back into list_parse.
        for tmp_url in urls:
            yield scrapy.Request(url=start_url + tmp_url, method="GET", callback=self.list_parse,
                                 meta={"cat": response.meta['cat']})
        i = 1
        for tmp in titles:
            list_id = selector.xpath("//li[" + str(i)
                                     + "]//a[@class='icon-play f-fr']/@data-res-id").extract_first()
            i = i + 1
            # playlist detail page
            yield scrapy.Request(url=start_url + "/playlist?id=" + list_id, method="GET", callback=self.play_list_parse,
                                 meta={"cat": response.meta['cat'], "id": list_id})

    def play_list_parse(self, response):
        start_url = "http://music.163.com"
        item = playListItem()
        selector = Selector(text=response.body)
        item['list_play'] = int(selector.xpath("//strong[@id='play-count']/text()").extract_first())
        item['list_collection'] = int(selector.xpath("//a[@class='u-btni u-btni-fav ']/@data-count").extract_first())
        # item['list_comment'] = int(selector.xpath("//span[@id='cnt_comment_count']/text()").extract_first())
        item['list_name'] = selector.xpath("//h2[@class='f-ff2 f-brk']/text()").extract_first()
        item['list_id'] = response.meta['id']
        item['list_tag'] = selector.xpath("//a[@class='u-tag']/i/text()").extract()
        item['list_creator'] = selector.xpath("//span[@class='name']/a/text()").extract_first()
        item['list_creator_id'] = selector.xpath("//span[@class='name']/a/@href").extract_first()
        item['type'] = response.meta['cat']
        # urls = selector.xpath("//ul[@class='f-hide']/li/a/@href").extract()
        # for url in urls:
        #     yield scrapy.Request(url=start_url + url, method="GET", callback=self.detail_parse)
        yield item

    # def detail_parse(self, response):
    #     selector = Selector(text=response.body)
    #     id = selector.xpath("//div[@id='content-operation']/@data-rid").extract_first()
    #     detail = validate.Validate(str(id))
    #     info = demjson.decode(detail.get_music_json())
    #     if info['total'] > 10000:
    #         item = detailItem()
    #         item['music_id'] = id
    #         item['music_name'] = selector.xpath("//em[@class='f-ff2']/text()").extract_first()
    #         item['music_album'] = selector.xpath("//p[@class='des s-fc4']/a/text()").extract_first()
    #         item['music_artist'] = selector.xpath("//p[@class='des s-fc4']/span/@title").extract_first()
    #         item['music_comment_num'] = int(info['total'])
    #         item['music_comment'] = info['hotComments']
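#         yield item

The per-item loop in list_parse above pairs each title with a positional XPath lookup (//li[i]). A quick standalone check of that pattern; the HTML snippet is an invented sample, not page markup from the site.

# Standalone check of positional XPath indexing (invented sample HTML).
from scrapy.selector import Selector

html = "<ul><li><a class='msk' title='a'></a></li><li><a class='msk' title='b'></a></li></ul>"
sel = Selector(text=html)
for i in range(1, len(sel.xpath("//a[@class='msk']")) + 1):
    print sel.xpath("//li[" + str(i) + "]//a[@class='msk']/@title").extract_first()
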
--------------------------------------------------------------------------------
/MusicSpider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for MusicSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'MusicSpider'

SPIDER_MODULES = ['MusicSpider.spiders']
NEWSPIDER_MODULE = 'MusicSpider.spiders'

MONGODB_HOST = '121.42.205.238'
# MONGODB_HOST = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'music'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
             'Chrome/56.0.2924.87 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


# Concurrency settings
CONCURRENT_ITEMS = 100
CONCURRENT_REQUESTS = 16

ITEM_PIPELINES = {
    'MusicSpider.pipelines.MusicspiderPipeline': 300,
}

SCHEDULER = 'scrapy_redis.scheduler.Scheduler'  # scrapy_redis scheduler
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # request dedup
# SCHEDULER_PERSIST = True  # don't clear the Redis queue between runs
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # queue type

REDIS_HOST = '121.42.205.238'
# REDIS_HOST = 'localhost'
REDIS_PORT = 6379


# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'MusicSpider.middlewares.MusicspiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'MusicSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'MusicSpider.pipelines.MusicspiderPipeline': 300,
#}
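
These settings are read by the pipeline through scrapy.conf; outside a running crawl they can be loaded the standard Scrapy way, as in this sketch (run from the project directory so scrapy.cfg is found).

# Load project settings outside a crawl (sketch).
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print settings['MONGODB_HOST'], settings['MONGODB_PORT']
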
--------------------------------------------------------------------------------
/MusicSpider/spiders/person.py:
--------------------------------------------------------------------------------
#coding=utf-8
import sys
# Python 2 encoding workaround so UTF-8 strings don't raise UnicodeDecodeError.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
import re
import time
import random
from ..items import personItem
from scrapy_redis.spiders import RedisSpider
from scrapy import Request
from scrapy.selector import Selector


class PersonSpider(RedisSpider):
    name = "person"
    redis_key = 'person:urls'

    def start_requests(self):
        # Endlessly probe random user ids; parse() also walks ids downward.
        while True:
            person_id = random.randint(0, 1000000000)
            yield Request(url="http://music.163.com/user/home?id=" + str(person_id), callback=self.parse, meta={"id": person_id})
        # yield Request(url="http://music.163.com/user/home?id=1", callback=self.parse, meta={"id": 1})
        # yield Request(url="http://music.163.com/user/home?id=1000000", callback=self.parse, meta={"id": 1000000})
        # yield Request(url="http://music.163.com/user/home?id=5000000", callback=self.parse, meta={"id": 5000000})
        # yield Request(url="http://music.163.com/user/home?id=10000000", callback=self.parse, meta={"id": 10000000})
        # yield Request(url="http://music.163.com/user/home?id=15000000", callback=self.parse, meta={"id": 15000000})
        # yield Request(url="http://music.163.com/user/home?id=20000000", callback=self.parse, meta={"id": 20000000})
        # yield Request(url="http://music.163.com/user/home?id=25000000", callback=self.parse, meta={"id": 25000000})
        # yield Request(url="http://music.163.com/user/home?id=30000000", callback=self.parse, meta={"id": 30000000})
        # yield Request(url="http://music.163.com/user/home?id=40000000", callback=self.parse, meta={"id": 40000000})
        # yield Request(url="http://music.163.com/user/home?id=50000000", callback=self.parse, meta={"id": 50000000})
        # yield Request(url="http://music.163.com/user/home?id=60000000", callback=self.parse, meta={"id": 60000000})
        # yield Request(url="http://music.163.com/user/home?id=70000000", callback=self.parse, meta={"id": 70000000})
        # yield Request(url="http://music.163.com/user/home?id=80000000", callback=self.parse, meta={"id": 80000000})
        # yield Request(url="http://music.163.com/user/home?id=90000000", callback=self.parse, meta={"id": 90000000})
meta={"id": 90000000}) 37 | # yield Request(url="http://music.163.com/user/home?id=100000000", callback=self.parse, meta={"id": 100000000}) 38 | # yield Request(url="http://music.163.com/user/home?id=150000000", callback=self.parse, meta={"id": 150000000}) 39 | # yield Request(url="http://music.163.com/user/home?id=200000000", callback=self.parse, meta={"id": 200000000}) 40 | # yield Request(url="http://music.163.com/user/home?id=250000000", callback=self.parse, meta={"id": 250000000}) 41 | # yield Request(url="http://music.163.com/user/home?id=300000000", callback=self.parse, meta={"id": 300000000}) 42 | # yield Request(url="http://music.163.com/user/home?id=350000000", callback=self.parse, meta={"id": 350000000}) 43 | # yield Request(url="http://music.163.com/user/home?id=400000000", callback=self.parse, meta={"id": 400000000}) 44 | # yield Request(url="http://music.163.com/user/home?id=450000000", callback=self.parse, meta={"id": 450000000}) 45 | # yield Request(url="http://music.163.com/user/home?id=500000000", callback=self.parse, meta={"id": 500000000}) 46 | # yield Request(url="http://music.163.com/user/home?id=550000000", callback=self.parse, meta={"id": 550000000}) 47 | # yield Request(url="http://music.163.com/user/home?id=600000000", callback=self.parse, meta={"id": 600000000}) 48 | # yield Request(url="http://music.163.com/user/home?id=650000000", callback=self.parse, meta={"id": 650000000}) 49 | # yield Request(url="http://music.163.com/user/home?id=700000000", callback=self.parse, meta={"id": 700000000}) 50 | # yield Request(url="http://music.163.com/user/home?id=750000000", callback=self.parse, meta={"id": 750000000}) 51 | # yield Request(url="http://music.163.com/user/home?id=800000000", callback=self.parse, meta={"id": 800000000}) 52 | # yield Request(url="http://music.163.com/user/home?id=850000000", callback=self.parse, meta={"id": 850000000}) 53 | # yield Request(url="http://music.163.com/user/home?id=900000000", callback=self.parse, meta={"id": 900000000}) 54 | # yield Request(url="http://music.163.com/user/home?id=950000000", callback=self.parse, meta={"id": 950000000}) 55 | # yield Request(url="http://music.163.com/user/home?id=1000000000", callback=self.parse, meta={"id": 1000000000}) 56 | 57 | 58 | 59 | def parse(self, response): 60 | selector = Selector(text=response.body) 61 | address = selector.xpath("//div[@class='inf s-fc3']/span[1]/text()").extract_first() 62 | age = selector.xpath("//span[@id='age']/@data-age").extract_first() 63 | fans = selector.xpath("//strong[@id='fan_count']/text()").extract_first() 64 | follow = selector.xpath("//strong[@id='follow_count']/text()").extract_first() 65 | event = selector.xpath("//strong[@id='event_count']/text()").extract_first() 66 | count = selector.xpath("//h4/text()").extract_first() 67 | name = selector.xpath("//span[@class='tit f-ff2 s-fc0 f-thide']/text()").extract_first() 68 | 69 | id = response.meta["id"] - 1 70 | yield Request(url="http://music.163.com/user/home?id=" + str(id), callback=self.parse, meta={"id": id}) 71 | if name != None: 72 | id = response.meta["id"] 73 | item = personItem() 74 | item['person_name'] = name 75 | if age != None: 76 | age = int(age) / 1000 77 | age = time.gmtime(int(age)) 78 | age = time.strftime("%Y-%m-%d %H:%M:%S", age) 79 | item['person_age'] = age 80 | if address != None: 81 | address = address.replace(" ", "") 82 | item['person_address'] = address.split(":")[1].split("-") 83 | if count != None: 84 | music_count = re.sub('\D', '', count) 85 | item['person_music_play'] = 
            item['person_follow'] = int(follow)
            item['person_fan'] = int(fans)
            item['person_event'] = int(event)
            item['person_id'] = id
            yield item
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
(IDE workspace file; XML content not preserved in this dump)
--------------------------------------------------------------------------------
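
Once the crawl has run, results land in the `music` database, in the collections `demo`, `play_list`, `music_detail`, and `person` (per pipelines.py). A quick inspection sketch; the host and port are assumptions (substitute MONGODB_HOST/MONGODB_PORT from settings.py).

# Inspect crawl results; collection names come from pipelines.py.
from pymongo import MongoClient

db = MongoClient('localhost', 27017)['music']
print db['play_list'].count()                 # number of playlists stored
for doc in db['person'].find().limit(3):      # sample a few user profiles
    print doc.get('person_name'), doc.get('person_fan')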