├── MusicSpider
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── music.py
│   │   └── person.py
│   └── validate
│       ├── __init__.py
│       └── validate.py
├── README.md
└── scrapy.cfg
/MusicSpider/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MusicSpider/validate/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MusicSpider (NetEase Cloud Music crawler)
2 | ### Environment
3 | scrapy + redis + mongodb, distributed
--------------------------------------------------------------------------------
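
Note: both spiders below subclass scrapy_redis's RedisSpider and declare redis_key queues ('music:urls' and 'person:urls'), the scrapy-redis convention for feeding start URLs out of Redis; both also override start_requests, so seeding is optional here. A minimal seeding sketch, assuming a local Redis and the redis-py client (not part of this repo):

    import redis

    # settings.py actually points REDIS_HOST at a remote box; localhost is an assumption
    r = redis.StrictRedis(host='localhost', port=6379)
    r.lpush('music:urls', 'http://music.163.com/discover/playlist')
    r.lpush('person:urls', 'http://music.163.com/user/home?id=1')
    # then start a worker on each machine:
    #   scrapy crawl music
    #   scrapy crawl person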
/MusicSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = MusicSpider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = MusicSpider
12 |
--------------------------------------------------------------------------------
/MusicSpider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class MusicspiderItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
16 | class typeItem(scrapy.Item):
17 |     # one playlist category (tag) and its listing URL
18 |     _id = scrapy.Field()
19 |     type = scrapy.Field()
20 |     url = scrapy.Field()
21 |
22 | class playListItem(scrapy.Item):
23 |     # one playlist: play/collection counts, creator, and tags
24 |     _id = scrapy.Field()
25 |     list_id = scrapy.Field()
26 |     list_name = scrapy.Field()
27 |     list_play = scrapy.Field()
28 |     # list_comment = scrapy.Field()
29 |     list_collection = scrapy.Field()
30 |     list_creator = scrapy.Field()
31 |     list_creator_id = scrapy.Field()
32 |     list_tag = scrapy.Field()
33 |     type = scrapy.Field()
34 |
35 | class detailItem(scrapy.Item):
36 |     # one song plus its hot comments
37 |     _id = scrapy.Field()
38 |     music_id = scrapy.Field()
39 |     music_name = scrapy.Field()
40 |     music_album = scrapy.Field()
41 |     music_artist = scrapy.Field()
42 |     music_comment_num = scrapy.Field()
43 |     music_comment = scrapy.Field()
44 |
45 | class personItem(scrapy.Item):
46 |     # one user profile: fan/follow counts, age, region, and activity
47 |     _id = scrapy.Field()
48 |     person_id = scrapy.Field()
49 |     person_name = scrapy.Field()
50 |     person_fan = scrapy.Field()
51 |     person_follow = scrapy.Field()
52 |     person_music_play = scrapy.Field()
53 |     person_age = scrapy.Field()
54 |     person_address = scrapy.Field()
55 |     person_event = scrapy.Field()
56 |
--------------------------------------------------------------------------------
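
A minimal sketch of how the spiders fill these items (values hypothetical); note that dict(item) in the pipeline only contains the fields a spider actually set:

    from MusicSpider.items import playListItem

    item = playListItem()
    item['list_id'] = '24381616'        # hypothetical playlist id
    item['list_name'] = 'Example playlist'
    item['type'] = 'Pop'                # category tag from the discover page
    dict(item)                          # -> only the three fields set above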
/MusicSpider/validate/validate.py:
--------------------------------------------------------------------------------
1 | # Replicates the client-side encryption NetEase's weapi endpoints expect:
2 | # the JSON payload is AES-CBC encrypted twice (first with a fixed nonce,
3 | # then with a random 16-byte key), and the random key is RSA-encrypted
4 | # with NetEase's public key. Python 2 code, like the rest of the project.
5 | import requests
6 | import os
7 | import json
8 | from Crypto.Cipher import AES
9 | import base64
10 |
11 |
12 | class Validate:
13 |     def __init__(self, Id):
14 |         self.id = Id
15 |
16 |     def get_music_json(self):
17 |         # raw JSON comment payload for this song id
18 |         return fetch_comments(self.id)
19 |
20 |
21 | def aesEncrypt(text, secKey):
22 |     # pad to a 16-byte block (PKCS#7 style), AES-CBC encrypt, base64 encode
23 |     pad = 16 - len(text) % 16
24 |     text = text + pad * chr(pad)
25 |     encryptor = AES.new(secKey, AES.MODE_CBC, '0102030405060708')
26 |     return base64.b64encode(encryptor.encrypt(text))
27 |
28 |
29 | def createSecretKey(size):
30 |     # random hex string truncated to 16 characters
31 |     return (''.join(map(lambda xx: (hex(ord(xx))[2:]), os.urandom(size))))[0:16]
32 |
33 |
34 | def rsaEncrypt(text, pubKey, modulus):
35 |     # textbook RSA (no padding), mirroring NetEase's JS client;
36 |     # three-argument pow() does the modular exponentiation efficiently
37 |     text = text[::-1]
38 |     rs = pow(int(text.encode('hex'), 16), int(pubKey, 16), int(modulus, 16))
39 |     return format(rs, 'x').zfill(256)
40 |
41 |
42 | def fetch_comments(music_id):
43 |     url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + music_id + '/?csrf_token='
44 |     headers = {'Cookie': 'appver=1.5.0.75771', 'Referer': 'http://music.163.com/'}
45 |     text = {'username': '13308172964', 'password': 'liubolun', 'rememberLogin': 'true'}
46 |     modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
47 |     nonce = '0CoJUm6Qyw8W8jud'
48 |     pubKey = '010001'
49 |     text = json.dumps(text)
50 |     secKey = createSecretKey(16)
51 |     encText = aesEncrypt(aesEncrypt(text, nonce), secKey)
52 |     encSecKey = rsaEncrypt(secKey, pubKey, modulus)
53 |     data = {'params': encText, 'encSecKey': encSecKey}
54 |     req = requests.post(url, headers=headers, data=data)
55 |     return req.text
56 |
--------------------------------------------------------------------------------
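
A minimal usage sketch for the helper above (the song id is hypothetical):

    from MusicSpider.validate.validate import Validate

    v = Validate('186016')     # hypothetical NetEase song id
    print v.get_music_json()   # raw JSON string with 'total' and 'hotComments'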
/MusicSpider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class MusicspiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/MusicSpider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Item pipeline: routes each item type to its own MongoDB collection,
4 | # inserting new documents and overwriting existing ones, matched on the
5 | # item's natural key (type / list_id / music_id / person_id).
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | from scrapy.conf import settings
8 | from MusicSpider.items import typeItem, playListItem, detailItem, personItem
9 | from pymongo import MongoClient
10 |
11 |
12 | class MusicspiderPipeline(object):
13 |
14 |     def __init__(self):
15 |         host = settings['MONGODB_HOST']
16 |         port = settings['MONGODB_PORT']
17 |         db_name = settings['MONGODB_DB']
18 |         client = MongoClient(host, port)
19 |         self.tdb = client[db_name]  # database name comes from MONGODB_DB in settings.py
20 |
21 |     def process_item(self, item, spider):
22 |         if isinstance(item, typeItem):
23 |             try:
24 |                 type_info = dict(item)
25 |                 post = self.tdb['demo']
26 |                 if not post.find_one({'type': item['type']}):
27 |                     post.insert(type_info)
28 |             except Exception:
29 |                 spider.logger.error('failed to store type item')
30 |         elif isinstance(item, playListItem):
31 |             try:
32 |                 list_info = dict(item)
33 |                 post = self.tdb['play_list']
34 |                 if post.find_one({'list_id': item['list_id']}):
35 |                     post.update({'list_id': item['list_id']}, list_info)
36 |                 else:
37 |                     post.insert(list_info)
38 |             except Exception:
39 |                 pass
40 |         elif isinstance(item, detailItem):
41 |             try:
42 |                 music_info = dict(item)
43 |                 post = self.tdb['music_detail']
44 |                 if post.find_one({'music_id': item['music_id']}):
45 |                     post.update({'music_id': item['music_id']}, music_info)
46 |                 else:
47 |                     post.insert(music_info)
48 |             except Exception:
49 |                 pass
50 |         elif isinstance(item, personItem):
51 |             try:
52 |                 person_info = dict(item)
53 |                 post = self.tdb['person']
54 |                 if post.find_one({'person_id': item['person_id']}):
55 |                     post.update({'person_id': item['person_id']}, person_info)
56 |                 else:
57 |                     post.insert(person_info)
58 |             except Exception:
59 |                 pass
60 |         return item
61 |
--------------------------------------------------------------------------------
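
The find_one-then-insert/update pattern above takes two round trips and can race when several distributed workers store the same key; pymongo 3's update_one with upsert=True does the same thing atomically. A sketch (collection and values hypothetical):

    from pymongo import MongoClient

    post = MongoClient('localhost', 27017)['music']['play_list']
    post.update_one({'list_id': '24381616'},                      # hypothetical id
                    {'$set': {'list_name': 'Example playlist'}},
                    upsert=True)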
/MusicSpider/spiders/music.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import scrapy
3 | import demjson
4 | from ..items import playListItem, detailItem
5 | from scrapy.selector import Selector
6 | from scrapy_redis.spiders import RedisSpider
7 | #from ..validate import validate
8 |
9 | class MusicSpider(RedisSpider):
10 |     name = 'music'
11 |     start_url = 'http://music.163.com/discover/playlist'
12 |     redis_key = 'music:urls'
13 |     allowed_domains = ["music.163.com"]
14 |
15 |     def start_requests(self):
16 |         yield scrapy.Request(url=self.start_url, method='GET', callback=self.parse)
17 |
18 |     def parse(self, response):
19 |         # collect the playlist category tags from the discover page
20 |         body = response.body
21 |         type_list = Selector(text=body).xpath("//a[@class='s-fc1 ']/text()").extract()
22 |         url = 'http://music.163.com/discover/playlist/?cat='
23 |         for tmp in type_list:
24 |             try:
25 |                 yield scrapy.Request(url=url + tmp, method="GET",
26 |                                      callback=self.list_parse, meta={"cat": tmp})
27 |             except Exception:
28 |                 pass
29 |
30 |     def test_parse(self, response):
31 |         print response.body
32 |
33 |     def list_parse(self, response):
34 |         selector = Selector(text=response.body)
35 |         play_lists = selector.xpath("//li//a[@class='msk']/@title")
36 |         urls = selector.xpath("//a[@class='zpgi']/@href").extract()
37 |         start_url = "http://music.163.com"
38 |         # follow the pagination links for this category
39 |         for tmp_url in urls:
40 |             yield scrapy.Request(url=start_url + tmp_url, method="GET", callback=self.list_parse,
41 |                                  meta={"cat": response.meta['cat']})
42 |         # one request per playlist on this page
43 |         for i, tmp in enumerate(play_lists, start=1):
44 |             list_id = selector.xpath("//li[" + str(i)
45 |                                      + "]//a[@class='icon-play f-fr']/@data-res-id").extract_first()
46 |             yield scrapy.Request(url=start_url + "/playlist?id=" + list_id, method="GET",
47 |                                  callback=self.play_list_parse,
48 |                                  meta={"cat": response.meta['cat'], "id": list_id})
49 |
50 |     def play_list_parse(self, response):
51 |         start_url = "http://music.163.com"
52 |         item = playListItem()
53 |         selector = Selector(text=response.body)
54 |         item['list_play'] = int(selector.xpath("//strong[@id='play-count']/text()").extract_first())
55 |         item['list_collection'] = int(selector.xpath("//a[@class='u-btni u-btni-fav ']/@data-count").extract_first())
56 |         # item['list_comment'] = int(selector.xpath("//span[@id='cnt_comment_count']/text()").extract_first())
57 |         item['list_name'] = selector.xpath("//h2[@class='f-ff2 f-brk']/text()").extract_first()
58 |         item['list_id'] = response.meta['id']
59 |         item['list_tag'] = selector.xpath("//a[@class='u-tag']/i/text()").extract()
60 |         item['list_creator'] = selector.xpath("//span[@class='name']/a/text()").extract_first()
61 |         item['list_creator_id'] = selector.xpath("//span[@class='name']/a/@href").extract_first()
62 |         item['type'] = response.meta['cat']
63 |         # urls = selector.xpath("//ul[@class='f-hide']/li/a/@href").extract()
64 |         # for url in urls:
65 |         #     yield scrapy.Request(url=start_url + url, method="GET", callback=self.detail_parse)
66 |         yield item
67 |
68 |     # def detail_parse(self, response):
69 |     #     selector = Selector(text=response.body)
70 |     #     id = selector.xpath("//div[@id='content-operation']/@data-rid").extract_first()
71 |     #     detail = validate.Validate(str(id))
72 |     #     info = demjson.decode(detail.get_music_json())
73 |     #     if info['total'] > 10000:
74 |     #         item = detailItem()
75 |     #         item['music_id'] = id
76 |     #         item['music_name'] = selector.xpath("//em[@class='f-ff2']/text()").extract_first()
77 |     #         item['music_album'] = selector.xpath("//p[@class='des s-fc4']/a/text()").extract_first()
78 |     #         item['music_artist'] = selector.xpath("//p[@class='des s-fc4']/span/@title").extract_first()
79 |     #         item['music_comment_num'] = int(info['total'])
80 |     #         item['music_comment'] = info['hotComments']
81 |     #         yield item
82 |
--------------------------------------------------------------------------------
/MusicSpider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for MusicSpider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'MusicSpider'
13 |
14 | SPIDER_MODULES = ['MusicSpider.spiders']
15 | NEWSPIDER_MODULE = 'MusicSpider.spiders'
16 |
17 | MONGODB_HOST = '121.42.205.238'
18 | # MONGODB_HOST = 'localhost'
19 | MONGODB_PORT = 27017
20 | MONGODB_DB = 'music'
21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
22 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
23 | 'Chrome/56.0.2924.87 Safari/537.36'
24 |
25 | # Obey robots.txt rules
26 | ROBOTSTXT_OBEY = False
27 |
28 |
29 | # Concurrency settings
30 | CONCURRENT_ITEMS = 100
31 | CONCURRENT_REQUESTS = 16
32 |
33 | ITEM_PIPELINES = {
34 | 'MusicSpider.pipelines.MusicspiderPipeline': 300,
35 | }
36 |
37 | SCHEDULER = 'scrapy_redis.scheduler.Scheduler'  # scrapy_redis scheduler (shared Redis request queue)
38 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # Redis-based request dedup
39 | # SCHEDULER_PERSIST = True  # keep the Redis queue between runs
40 | SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO request queue
41 |
42 | REDIS_HOST = '121.42.205.238'
43 | # REDIS_HOST = 'localhost'
44 | REDIS_PORT = 6379
45 |
46 |
47 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
48 | #CONCURRENT_REQUESTS = 32
49 |
50 | # Configure a delay for requests for the same website (default: 0)
51 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
52 | # See also autothrottle settings and docs
53 | #DOWNLOAD_DELAY = 3
54 | # The download delay setting will honor only one of:
55 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
56 | #CONCURRENT_REQUESTS_PER_IP = 16
57 |
58 | # Disable cookies (enabled by default)
59 | #COOKIES_ENABLED = False
60 |
61 | # Disable Telnet Console (enabled by default)
62 | #TELNETCONSOLE_ENABLED = False
63 |
64 | # Override the default request headers:
65 | # DEFAULT_REQUEST_HEADERS = {
66 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
67 | # 'Accept-Language': 'en',
68 | # }
69 |
70 | # Enable or disable spider middlewares
71 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
72 | #SPIDER_MIDDLEWARES = {
73 | # 'MusicSpider.middlewares.MusicspiderSpiderMiddleware': 543,
74 | #}
75 |
76 | # Enable or disable downloader middlewares
77 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
78 | #DOWNLOADER_MIDDLEWARES = {
79 | # 'MusicSpider.middlewares.MyCustomDownloaderMiddleware': 543,
80 | #}
81 |
82 | # Enable or disable extensions
83 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
84 | #EXTENSIONS = {
85 | # 'scrapy.extensions.telnet.TelnetConsole': None,
86 | #}
87 |
88 | # Configure item pipelines
89 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
90 | #ITEM_PIPELINES = {
91 | # 'MusicSpider.pipelines.MusicspiderPipeline': 300,
92 | #}
93 |
94 | # Enable and configure the AutoThrottle extension (disabled by default)
95 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
96 | #AUTOTHROTTLE_ENABLED = True
97 | # The initial download delay
98 | #AUTOTHROTTLE_START_DELAY = 5
99 | # The maximum download delay to be set in case of high latencies
100 | #AUTOTHROTTLE_MAX_DELAY = 60
101 | # The average number of requests Scrapy should be sending in parallel to
102 | # each remote server
103 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
104 | # Enable showing throttling stats for every response received:
105 | #AUTOTHROTTLE_DEBUG = False
106 |
107 | # Enable and configure HTTP caching (disabled by default)
108 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
109 | #HTTPCACHE_ENABLED = True
110 | #HTTPCACHE_EXPIRATION_SECS = 0
111 | #HTTPCACHE_DIR = 'httpcache'
112 | #HTTPCACHE_IGNORE_HTTP_CODES = []
113 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
114 |
--------------------------------------------------------------------------------
/MusicSpider/spiders/person.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | import sys
3 | # Python 2 hack: make utf-8 the default codec for implicit str/unicode conversion
4 | default_encoding = 'utf-8'
5 | if sys.getdefaultencoding() != default_encoding:
6 |     reload(sys)
7 |     sys.setdefaultencoding(default_encoding)
8 | import re
9 | import time
10 | import random
11 | from ..items import personItem
12 | from scrapy_redis.spiders import RedisSpider
13 | from scrapy import Request
14 | from scrapy.selector import Selector
15 |
16 | class PersonSpider(RedisSpider):
17 |     name = "person"
18 |     redis_key = 'person:urls'
19 |
20 |     def start_requests(self):
21 |         # endless stream of random profile ids; parse() walks each id downwards,
22 |         # so every request here seeds a descending scan of user pages
23 |         while True:
24 |             person_id = random.randint(0, 1000000000)
25 |             yield Request(url="http://music.163.com/user/home?id=" + str(person_id),
26 |                           callback=self.parse, meta={"id": person_id})
27 |             # fixed seed ids can be yielded instead, e.g.:
28 |             # yield Request(url="http://music.163.com/user/home?id=1000000",
29 |             #               callback=self.parse, meta={"id": 1000000})
30 |
31 |     def parse(self, response):
32 |         selector = Selector(text=response.body)
33 |         address = selector.xpath("//div[@class='inf s-fc3']/span[1]/text()").extract_first()
34 |         age = selector.xpath("//span[@id='age']/@data-age").extract_first()
35 |         fans = selector.xpath("//strong[@id='fan_count']/text()").extract_first()
36 |         follow = selector.xpath("//strong[@id='follow_count']/text()").extract_first()
37 |         event = selector.xpath("//strong[@id='event_count']/text()").extract_first()
38 |         count = selector.xpath("//h4/text()").extract_first()
39 |         name = selector.xpath("//span[@class='tit f-ff2 s-fc0 f-thide']/text()").extract_first()
40 |
41 |         # keep the scan moving whether or not this profile exists
42 |         next_id = response.meta["id"] - 1
43 |         yield Request(url="http://music.163.com/user/home?id=" + str(next_id),
44 |                       callback=self.parse, meta={"id": next_id})
45 |         if name is not None:
46 |             item = personItem()
47 |             item['person_name'] = name
48 |             if age is not None:
49 |                 # data-age is a birthday timestamp in milliseconds
50 |                 age = time.gmtime(int(age) / 1000)
51 |                 item['person_age'] = time.strftime("%Y-%m-%d %H:%M:%S", age)
52 |             if address is not None:
53 |                 address = address.replace(" ", "")
54 |                 item['person_address'] = address.split(":")[1].split("-")
55 |             if count is not None:
56 |                 item['person_music_play'] = int(re.sub(r'\D', '', count))
57 |             item['person_follow'] = int(follow)
58 |             item['person_fan'] = int(fans)
59 |             item['person_event'] = int(event)
60 |             item['person_id'] = response.meta["id"]
61 |             yield item
62 |
--------------------------------------------------------------------------------
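
A quick check of the birthday conversion in parse() above (the data-age value is hypothetical):

    import time

    age_ms = 662688000000  # hypothetical data-age attribute: milliseconds since the epoch
    print time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(age_ms / 1000))
    # -> 1991-01-01 00:00:00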