├── .gitignore
├── 58pbdn.py
├── CookiesOfZhihu
│   ├── README.rst
│   ├── config.ini
│   └── login.py
├── JDspider
│   ├── JDspider
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── SpiderofJD.py
│   │       └── __init__.py
│   └── scrapy.cfg
├── exercisebs4.py
├── pbdnof58
│   ├── .idea
│   │   ├── modules.xml
│   │   ├── pbdnof58.iml
│   │   └── workspace.xml
│   ├── README.rst
│   ├── pbdnof58
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── 58Urlspider.py
│   │       ├── 58spider-redis.py
│   │       └── __init__.py
│   └── scrapy.cfg
└── tayleswift.py

/.gitignore:
--------------------------------------------------------------------------------
.idea/
*.pyc
.DS_Store
--------------------------------------------------------------------------------
/58pbdn.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'qcl'

from bs4 import BeautifulSoup
import requests

ListOfUrl = []

def GetUrl(url):
    # Collect detail-page links from one listing page, skipping promoted rows
    # (rows whose 'tc' cell is not empty).
    web_info = requests.get(url).text
    soup = BeautifulSoup(web_info, 'html5lib')
    listofa = soup.select('#infolist > table > tbody > tr')
    for i in listofa:
        for j in i.find_all('td'):
            if j['class'] == ['tc']:
                if j.get_text() == '':
                    for k in i.find_all('td'):
                        if k['class'] == ['t']:
                            ListOfUrl.append(k.a['href'])

def GetAllInfo(start, end):
    for i in xrange(start, end + 1):
        url = 'http://bj.58.com/pbdn/0/pn' + str(i)
        GetUrl(url)
    for i in ListOfUrl:
        wb_info = requests.get(i).text
        soup = BeautifulSoup(wb_info, 'lxml')
        # titles = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.mainTitle > h1 ')
        title = soup.title.text
        # time = soup.select('#index_show > ul.mtit_con_left.fl > li.time')
        time = soup.select('.time')
        price = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li > div.su_con > span')
        quality = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(2) > div.su_con > span')
        area = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a')
        print title, time[0].get_text(), price[0].get_text(), quality[0].get_text().strip()
        if len(area) == 2:
            print area[0].get_text() + '-' + area[1].get_text()
        elif len(area) == 1:
            print area[0].get_text()
        print '----------------------------------------------------------------------------'
    print 'done', len(ListOfUrl)


GetAllInfo(1, 20)
--------------------------------------------------------------------------------
/CookiesOfZhihu/README.rst:
--------------------------------------------------------------------------------
Zhihu may change these cookie fields over time, so you have to update the
values in config.ini by hand. (A quick way to check whether the saved cookies
still work is sketched after login.py below.)
--------------------------------------------------------------------------------
/CookiesOfZhihu/config.ini:
--------------------------------------------------------------------------------
[info]
email = xxxxxxx
password = xxxx

[cookies]
l_n_c=1
q_c1=650ddc07bd754017820871e5188ac1db|1463311811000|1463311811000
_xsrf=a5b69090b2bdc67fd4987318958219e4
cap_id="MDVhMzIzZDQ1MWFhNDk3ZDkwMjU3MmNkNWI1MDUzMzk=|1463311811|595e134256102926f52ce541ee1b0b302e832e93"
l_cap_id="YmZkYWYxN2M1MjUwNDgwZmJiZDNkMWIxMzg2NzI0ZDc=|1463311811|639b2df5e9affa6aa901ca5e6f9354e185a32e3b"
d_c0="ABCAZD0f7QmPTtosO-72HbNdU6o9uCkyom8=|1463311812"
_zap=23eb1f89-b696-4775-92e0-11a289d24cc8
_za=d7b6603b-6391-435d-b83f-ae5eef570fda
__utmt=1
login="MjJjNjdhNTlmZGIzNGJmYzkzMmYwZGMzZmExYjdkZGQ=|1463311835|7820d87c772e131fd637936a0eaa93b76cc0a3db"
z_c0=Mi4wQUFBQVYyODFBQUFBRUlCa1BSX3RDUmNBQUFCaEFsVk44T3BmVndCZGlod2ktVVh2WDluM1oxa2xlb1FKMWt3R1FR|1463311856|b45e94045da643cff4775b4a8036070dcff8bfd4
__utma=51854390.635799030.1463311831.1463311831.1463311831.1
__utmb=51854390.4.10.1463311831
__utmc=51854390
__utmz=51854390.1463311831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)
__utmv=51854390.100-1|2=registration_date=20140820=1^3=entry_date=20140820=1
--------------------------------------------------------------------------------
/CookiesOfZhihu/login.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
'''
Crawler login example for zhihu.com: log in with email and password, and fall
back to the cookies stored in config.ini if the password login fails.
'''
import requests
import ConfigParser

def create_session():
    cf = ConfigParser.ConfigParser()
    cf.read('config.ini')
    cookies = cf.items('cookies')
    cookies = dict(cookies)
    print cookies
    email = cf.get('info', 'email')
    password = cf.get('info', 'password')

    session = requests.session()
    login_data = {'email': email, 'password': password}
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/'
    }
    r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header)
    print r.json()
    if r.json()['r'] == 1:
        print 'Login failed, falling back to cookie-based login...'
        has_cookies = False
        for key in cookies:
            if key != '__name__' and cookies[key] != '':
                has_cookies = True
                break
        if has_cookies is False:
            raise ValueError('Please fill in the [cookies] section of config.ini.')
        else:
            # r = requests.get('http://www.zhihu.com/login/email', cookies=cookies)  # requests-level alternative
            r = session.get('http://www.zhihu.com/login/email', cookies=cookies)  # log in with the saved cookies

            with open('login.html', 'w') as fp:
                fp.write(r.content)

    return session, cookies


if __name__ == '__main__':
    requests_session, requests_cookies = create_session()

    # url = 'http://www.zhihu.com/login/email'
    url = 'http://www.zhihu.com/topic/19552832'
    # content = requests_session.get(url).content                     # not logged in
    # content = requests.get(url, cookies=requests_cookies).content   # logged in
    content = requests_session.get(url, cookies=requests_cookies).content  # logged in
    with open('url.html', 'w') as fp:
        fp.write(content)
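
Since the cookie values above go stale (see CookiesOfZhihu/README.rst), it can save time to check them before crawling by requesting a page that needs a logged-in session. Below is a minimal sketch using the same ConfigParser/requests pattern as login.py; the /settings/profile URL and the status-code check are assumptions, not part of the project:

# -*- coding: utf-8 -*-
# Sketch: check whether the cookies in config.ini still give a logged-in session.
import requests
import ConfigParser

cf = ConfigParser.ConfigParser()
cf.read('config.ini')
cookies = dict(cf.items('cookies'))

# Zhihu redirects anonymous visitors away from the settings page, so a plain
# 200 response suggests the saved cookies are still valid (assumed behaviour).
r = requests.get('http://www.zhihu.com/settings/profile',
                 cookies=cookies, allow_redirects=False)
if r.status_code == 200:
    print 'cookies look valid'
else:
    print 'cookies expired, update the [cookies] section of config.ini'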
--------------------------------------------------------------------------------
/JDspider/JDspider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qcl643062/spider/d83ab1d597440bae27c90bf1436e620cfdd233f2/JDspider/JDspider/__init__.py
--------------------------------------------------------------------------------
/JDspider/JDspider/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class JdspiderItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    desc = scrapy.Field()
    price = scrapy.Field()
    haoping = scrapy.Field()    # number of positive reviews
    zhongping = scrapy.Field()  # number of neutral reviews
    chaping = scrapy.Field()    # number of negative reviews
    shaitu = scrapy.Field()     # number of reviews with photos


class JDspiderLoader(ItemLoader):
    default_item_class = JdspiderItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
--------------------------------------------------------------------------------
/JDspider/JDspider/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class JdspiderPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/JDspider/JDspider/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for JDspider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'JDspider'

SPIDER_MODULES = ['JDspider.spiders']
NEWSPIDER_MODULE = 'JDspider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'JDspider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'JDspider.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'JDspider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'JDspider.pipelines.JdspiderPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
/JDspider/JDspider/spiders/SpiderofJD.py:
--------------------------------------------------------------------------------
from scrapy_redis.spiders import RedisSpider
from JDspider.items import JDspiderLoader
from splinter import Browser
from scrapy import log


class Myspider(RedisSpider):
    '''Spider that reads JD product-page urls from the redis queue (jdspider_urls).'''
    name = 'jdspider'
    redis_key = 'jdspider_urls'

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(Myspider, self).__init__(*args, **kwargs)

    def parse(self, response):
        el = JDspiderLoader(response=response)
        el.add_xpath('title', '//*[@id="name"]/h1/text()')
        # The price is rendered by JavaScript, so read it with a real browser.
        with Browser() as browser:
            url = response.url
            browser.visit(url)
            price = browser.find_by_id('jd-price')
            if not price:
                price = browser.find_by_xpath('//*[@id="price"]/strong')
            # self.log(price[0].value, level=log.DEBUG)
            el.add_value('price', price[0].value[1:])
        # Review counts live on the club.jd.com review page for the same product id.
        with Browser() as browser:
            number = response.url.split('/')[-1].split('.')[0]
            url = 'http://club.jd.com/review/' + number + '-2-1.html'
            browser.visit(url)
            shaitu = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[5]/a/em')
            el.add_value('shaitu', shaitu[0].value[1:-1])
            haoping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[2]/a/em')
            el.add_value('haoping', haoping[0].value[1:-1])
            zhongping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[3]/a/em')
            el.add_value('zhongping', zhongping[0].value[1:-1])
            chaping = browser.find_by_xpath('//*[@id="comments-list"]/div[1]/ul/li[4]/a/em')
            el.add_value('chaping', chaping[0].value[1:-1])
        return el.load_item()
--------------------------------------------------------------------------------
/JDspider/JDspider/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/JDspider/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = JDspider.settings

[deploy]
#url = http://localhost:6800/
project = JDspider
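
The JDspider project has no README. Its spider reads product-page URLs from the jdspider_urls Redis list and drives a real browser through splinter, so a browser driver (for example Firefox) must be available before running scrapy crawl jdspider. One possible way to seed the queue is sketched below; the example SKU URL is an assumption only — any http://item.jd.com/<sku>.html page matches the URL parsing done in parse().

# -*- coding: utf-8 -*-
# Sketch: seed the Redis queue that SpiderofJD.py (redis_key = 'jdspider_urls') reads from.
from redis import Redis

# The product id in this URL is a placeholder; parse() derives the review page
# from it via 'http://club.jd.com/review/<id>-2-1.html'.
Redis().lpush('jdspider_urls', 'http://item.jd.com/1217500.html')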
--------------------------------------------------------------------------------
/exercisebs4.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'qcl'

# Scrapes xiaozhu.com short-term rental listings in Hangzhou for April 14-16.
# 300 listings are collected; if you end up with fewer than 300, remember to
# drop duplicates. Known issue: some hosts post the same flat several times,
# and no deduplication is done for that yet.


from bs4 import BeautifulSoup
import requests

ListOfUrls = []
page = 1
LenOfUrls = 0
while len(ListOfUrls) < 300:
    url = 'http://hz.xiaozhu.com/search-duanzufang-p' + str(page) + '-0/?startDate=2016-04-15&endDate=2016-04-16'
    print url
    duanzu = requests.get(url)
    soup = BeautifulSoup(duanzu.text, 'html5lib')
    urls = soup.select('#page_list > ul > li > a')
    try:
        for url1 in urls:
            ListOfUrls.append(url1['href'])
            if len(ListOfUrls) == 300:
                break
    except Exception:
        print url1, page
    finally:
        page += 1
for i in ListOfUrls:
    data = requests.get(i)
    soup = BeautifulSoup(data.text, 'lxml')
    title = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    address = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span.pr5')
    price = soup.select('#pricePart > div.day_l > span')
    PhotoOfHome = soup.select('#detailImageBox > div.pho_show_r > div > ul > li:nth-of-type(1) > img')
    NameOfHoster = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sex = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > div')
    PhotoOfHoster = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    print 'URL        : %s' % (i.encode('utf-8'))
    print 'Title      : %s' % (title[0].get_text().encode('utf-8'))
    print 'Address    : %s' % (address[0].get_text().encode('utf-8'))
    print 'Price      : %s' % (price[0].get_text().encode('utf-8'))
    print 'Room photo : %s' % (PhotoOfHome[0]['data-bigimg'].encode('utf-8'))
    print 'Host name  : %s' % (NameOfHoster[0].get_text().encode('utf-8'))
    print 'Host photo : %s' % (PhotoOfHoster[0]['src'].encode('utf-8'))
    if sex[0]['class'] == ['member_ico']:
        sex = 'male'
    else:
        sex = 'female'
    print 'Host sex   : %s' % (sex)
    print '_______________________________________________________________________________________________'
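
The header comment above notes that duplicate listings are not filtered out. Exact duplicates (the same detail URL collected twice across pages) can be avoided with a set, as in the sketch below; reposts of the same flat under a different URL, the harder case the comment mentions, would still get through.

# Sketch: collect detail-page URLs without exact duplicates (hypothetical
# replacement for the plain ListOfUrls list used above).
ListOfUrls = []
seen = set()

def add_listing_url(href):
    if href not in seen:
        seen.add(href)
        ListOfUrls.append(href)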
--------------------------------------------------------------------------------
/pbdnof58/.idea/modules.xml:
--------------------------------------------------------------------------------
(PyCharm project metadata; the XML content was not preserved in this dump.)
--------------------------------------------------------------------------------
/pbdnof58/.idea/pbdnof58.iml:
--------------------------------------------------------------------------------
(PyCharm project metadata; the XML content was not preserved in this dump.)
--------------------------------------------------------------------------------
/pbdnof58/.idea/workspace.xml:
--------------------------------------------------------------------------------
(PyCharm workspace metadata; the XML content was not preserved in this dump.)
--------------------------------------------------------------------------------
/pbdnof58/README.rst:
--------------------------------------------------------------------------------
A distributed crawler built on scrapy-redis, with the scraped data stored in MySQL. Redis and MySQL have to be installed and configured beforehand.

From the project directory, running scrapy crawl myspider_58page starts crawling the listing pages one after another until the last page is reached. The initial page URL has to be seeded in Redis first.

Running scrapy crawl myspider_58 then crawls the individual second-hand sellers' listings and stores them in the corresponding MySQL table.

Several instances of either spider can be run at the same time to crawl with multiple processes. (A setup sketch for the Redis queue and the MySQL table follows below.)
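
The README assumes a seeded Redis queue and an existing MySQL table. A minimal one-off setup sketch is shown here; the Redis key, seed-URL pattern, database settings, table name and column names are taken from 58Urlspider.py, 58pbdn.py, settings.py and pipelines.py, while the column types and sizes are assumptions.

# -*- coding: utf-8 -*-
# One-off setup sketch for the pbdnof58 project (not part of the original code).
from redis import Redis
import MySQLdb

# Seed the page spider: myspider_58page reads start URLs from this key.
Redis().lpush('myspider:58_urls', 'http://bj.58.com/pbdn/0/pn1')

# Create the table that MySQLPipeline inserts into (column sizes are guesses).
conn = MySQLdb.connect(host='localhost', user='root', passwd='', db='qcl', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS `58pbdndb` (
        title   VARCHAR(255),
        area    VARCHAR(64),
        price   VARCHAR(32),
        quality VARCHAR(64),
        time    VARCHAR(32)
    ) DEFAULT CHARSET = utf8
""")
conn.commit()
conn.close()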
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qcl643062/spider/d83ab1d597440bae27c90bf1436e620cfdd233f2/pbdnof58/pbdnof58/__init__.py
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class Pbdnof58Item(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    time = scrapy.Field()
    price = scrapy.Field()
    quality = scrapy.Field()
    area = scrapy.Field()
    UrlofPage = scrapy.Field()


class Pbdnof58Loader(ItemLoader):
    default_item_class = Pbdnof58Item
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors
from scrapy import log


class CleanPipeline(object):
    """Drop complete items that have already been seen."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if len(item) >= 5:
            # Items are dict-like but not hashable, so key on their sorted
            # (field, value) pairs.
            key = tuple(sorted(item.items()))
            if key in self.seen:
                raise DropItem("Duplicate item found: %s" % item)
            self.seen.add(key)
        return item


class MySQLPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self.__do__insert, item, spider)
        d.addBoth(lambda _: item)
        return d

    def __do__insert(self, conn, item, spider):
        try:
            conn.execute("""
                insert into 58pbdndb set title = %s, area = %s, price = %s, quality = %s, time = %s
            """, (item['title'], item['area'], item['price'], item['quality'], item['time']))
        except MySQLdb.Error, e:
            spider.log("Mysql Error %d: %s" % (e.args[0], e.args[1]), level=log.DEBUG)
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for pbdnof58 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'pbdnof58'

SPIDER_MODULES = ['pbdnof58.spiders']
NEWSPIDER_MODULE = 'pbdnof58.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'pbdnof58 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'pbdnof58.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'pbdnof58.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'pbdnof58.pipelines.CleanPipeline': 300,
    'pbdnof58.pipelines.MySQLPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# start MySQL database configure setting
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'qcl'
MYSQL_USER = 'root'
MYSQL_PASSWD = ''
# end of MySQL database configure setting
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/58Urlspider.py:
--------------------------------------------------------------------------------
from scrapy_redis.spiders import RedisSpider
from pbdnof58.items import Pbdnof58Loader
from redis import Redis
from scrapy import log
from time import sleep

class Myspider(RedisSpider):
    '''Spider that reads 58.com listing-page urls from the redis queue (myspider:58_urls).'''
    name = 'myspider_58page'
    redis_key = 'myspider:58_urls'

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(Myspider, self).__init__(*args, **kwargs)
        self.url = 'http://bj.58.com'

    def parse(self, response):
        el = Pbdnof58Loader(response=response)
        PageUrl = response.xpath('//a[contains(@class, "next")]/@href').extract()
        self.log(str(PageUrl), level=log.DEBUG)
        r = Redis()
        if PageUrl:
            # Queue the next listing page for this spider and record it on the item.
            r.lpush('myspider:58_urls', self.url + PageUrl[0])
            sleep(1)
            el.add_value('UrlofPage', self.url + PageUrl[0])
        # Queue every individual (non-dealer) listing for myspider_58.
        urls = response.xpath('//table[contains(@class, "tbimg")]/tr')
        for url in urls:
            url = url.xpath('td[contains(@class, "t")]/a/@href').extract()
            if len(url) == 1 and 'zhuan' not in url[0]:
                r.lpush('myspider:start_urls', url[0])
        return el.load_item()
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/58spider-redis.py:
--------------------------------------------------------------------------------
from scrapy_redis.spiders import RedisSpider
from pbdnof58.items import Pbdnof58Loader
# from scrapy import log


class Myspider(RedisSpider):
    '''Spider that reads listing urls from the redis queue (myspider:start_urls).'''
    name = 'myspider_58'
    redis_key = 'myspider:start_urls'

    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(Myspider, self).__init__(*args, **kwargs)

    def parse(self, response):
        el = Pbdnof58Loader(response=response)
        el.add_xpath('title', '//h1/text()')
        el.add_xpath('price', '//span[contains(@class, "price c_f50")]/text()')
        quality = response.xpath('//ul[contains(@class, "suUl")]/li')
        quality = quality[1].xpath('div[contains(@class, "su_con")]/span/text()').extract()[0].strip()
        el.add_value('quality', quality)
        area = response.xpath('//span[contains(@class, "c_25d")]/a/text()')
        if not area:
            area = 'None'
        elif len(area) == 1:
            area = area[0].extract()
        else:
            area = area[0].extract() + '-' + area[1].extract()
        el.add_value('area', area)
        el.add_xpath('time', '//li[contains(@class, "time")]/text()')
        return el.load_item()
--------------------------------------------------------------------------------
/pbdnof58/pbdnof58/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/pbdnof58/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = pbdnof58.settings

[deploy]
#url = http://localhost:6800/
project = pbdnof58
--------------------------------------------------------------------------------
/tayleswift.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
__author__ = 'qcl'

from urllib import urlretrieve
from bs4 import BeautifulSoup
import requests

listofpic = []

def getpiclist(url):
    wb_data = requests.get(url).text
    soup = BeautifulSoup(wb_data, 'html5lib')
    urls = soup.select('#main-container > div > div.grid-thumb.grid-responsive > div > div > div > a > img')
    for img in urls:
        listofpic.append(img['src'])

def getpicture(first, last):
    for i in xrange(first, last + 1):
        url = 'http://weheartit.com/inspirations/taylorswift?page=' + str(i)
        getpiclist(url)
    for i in xrange(len(listofpic)):
        postfix = listofpic[i].split('.')[-1]
        # The target directory E:\tayleswift must already exist.
        local = 'E:\\tayleswift\\tayleswift' + str(i) + '.' + postfix
        urlretrieve(listofpic[i], local)
    print 'done'

getpicture(1, 20)
--------------------------------------------------------------------------------