├── taobao
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── taobao_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── middlewares.py
│   └── settings.py
├── .gitignore
├── scrapy.cfg
└── README.md

/taobao/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.html

*.pyc
--------------------------------------------------------------------------------
/taobao/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = taobao.settings

[deploy]
#url = http://localhost:6800/
project = taobao
--------------------------------------------------------------------------------
/taobao/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TaobaoPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/taobao/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TaobaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrapy-taobao
Simulated Taobao login with Scrapy. No proxy IP handling is included yet; suggestions for a good proxy setup are welcome.

# Make sure Scrapy is installed
self.http_user = 'xxxxxxxx' # taobao username
self.http_pass = 'xxxxxxxx' # taobao password
Remember to change the username and password in taobao_spider.py.

# Run command
scrapy crawl taobao
If the login requires a captcha, the captcha image URL is opened automatically so the user can type it in by hand; if the input is wrong, the captcha image is opened again for another try.

# Message on a successful login
login-success, get user nick: ["user nick"]
Seeing this line means the login succeeded and other data can then be extracted.
--------------------------------------------------------------------------------
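The README above asks for a good way to handle proxy IPs. One possible direction, sketched below, is a downloader middleware that rotates through a small proxy pool. This is only a sketch: PROXY_LIST and both proxy addresses are made-up placeholders, and the class would be enabled through DOWNLOADER_MIDDLEWARES in the same way as the bundled ProxyMiddleware that follows.

# Sketch only: a rotating-proxy downloader middleware. PROXY_LIST and the
# addresses in it are placeholders, not part of this project.
import random
import base64

PROXY_LIST = [
    {'addr': 'http://203.0.113.10:8080'},                                    # open proxy
    {'addr': 'http://203.0.113.11:3128', 'user_pass': 'USERNAME:PASSWORD'},  # authenticated proxy
]


class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        # pick a different proxy for each request
        proxy = random.choice(PROXY_LIST)
        request.meta['proxy'] = proxy['addr']
        if 'user_pass' in proxy:
            # b64encode (not encodestring) so no trailing newline lands in the header
            encoded = base64.b64encode(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded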
/taobao/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

__author__ = 'Administrator'

import base64


class ProxyMiddleware(object):
    # override process_request so every request goes through the proxy
    def process_request(self, request, spider):
        # Set the location of the proxy (ip:port)
        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"

        # Use the following lines if your proxy requires authentication.
        # b64encode is used rather than encodestring, which would append a
        # trailing newline to the header value.
        # proxy_user_pass = "USERNAME:PASSWORD"
        # # set up basic authentication for the proxy
        # encoded_user_pass = base64.b64encode(proxy_user_pass)
        # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
--------------------------------------------------------------------------------
/taobao/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for taobao project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'taobao'

SPIDER_MODULES = ['taobao.spiders']
NEWSPIDER_MODULE = 'taobao.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'taobao (+http://www.yourdomain.com)'

# Uncomment to enable the proxy middleware defined in taobao/middlewares.py
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
#     'taobao.middlewares.ProxyMiddleware': 100,
# }
--------------------------------------------------------------------------------
/taobao/spiders/taobao_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

__author__ = 'Administrator'

from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request, FormRequest
import re
import urllib2
import webbrowser


class TaobaoSpider(CrawlSpider):

    name = 'taobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://login.taobao.com/member/login.jhtml']

    def __init__(self, *args, **kwargs):
        super(TaobaoSpider, self).__init__(*args, **kwargs)
        self.http_user = 'xxxxxxxxx'  # taobao username
        self.http_pass = 'xxxxxxxxx'  # taobao password
        # login form
        self.formdata = {
            'TPL_checkcode': '',
            'TPL_username': self.http_user,
            'TPL_password': self.http_pass,
        }
        self.headers = {
            'Host': 'login.taobao.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
            'Referer': 'https://login.taobao.com/member/login.jhtml',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Connection': 'Keep-Alive'
        }
        self.id = 0

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield FormRequest(url, meta={'cookiejar': i},
                              formdata=self.formdata,
                              headers=self.headers,
                              callback=self.login)  # jump to the login page

    def _log_page(self, response, filename):
        with open(filename, 'w') as f:
            try:
                f.write("%s\n%s\n%s\n" % (response.url, response.headers, response.body))
            except Exception:
                f.write("%s\n%s\n" % (response.url, response.headers))

    def login(self, response):
        self._log_page(response, 'taobao_login.html')
        return [FormRequest.from_response(response,
                                          formdata=self.formdata,
                                          headers=self.headers,
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          callback=self.parse_item)]

    def parse_item(self, response):
        self._log_page(response, 'get_checkcode.html')
        hxs = Selector(response)
        checkcode_err = hxs.xpath('//div[@id="J_Message"]/p/text()').extract()
        # "For the safety of your account, please enter the captcha."
        if checkcode_err and checkcode_err[0] == u"为了您的账户安全,请输入验证码。":
            checkcode_url = hxs.xpath('//img[@id="J_StandardCode_m"]/@data-src').extract()
            if checkcode_url:
                webbrowser.open_new_tab(checkcode_url[0])
                # ask the user to type in the captcha
                checkcode = raw_input(u'input checkcode:')
                # put the captcha into the form data before posting again
                print checkcode
                self.formdata['TPL_checkcode'] = checkcode
                return [FormRequest.from_response(response,
                                                  formdata=self.formdata,
                                                  headers=self.headers,
                                                  meta={'cookiejar': response.meta['cookiejar']},
                                                  callback=self.get_J_HToken)]

    def get_J_HToken(self, response):
        self._log_page(response, 'get_J_HToken.html')

        hxs = Selector(response)
        J_HToken_data = hxs.xpath('//input[@id="J_HToken"]/@value').extract()
        if not J_HToken_data:
            # No token found -- most likely the captcha was wrong, so ask again.
            checkcode_err = hxs.xpath('//div[@id="J_Message"]/p/text()').extract()
            if checkcode_err:
                print checkcode_err[0]
            # "Captcha incorrect, please enter it again."
            if checkcode_err and checkcode_err[0] == u"验证码错误,请重新输入。":
                checkcode_url = hxs.xpath('//img[@id="J_StandardCode_m"]/@data-src').extract()
                if checkcode_url:
                    webbrowser.open_new_tab(checkcode_url[0])
                    # ask the user to type in the captcha again
                    checkcode = raw_input(u'input checkcode:')
                    # put the new captcha into the form data
                    print checkcode
                    self.formdata['TPL_checkcode'] = checkcode
                    return [FormRequest.from_response(response,
                                                      formdata=self.formdata,
                                                      headers=self.headers,
                                                      meta={'cookiejar': response.meta['cookiejar']},
                                                      callback=self.get_J_HToken)]
        else:
            # Mind the encoding when building this URL: J_HToken_data is unicode.
            get_st_url = u'https://passport.alipay.com/mini_apply_st.js?site=0&token=%s&callback=stCallback6' % J_HToken_data[0]
            request = urllib2.Request(get_st_url)
            response = urllib2.urlopen(request)
            self._log_page(response, 'get_st.html')
            pattern = re.compile('{"st":"(.*?)"}', re.S)
            result = re.search(pattern, response.read())
            # if the st token was found in the response
            if result:
                print u"got the st token"
                # extract the st value
                st = result.group(1)

                stURL = 'https://login.taobao.com/member/vst.htm?st=%s&TPL_username=%s&callback=jsonp75' % (st, self.http_user)
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
                    'Host': 'login.taobao.com',
                    'Connection': 'Keep-Alive'
                }

                return Request(stURL,
                               headers=headers,
                               callback=self.get_check_login_success)

    def get_check_login_success(self, response):
        # Login succeeded; order data and other information can be fetched from here on. To be extended later.
        self._log_page(response, 'get_check.html')
        # After a successful login Taobao redirects to the "My Taobao" page; grab the redirect URL.
        pattern = re.compile(u'"url":"(.*?)"', re.S)
        match = re.search(pattern, response.body)
        next_url = match.group(1)
        return Request(next_url,
                       callback=self.get_next_data)

    def get_next_data(self, response):
        self._log_page(response, 'get_next.html')
        try:
            hxs = Selector(response)
            nick = hxs.xpath('//em[@class="s-name"]/a/text()').extract()
            print "login-success, get user nick:", nick
        except Exception:
            print u'Check whether get_next.html really is the personal Taobao page and adjust the code if not.'
        return None
--------------------------------------------------------------------------------
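get_check_login_success notes that order data and other information could be collected once the login succeeds, but the project currently stops at printing the user nick, with items.py and pipelines.py left as stubs. Below is a minimal sketch of how get_next_data could hand structured data to the pipeline instead; the nick field on TaobaoItem is a hypothetical name, the s-name selector is the one the spider already uses, and ITEM_PIPELINES would also need to be enabled in settings.py.

# items.py -- sketch: give TaobaoItem a field for data the spider already finds
import scrapy


class TaobaoItem(scrapy.Item):
    nick = scrapy.Field()  # hypothetical field for the logged-in user's nick


# taobao_spider.py -- sketch: yield an item instead of only printing
# (requires: from taobao.items import TaobaoItem)
def get_next_data(self, response):
    self._log_page(response, 'get_next.html')
    hxs = Selector(response)
    nick = hxs.xpath('//em[@class="s-name"]/a/text()').extract()
    if nick:
        print "login-success, get user nick:", nick
        item = TaobaoItem()
        item['nick'] = nick[0]
        yield item  # delivered to TaobaoPipeline once ITEM_PIPELINES is enabled
    else:
        print u'Check get_next.html to see whether it really is the personal Taobao page'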