├── taobao
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── taobao_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── middlewares.py
│   └── settings.py
├── .gitignore
├── scrapy.cfg
└── README.md

/taobao/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.html

*.pyc
--------------------------------------------------------------------------------
/taobao/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = taobao.settings

[deploy]
#url = http://localhost:6800/
project = taobao
--------------------------------------------------------------------------------
/taobao/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TaobaoPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/taobao/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TaobaoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# scrapy-taobao
Simulated Taobao login with Scrapy. No proxy IP handling is included yet; suggestions for a good proxy setup are welcome.

# Make sure Scrapy is installed
self.http_user = 'xxxxxxxx' # taobao username
self.http_pass = 'xxxxxxxx' # taobao password
Remember to change the username and password in taobao_spider.py.

# Run command
scrapy crawl taobao
If the login requires a captcha, the captcha image URL is opened automatically so the user can type it in by hand; if the input is wrong, the captcha image is opened again for another try.

# Message on a successful login
login-success, get user nick: ["user nick"]
Seeing this line means the login succeeded and other data can then be extracted.
--------------------------------------------------------------------------------
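The README above asks for a good way to handle proxy IPs. One possible direction, sketched below, is a downloader middleware that rotates through a small proxy pool. This is only a sketch: PROXY_LIST and both proxy addresses are made-up placeholders, and the class would be enabled through DOWNLOADER_MIDDLEWARES in the same way as the bundled ProxyMiddleware that follows.

# Sketch only: a rotating-proxy downloader middleware. PROXY_LIST and the
# addresses in it are placeholders, not part of this project.
import random
import base64

PROXY_LIST = [
    {'addr': 'http://203.0.113.10:8080'},                                    # open proxy
    {'addr': 'http://203.0.113.11:3128', 'user_pass': 'USERNAME:PASSWORD'},  # authenticated proxy
]


class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        # pick a different proxy for each request
        proxy = random.choice(PROXY_LIST)
        request.meta['proxy'] = proxy['addr']
        if 'user_pass' in proxy:
            # b64encode (not encodestring) so no trailing newline lands in the header
            encoded = base64.b64encode(proxy['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded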
/taobao/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

__author__ = 'Administrator'

import base64


class ProxyMiddleware(object):
    # override process_request so every request goes through the proxy
    def process_request(self, request, spider):
        # Set the location of the proxy (ip:port)
        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"

        # Use the following lines if your proxy requires authentication.
        # b64encode is used rather than encodestring, which would append a
        # trailing newline to the header value.
        # proxy_user_pass = "USERNAME:PASSWORD"
        # # set up basic authentication for the proxy
        # encoded_user_pass = base64.b64encode(proxy_user_pass)
        # request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
--------------------------------------------------------------------------------
/taobao/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for taobao project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'taobao'

SPIDER_MODULES = ['taobao.spiders']
NEWSPIDER_MODULE = 'taobao.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'taobao (+http://www.yourdomain.com)'

# Uncomment to enable the proxy middleware defined in taobao/middlewares.py
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
#     'taobao.middlewares.ProxyMiddleware': 100,
# }
--------------------------------------------------------------------------------
/taobao/spiders/taobao_spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

__author__ = 'Administrator'

from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request, FormRequest
import re
import urllib2
import webbrowser


class TaobaoSpider(CrawlSpider):

    name = 'taobao'
    allowed_domains = ['taobao.com']
    start_urls = ['https://login.taobao.com/member/login.jhtml']

    def __init__(self, *args, **kwargs):
        super(TaobaoSpider, self).__init__(*args, **kwargs)
        self.http_user = 'xxxxxxxxx'  # taobao username
        self.http_pass = 'xxxxxxxxx'  # taobao password
        # login form
        self.formdata = {
            'TPL_checkcode': '',
            'TPL_username': self.http_user,
            'TPL_password': self.http_pass,
        }
        self.headers = {
            'Host': 'login.taobao.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
            'Referer': 'https://login.taobao.com/member/login.jhtml',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Connection': 'Keep-Alive'
        }
        self.id = 0

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield FormRequest(url, meta={'cookiejar': i},
                              formdata=self.formdata,
                              headers=self.headers,
                              callback=self.login)  # jump to the login page

    def _log_page(self, response, filename):
        with open(filename, 'w') as f:
            try:
                f.write("%s\n%s\n%s\n" % (response.url, response.headers, response.body))
            except Exception:
                f.write("%s\n%s\n" % (response.url, response.headers))

    def login(self, response):
        self._log_page(response, 'taobao_login.html')
        return [FormRequest.from_response(response,
                                          formdata=self.formdata,
                                          headers=self.headers,
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          callback=self.parse_item)]

    def parse_item(self, response):
        self._log_page(response, 'get_checkcode.html')
        hxs = Selector(response)
        checkcode_err = hxs.xpath('//div[@id="J_Message"]/p/text()').extract()
        # "For the safety of your account, please enter the captcha."
        if checkcode_err and checkcode_err[0] == u"为了您的账户安全,请输入验证码。":
            checkcode_url = hxs.xpath('//img[@id="J_StandardCode_m"]/@data-src').extract()
            if checkcode_url:
                webbrowser.open_new_tab(checkcode_url[0])
                # ask the user to type in the captcha
                checkcode = raw_input(u'input checkcode:')
                # put the captcha into the form data before posting again
                print checkcode
                self.formdata['TPL_checkcode'] = checkcode
                return [FormRequest.from_response(response,
                                                  formdata=self.formdata,
                                                  headers=self.headers,
                                                  meta={'cookiejar': response.meta['cookiejar']},
                                                  callback=self.get_J_HToken)]

    def get_J_HToken(self, response):
        self._log_page(response, 'get_J_HToken.html')

        hxs = Selector(response)
        J_HToken_data = hxs.xpath('//input[@id="J_HToken"]/@value').extract()
        if not J_HToken_data:
            # No token found -- most likely the captcha was wrong, so ask again.
            checkcode_err = hxs.xpath('//div[@id="J_Message"]/p/text()').extract()
            if checkcode_err:
                print checkcode_err[0]
            # "Captcha incorrect, please enter it again."
            if checkcode_err and checkcode_err[0] == u"验证码错误,请重新输入。":
                checkcode_url = hxs.xpath('//img[@id="J_StandardCode_m"]/@data-src').extract()
                if checkcode_url:
                    webbrowser.open_new_tab(checkcode_url[0])
                    # ask the user to type in the captcha again
                    checkcode = raw_input(u'input checkcode:')
                    # put the new captcha into the form data
                    print checkcode
                    self.formdata['TPL_checkcode'] = checkcode
                    return [FormRequest.from_response(response,
                                                      formdata=self.formdata,
                                                      headers=self.headers,
                                                      meta={'cookiejar': response.meta['cookiejar']},
                                                      callback=self.get_J_HToken)]
        else:
            # Mind the encoding when building this URL: J_HToken_data is unicode.
            get_st_url = u'https://passport.alipay.com/mini_apply_st.js?site=0&token=%s&callback=stCallback6' % J_HToken_data[0]
            request = urllib2.Request(get_st_url)
            response = urllib2.urlopen(request)
            self._log_page(response, 'get_st.html')
            pattern = re.compile('{"st":"(.*?)"}', re.S)
            result = re.search(pattern, response.read())
            # if the st token was found in the response
            if result:
                print u"got the st token"
                # extract the st value
                st = result.group(1)

                stURL = 'https://login.taobao.com/member/vst.htm?st=%s&TPL_username=%s&callback=jsonp75' % (st, self.http_user)
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
                    'Host': 'login.taobao.com',
                    'Connection': 'Keep-Alive'
                }

                return Request(stURL,
                               headers=headers,
                               callback=self.get_check_login_success)

    def get_check_login_success(self, response):
        # Login succeeded; order data and other information can be fetched from here on. To be extended later.
        self._log_page(response, 'get_check.html')
        # After a successful login Taobao redirects to the "My Taobao" page; grab the redirect URL.
        pattern = re.compile(u'"url":"(.*?)"', re.S)
        match = re.search(pattern, response.body)
        next_url = match.group(1)
        return Request(next_url,
                       callback=self.get_next_data)

    def get_next_data(self, response):
        self._log_page(response, 'get_next.html')
        try:
            hxs = Selector(response)
            nick = hxs.xpath('//em[@class="s-name"]/a/text()').extract()
            print "login-success, get user nick:", nick
        except Exception:
            print u'Check whether get_next.html really is the personal Taobao page and adjust the code if not.'
        return None
--------------------------------------------------------------------------------
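get_check_login_success notes that order data and other information could be collected once the login succeeds, but the project currently stops at printing the user nick, with items.py and pipelines.py left as stubs. Below is a minimal sketch of how get_next_data could hand structured data to the pipeline instead; the nick field on TaobaoItem is a hypothetical name, the s-name selector is the one the spider already uses, and ITEM_PIPELINES would also need to be enabled in settings.py.

# items.py -- sketch: give TaobaoItem a field for data the spider already finds
import scrapy


class TaobaoItem(scrapy.Item):
    nick = scrapy.Field()  # hypothetical field for the logged-in user's nick


# taobao_spider.py -- sketch: yield an item instead of only printing
# (requires: from taobao.items import TaobaoItem)
def get_next_data(self, response):
    self._log_page(response, 'get_next.html')
    hxs = Selector(response)
    nick = hxs.xpath('//em[@class="s-name"]/a/text()').extract()
    if nick:
        print "login-success, get user nick:", nick
        item = TaobaoItem()
        item['nick'] = nick[0]
        yield item  # delivered to TaobaoPipeline once ITEM_PIPELINES is enabled
    else:
        print u'Check get_next.html to see whether it really is the personal Taobao page'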