├── .idea
│   ├── codeStyles
│   │   └── codeStyleConfig.xml
│   ├── demo1.iml
│   ├── dictionaries
│   │   └── liqinghua.xml
│   ├── libraries
│   │   └── R_User_Library.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── MATRIX_token_spider.py
├── MATRIX_tokens_spider.py
├── README.md
├── begin.py
├── demo1
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── etherscan1.py
│       ├── etherscan2.py
│       └── quotes.py
├── quotes-0.html
├── quotes-1.html
└── scrapy.cfg

--------------------------------------------------------------------------------
/.idea/ (codeStyles/codeStyleConfig.xml, demo1.iml, dictionaries/liqinghua.xml,
 libraries/R_User_Library.xml, misc.xml, modules.xml, vcs.xml):
--------------------------------------------------------------------------------
[PyCharm project configuration files; their XML content was not preserved in this dump.]

--------------------------------------------------------------------------------
/MATRIX_token_spider.py:
--------------------------------------------------------------------------------
import os

import requests
from datetime import datetime
from pyquery import PyQuery as pq

# from threadpool import *  # single-threaded is fast enough on a VPS; crawling faster tends to trigger errors
requests.adapters.DEFAULT_RETRIES = 5
domain = "https://etherscan.io/"
index_url = domain + "tokens?p=1"

# Derive the total page count from the pagination link's href (".../tokens?p=<last page>").
pagenums = pq(index_url)("#ContentPlaceHolder1_divpagingpanel > div:nth-child(2) > p > a:nth-child(5)").attr("href").split("=")
if len(pagenums) == 2:
    pagenums = int(pagenums[1])
else:
    print("Error!")
    exit()


def get_code(address=""):
    token_url = domain + "address/%s#code" % address
    print("//spider token_url\t" + token_url + "\n")
    html = requests.get(url=token_url).text  # fetch with requests; pq(token_url) sometimes raises timeout errors
    html_dom = pq(html)
    token_code = html_dom("pre#editor").html()
    print("//parser token_url\t" + token_url + "\n")
    if token_code is not None and token_code != "":
        token_name = html_dom("#ContentPlaceHolder1_tr_tokeninfo > td:nth-child(2) > a").text().replace(" ", "_")
        token_Transactions = html_dom("#ContentPlaceHolder1_divSummary > div:nth-child(1) > table > tr:nth-child(4) > td >span").text()
        token_price = html_dom("#balancelistbtn > span.pull-left").text().split(" ")
        token_price = token_price[1] if len(token_price) == 2 else ""
        spider_time = datetime.now().strftime('%Y/%m/%d %H:%M:%S')

        os.makedirs("./tokens_code", exist_ok=True)  # make sure the output directory exists
        with open("./tokens_code/" + token_name + ".sol", 'w') as fp:
            fp.write("//token_name\t" + token_name + "\n")
            fp.write("//token_url\t" + token_url + "\n")
            fp.write("//spider_time\t" + spider_time + "\n")
            fp.write("//token_Transactions\t" + token_Transactions + "\n")
            fp.write("//token_price\t" + token_price + "\n\n")
            fp.write(token_code)
        print("write down\n")
        print("//token_name\t" + token_name + "\n")
        print("//spider_time\t" + spider_time + "\n")
        print("//token_Transactions\t" + token_Transactions + "\n")
        print("//token_price\t" + token_price + "\n")
        print("\n" + token_code + "\n")


token_lists = []
urls = [domain + "tokens?p=%d" % page for page in range(1, pagenums + 1)]  # one URL per listing page
for url in urls:
    print(url)
    html = requests.get(url=url, timeout=3).text
    html_dom = pq(html)
    for a in html_dom("td.hidden-xs>a"):
        token_lists.append(pq(a).attr("href").split("/")[-1])

for token_addr in token_lists:
    get_code(token_addr)

print("Finish")
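Both standalone scripts fetch every page with a bare requests.get() and rely on requests.adapters.DEFAULT_RETRIES plus the "timeout error sometime" workaround noted in the comments. A minimal sketch, not part of the repository, of a more resilient approach: a shared requests.Session with automatic retry/backoff and a browser-like User-Agent (the helper names and parameter values below are illustrative assumptions):

# Hypothetical helper (not in the original scripts): a Session with retry/backoff
# and a desktop User-Agent, which could replace the bare requests.get(...) calls.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session(retries=5, backoff=1.0):
    session = requests.Session()
    retry = Retry(total=retries, backoff_factor=backoff,
                  status_forcelist=[429, 500, 502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Referer": "https://etherscan.io/",
    })
    return session


def fetch_html(session, url, timeout=10):
    # Raise on HTTP errors so callers can log the address to Error.log, as the
    # existing try/except in MATRIX_tokens_spider.py already does.
    resp = session.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.text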
--------------------------------------------------------------------------------
/MATRIX_tokens_spider.py:
--------------------------------------------------------------------------------
import os

import requests
from datetime import datetime
from pyquery import PyQuery as pq

# from threadpool import *  # single-threaded is fast enough on a VPS; crawling faster tends to trigger errors
requests.adapters.DEFAULT_RETRIES = 5
domain = "https://etherscan.io/"
index_url = domain + "contractsVerified/1"

# Read the total page count from the "Page x of y" counter on the listing page.
pagenums = pq(index_url)("body > div.wrapper > div.profile.container > div:nth-child(4) > div:nth-child(2) > p > span > b:nth-child(2)").text()
pagenums = int(pagenums)
# Older variant that parsed the page count out of a pagination link:
# if len(pagenums) == 2:
#     pagenums = int(pagenums[1])
# else:
#     print("Error!")
#     exit()


def get_code(address=""):
    token_url = domain + "address/%s" % address
    print("//spider token_url\t" + token_url + "\n")
    html = requests.get(url=token_url).text  # fetch with requests; pq(token_url) sometimes raises timeout errors
    html_dom = pq(html)
    token_code = html_dom("pre#editor").html()
    print("//parser token_url\t" + token_url + "\n")
    if token_code is not None and token_code != "":
        token_name = html_dom("#ContentPlaceHolder1_tr_tokeninfo > td:nth-child(2) > a").text().replace(" ", "_")
        token_Transactions = html_dom("#ContentPlaceHolder1_divSummary > div:nth-child(1) > table > tr:nth-child(4) > td >span").text()
        token_price = html_dom("#balancelistbtn > span.pull-left").text().split(" ")
        token_price = token_price[1] if len(token_price) == 2 else ""
        spider_time = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        try:
            os.makedirs("./all_tokens_code", exist_ok=True)  # make sure the output directory exists
            with open("./all_tokens_code/" + token_name + "_" + address + ".sol", 'w') as fp:
                fp.write("//token_name\t" + token_name + "\n")
                fp.write("//token_url\t" + token_url + "\n")
                fp.write("//spider_time\t" + spider_time + "\n")
                fp.write("//token_Transactions\t" + token_Transactions + "\n")
                fp.write("//token_price\t" + token_price + "\n\n")
                fp.write(token_code)
            print("write down\n")
            print("//token_name\t" + token_name + "\n")
            print("//spider_time\t" + spider_time + "\n")
            print("//token_Transactions\t" + token_Transactions + "\n")
            print("//token_price\t" + token_price + "\n")
            print("\n" + token_code + "\n")
        except Exception as e:
            print("Error")
            with open("Error.log", 'a+') as fp:
                fp.write(address + "\t" + str(e) + "\n")


token_lists = []
urls = [domain + "contractsVerified/%d" % page for page in range(1, pagenums + 1)]  # one URL per listing page
for url in urls:
    print(url)
    html = requests.get(url=url, timeout=3).text
    html_dom = pq(html)
    for a in html_dom('body > div.wrapper > div.profile.container > div:nth-child(3) > div > div > div > table > tbody > tr > td:nth-child(1) > a'):
        token_lists.append(pq(a).attr("href").split("/")[-1])
    print(token_lists)


for token_addr in token_lists:
    get_code(token_addr)

print("Finish")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MATRIX_Etherscan

This is a crawler that visits Etherscan and collects smart-contract related information. The data will serve as the basis for MATRIX model analysis as well as AI-based data analysis.

### Updated on 21 August 2018

Code redesigned to cope with Etherscan's anti-crawling policies.

--------------------------------------------------------------------------------
/begin.py:
--------------------------------------------------------------------------------
import os
import sys

from scrapy.cmdline import execute

# Entry point so the Scrapy spider can be run (and debugged with breakpoints) as a plain Python file.
# sys.path.append('D:\PyCharm\py_scrapyjobbole')
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
print(os.path.dirname(os.path.abspath(__file__)))

if not os.path.exists('sol'):
    os.mkdir('sol')

execute(['scrapy', 'crawl', 'etherscan1'])
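begin.py starts the crawl through scrapy.cmdline.execute. A minimal sketch, not part of the repository, of the same thing done in-process with Scrapy's CrawlerProcess, which is convenient when the spider should be driven from other Python code; it assumes it is run from the project root so that get_project_settings() picks up demo1/settings.py:

# Hypothetical alternative to begin.py (not in the repository):
# run the etherscan1 spider in-process instead of via scrapy.cmdline.execute.
import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if not os.path.exists('sol'):      # same output directory begin.py prepares
    os.mkdir('sol')

process = CrawlerProcess(get_project_settings())  # loads demo1/settings.py
process.crawl('etherscan1')                       # spider name as defined in etherscan1.py
process.start()                                   # blocks until the crawl finishes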
--------------------------------------------------------------------------------
/demo1/__init__.py:
--------------------------------------------------------------------------------
[Content not inlined in this dump; only a link to the raw file is given:]
https://raw.githubusercontent.com/MatrixAINetwork/MATRIX_Etherscan/4b0a6ac94a5f5df57e6f43cb07bb253ba66502e5/demo1/__init__.py

--------------------------------------------------------------------------------
/demo1/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Demo1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

--------------------------------------------------------------------------------
/demo1/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class Demo1SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Demo1DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
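The two classes above are the unmodified Scrapy project template and are not enabled in settings.py. The README mentions redesigning the code against anti-crawling policies; one common way to wire that into this file, shown here only as a sketch (the class name, the User-Agent strings, and the priority value are illustrative assumptions, not part of this project), is a downloader middleware that rotates the User-Agent header:

# Hypothetical addition to demo1/middlewares.py (not in the repository):
# rotate the User-Agent header on every request so the crawl looks less uniform.
import random


class RotateUserAgentMiddleware(object):
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
    ]

    def process_request(self, request, spider):
        # Returning None lets Scrapy continue processing the request normally.
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None

# It would then be enabled in settings.py with, for example:
# DOWNLOADER_MIDDLEWARES = {'demo1.middlewares.RotateUserAgentMiddleware': 400}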
--------------------------------------------------------------------------------
/demo1/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class Demo1Pipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/demo1/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for demo1 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'demo1'

SPIDER_MODULES = ['demo1.spiders']
NEWSPIDER_MODULE = 'demo1.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'demo1 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'demo1.middlewares.Demo1SpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'demo1.middlewares.Demo1DownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'demo1.pipelines.Demo1Pipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html; charset=utf-8',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Referer': 'https://etherscan.io/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
}

# DOWNLOADER_MIDDLEWARES = {
#     'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
#     'etherscan1Spider.middlewares.ThreatDefenceRedirectMiddleware': 600,
# }
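Most of settings.py is still the generated template; the project's own choices are ROBOTSTXT_OBEY = True, DOWNLOAD_DELAY = 1, and the Etherscan-specific DEFAULT_REQUEST_HEADERS. If throttling becomes a problem, a plausible next step (a sketch only, not something the repository enables) is to turn on AutoThrottle alongside the fixed delay:

# Hypothetical additions to demo1/settings.py (not enabled in the repository):
# let Scrapy adapt its request rate to Etherscan's response latency.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5            # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 60             # back off this far when the site is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0   # aim for one request at a time per server
RETRY_TIMES = 5                         # retry transient failures a few extra times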
--------------------------------------------------------------------------------
/demo1/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/demo1/spiders/etherscan1.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


# global sc_name1

class etherscan1Spider(scrapy.Spider):
    name = "etherscan1"

    # sc_name="default.sol"

    def __init__(self, name="etherscan1", sc_name="default.sol", sc_content="test"):
        self.name = name
        self.sc_name = sc_name
        self.sc_content = sc_content

    def start_requests(self):
        pre_url = 'https://etherscan.io/contractsVerified'
        contract_start_page = 2
        contract_end_page = 8
        for i in range(contract_start_page, contract_end_page):
            url = '{}/{}'.format(pre_url, i)
            # print(url)
            print("Adding page %s" % i + " for downloading!")
            # yield scrapy.Request(url=url, callback=self.parse)
            yield scrapy.Request(url=url, meta={
                'dont_redirect': True,
                'handle_httpstatus_list': [302]
            }, callback=self.parse)

    # def parse(self, response):
    #     for href in response.css("ul.directory.dir-col > li > a::attr('href')"):
    #         url = response.urljoin('https://etherscan.io', href.extract())
    #         print(url)
    #         yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse(self, response):
        # Save the raw listing page, then follow every contract address found on it.
        page = response.url.split("/")[-1]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        # self.log('Saved file %s' % filename)

        addr_list = response.xpath('//td[1]/a[1]/@href').extract()
        addl_list = response.xpath('//td[1]/a[1]/text()').extract()
        name_list = response.xpath('//td[2]/text()').extract()

        baseurl = 'https://etherscan.io'
        # e.g. https://etherscan.io/address/0x3c200bf4ec426236f8b042f0f1380315aee4c7d1#code
        urllist = []
        i = 0
        for addr in addr_list:
            filename = "SC" + "_" + name_list[i] + "_P" + page + "_" + addl_list[i] + ".sol"
            self.log('save file %s' % filename)
            newurl = '{}{}'.format(baseurl, addr)
            # nonlocal sc_name
            self.sc_name = filename
            # global sc_name1
            # sc_name1=filename
            # yield response.follow(newurl, self.parse_sc)
            yield scrapy.Request(url=newurl, meta={
                'dont_redirect': True,
                'handle_httpstatus_list': [302]
            }, callback=self.parse_sc)

            urllist.append(newurl)
            i = i + 1

        # print(urllist)

    def parse_sc(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        # print(response)
        sc_content = response.xpath('//div[@id=\'dividcode\']//pre[1]/text()').extract()
        # sc_content = response.xpath('//div[@id=\'dividcode\']').extract()
        # print(sc_content)
        sc_abstract = response.xpath('//pre[@id=\'js-copytextarea2\']/text()').extract()
        sc_name0 = response.xpath(
            '//div[@id=\'ContentPlaceHolder1_contractCodeDiv\']/div[2]/table/tr[1]/td[2]/text()').extract()
        # print(sc_name0)
        if (sc_name0 == []):
            print("error")
            sc_name = "err"
        else:
            sc_name = sc_name0[0].replace("\n", "")
        sc_addr = response.xpath('//*[@id="mainaddress"]/text()').extract()

        if (sc_addr == []):
            sc_addr0 = "erra"
            print("addr error")
        else:
            sc_addr0 = sc_addr[0]

        filename1 = "./sol/sc_" + sc_name + "_" + sc_addr0 + ".sol"
        filename2 = "./sol/sc_" + sc_name + "_" + sc_addr0 + ".ifsol"
        # if len(sc_content):
        with open(filename1, 'w') as f:
            if len(sc_content):
                f.write(sc_content[0])
            # f.write(sc_content[0])

        with open(filename2, 'w') as f:
            if len(sc_abstract):
                f.write(sc_abstract[0])
            # f.write(sc_abstract[0])

        self.log("writing " + filename1)
        # print(sc_addr,sc_name,sc_content,sc_abstract)
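Because parse() saves each listing page verbatim (quotes-0.html and quotes-1.html below are two such snapshots), the XPath expressions used above can be checked without hitting the network. A minimal sketch, not part of the repository; the snapshot file name and the replay code are only illustrative:

# Hypothetical offline check (not in the repository): replay the listing-page
# XPaths from etherscan1Spider.parse() against a saved snapshot such as quotes-0.html.
from scrapy.selector import Selector

with open('quotes-0.html', encoding='utf-8') as f:
    sel = Selector(text=f.read())

addr_list = sel.xpath('//td[1]/a[1]/@href').extract()   # e.g. ['/address/0x1830...', ...]
name_list = sel.xpath('//td[2]/text()').extract()       # e.g. ['lockEtherPay', ...]

for href, name in zip(addr_list, name_list):
    print(name, 'https://etherscan.io' + href)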
--------------------------------------------------------------------------------
/demo1/spiders/etherscan2.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

global sc_name1


class etherscan1Spider(scrapy.Spider):
    name = "etherscan2"

    # sc_name="default.sol"

    def __init__(self, name="etherscan1", sc_name="default.sol", sc_content="test"):
        self.name = name
        self.sc_name = sc_name
        self.sc_content = sc_content

    def start_requests(self):
        pre_url = 'https://etherscan.io/contractsVerified'
        contract_page_amount = 2
        # Note: range() starts at 0, so this requests .../contractsVerified/0 and /1,
        # while Etherscan numbers its listing pages from 1.
        for i in range(int(contract_page_amount)):
            url = '{}/{}'.format(pre_url, i)
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-1]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

        addr_list = response.xpath('//td[1]/a[1]/@href').extract()
        addl_list = response.xpath('//td[1]/a[1]/text()').extract()
        name_list = response.xpath('//td[2]/text()').extract()

        baseurl = 'https://etherscan.io'
        # e.g. https://etherscan.io/address/0x3c200bf4ec426236f8b042f0f1380315aee4c7d1#code
        urllist = []
        i = 0
        for addr in addr_list:
            filename = "SC" + "_" + name_list[i] + "_P" + page + "_" + addl_list[i] + ".sol"
            self.log('save file %s' % filename)
            newurl = '{}{}'.format(baseurl, addr)
            # nonlocal sc_name
            self.sc_name = filename
            # Requests are handled asynchronously, so this module-level global only
            # ever holds the file name of the most recently scheduled contract.
            global sc_name1
            sc_name1 = filename
            yield response.follow(newurl, self.parse_sc)

            urllist.append(newurl)
            i = i + 1

        print(urllist)

    def parse_sc(self, response):
        def extract_with_css(query):
            return response.css(query).extract_first().strip()

        sc_content = response.xpath('//div[@id=\'dividcode\']/pre[1]/text()').extract()
        sc_abstract = response.xpath('//pre[@id=\'js-copytextarea2\']/text()').extract()

        print(sc_name1, sc_content, sc_abstract)

--------------------------------------------------------------------------------
/demo1/spiders/quotes.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

--------------------------------------------------------------------------------
/quotes-0.html:
--------------------------------------------------------------------------------
[Saved snapshot of Etherscan's verified-contracts listing, page title "Ethereum
Contracts with Verified Source Codes" (page 1 of 1509, crawled 7/26/2018). The
HTML markup was stripped when this dump was produced; the recoverable table data
is listed below, while site navigation, pagination controls, scripts, and the
icon-only Settings column are omitted.]
Address                                    | ContractName           | Compiler | Balance | TxCount | DateVerified
0x18302990a6649e566f68204c193359a1392f1eca | lockEtherPay           | v0.4.24  | 0 Ether | 1       | 7/26/2018
0xb1da872cacc45b66841e972c44135ab50e7a1d8a | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x87a12e952c6e4825fc01aa91577457cc1aa6db61 | BENToken               | v0.4.23  | 0 Ether | 1       | 7/26/2018
0x1c70accdeeb2a3ffb5c0946c8015432819da5afc | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x5d2f2e55dfc73789c118aed06f6ad83811166b9f | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0xc16a39f7f209f5eb3e3d3698f9b9c5ac8dbf40c5 | MICToken               | v0.4.23  | 0 Ether | 1       | 7/26/2018
0x4adb8bbd9b1f5a77df0458b99f0bb3db3b7e9280 | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x3749db7f3fba9787e589f5259a80e1b1259efa42 | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x0606923947cc9277e8bac44d34815a0e24eac960 | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x17c8d8b7659141273a1c2223030c89b96713a44a | XPS                    | v0.4.18  | 0 Ether | 4590    | 7/26/2018
0x40b85cffab146eefcceee004392168d6b4d5a24c | ADRToken               | v0.4.23  | 0 Ether | 1       | 7/26/2018
0x47a6a6238dfca9cf282917328eeef60086bf6e64 | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x2ad84b2dc7cd8b039aa5aa67cd86f98e2cebcb9f | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x32534beea1de8c629db6832a544cec9bc557044c | lockEtherPay           | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x50c6ed8237087d996fc086428d90355384d22e9b | CoinMarks              | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x56b7f2a2d6f70d88b991e6c7b8005e0a13a9b379 | CryptoCurrencyExchange | v0.4.21  | 0 Ether | 2       | 7/26/2018
0x146d589cfe136644bdf4f1958452b5a4bb9c5a05 | FactoryData            | v0.4.24  | 0 Ether | 1       | 7/26/2018
0x9473481877cb241b0c8f647bf841973196e61a1d | PropToken              | v0.4.24  | 0 Ether | 2       | 7/26/2018
0x9A973590BD67d0b38075d66907027F5cC1ab3903 | EABToken               | v0.4.24  | 0 Ether | 1       | 7/26/2018
0x00359e48665c081ecba0519cacff26c7ca889296 | KNTTToken              | v0.4.22  | 0 Ether | 13      | 7/26/2018
0x24230cac1a176a2bf6d3270e49f505a21f17d991 | Energon                | v0.4.24  | 0 Ether | 2       | 7/26/2018
0xae9080243b0123a0c52aacdb955a5807cc4e7803 | Token                  | v0.4.24  | -       | 0       | 7/26/2018
0x3579412bdc1e30f03a862071023cb94c2a179742 | Crowdsale              | v0.4.24  | 0 Ether | 1       | 7/26/2018
0xee10638c311eeb0bbacea3ec7f04c94ca7d709a0 | LikeBitToken           | v0.4.24  | 0 Ether | 1       | 7/26/2018
0xe328b27e728f037732479815dd160ca5744c7163 | GBMToken               | v0.4.24  | 0 Ether | 2       | 7/26/2018
--------------------------------------------------------------------------------
/quotes-1.html:
--------------------------------------------------------------------------------
[Saved snapshot of the same Etherscan verified-contracts listing, page title
"Ethereum Contracts with Verified Source Codes" (page 1 of 1518, crawled
7/28/2018). As with quotes-0.html, the HTML markup was stripped in this dump;
only the recoverable table data is listed below.]
Address                                    | ContractName        | Compiler | Balance                        | TxCount | DateVerified
0x162d258d7954c70ffc17d658cd7e9756935cafc3 | ERC20Standard       | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0x01653e20d03d450658740691087ea7155bcd9726 | FoMo3DlongUnlimited | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0x62190802001819f42cb2eda7017d3c617af431c1 | INTIME              | v0.4.24  | 22.428786998518337899 Ether    | 9       | 7/28/2018
0xea374970cb477d4b4bec65ca8ac086c42542e121 | INTIME              | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0xa8371742d7f3b5e4079ea04dea3b9f5157d0f680 | gsToken             | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0xe76987e5f39652159dff4b40cf6cc93004f92958 | Fenerbahce          | v0.4.24  | 0 Ether                        | 2       | 7/28/2018
0x74028170d74751878228cda221fd0ac42a830921 | EncryptedToken      | v0.4.16  | 0 Ether                        | 9       | 7/28/2018
0x1190b071b80a37953ab465a34896d96ef2ea4b9d | ToukenToken         | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0xfb0769c1cde4dbf18f144fd65e6d27f623730c81 | GIDIDAX             | v0.4.24  | 0 Ether                        | 2       | 7/28/2018
0x433e6d2e5a2eb07a62885b3c1fb08d74e7927811 | DIETCoin            | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0x317C54cc1ff18a0E275cDD8B8b75a657aD9063d1 | HumanStandardToken  | v0.4.24  | 0 Ether                        | 2       | 7/28/2018
0x3cf6f11a4940dd8bf2e6eea46e30d8f76fa84daf | TonCoin             | v0.4.24  | 0 Ether                        | 2       | 7/28/2018
0xba74368aa52ad58d08309f1f549aa63bab0c7e2a | AZExchange          | v0.4.9   | 0 Ether                        | 1       | 7/28/2018
0x57ce4d3b0260bd93c420119ad867580654334143 | AssetMTC            | v0.4.24  | 0.1 Ether                      | 3       | 7/28/2018
0xeFD6AA85E91c5d8983bF748966cd48e163B9f198 | Token               | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0xaf9a3620dd490b475b67e2eee12ab854712baf6a | DAMIToken           | v0.4.19  | 0 Ether                        | 1       | 7/28/2018
0x85f806b0df30709886c22ed1be338d2c647abd6b | EthTokenToSmthSwaps | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0x88cfab090616f238cada6ee49619d0296baaf321 | BCW                 | v0.4.21  | 0 Ether                        | 3       | 7/28/2018
0xc301f0a7f62b5ef5cfa8454af23a688da5a65ec8 | WMCToken            | v0.4.19  | 0 Ether                        | 19      | 7/28/2018
0x8a36f3e0da7b36fd22fcf2844c21e812279372ac | QuizTime            | v0.4.19  | 2.011800000727 Ether           | 8       | 7/28/2018
0x4e8ecF79AdE5e2C49B9e30D795517A81e0Bf00B8 | FoMo3DSoon          | v0.4.24  | 2,080.561143558777613244 Ether | 6438    | 7/28/2018
0x202f291e30fe4aa626792ae01c35eaf7153b44b9 | TokenSale           | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0x9f73d808807c71af185fea0c1ce205002c74123c | EthRaised           | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
0xc0c001140319c5f114f8467295b1f22f86929ad0 | Divies              | v0.4.24  | 17.004977505860993551 Ether    | 131     | 7/28/2018
0x67ed24a0db2ae01c4841cd8aef1da519b588e2b2 | TeamEth             | v0.4.24  | 0 Ether                        | 1       | 7/28/2018
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = demo1.settings

[deploy]
#url = http://localhost:6800/
project = demo1
--------------------------------------------------------------------------------