├── .idea
│   ├── codeStyles
│   │   └── codeStyleConfig.xml
│   ├── demo1.iml
│   ├── dictionaries
│   │   └── liqinghua.xml
│   ├── libraries
│   │   └── R_User_Library.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── MATRIX_token_spider.py
├── MATRIX_tokens_spider.py
├── README.md
├── begin.py
├── demo1
│   ├── __init__.py
│   ├── items.py
│   ├── middlewares.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── etherscan1.py
│       ├── etherscan2.py
│       └── quotes.py
├── quotes-0.html
├── quotes-1.html
└── scrapy.cfg
/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/demo1.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/dictionaries/liqinghua.xml:
--------------------------------------------------------------------------------
urllist
--------------------------------------------------------------------------------
/.idea/libraries/R_User_Library.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/MATRIX_token_spider.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from datetime import datetime
3 | from pyquery import PyQuery as pq
4 | # from threadpool import *  # not used: the script already runs fast enough on a VPS, and crawling too fast tends to trigger errors
5 | requests.adapters.DEFAULT_RETRIES = 5
6 | domain = "https://etherscan.io/"
7 | index_url = domain+"tokens?p=1"
8 | pagenums = pq(index_url)("#ContentPlaceHolder1_divpagingpanel > div:nth-child(2) > p > a:nth-child(5)").attr("href").split("=")
9 | if len(pagenums) == 2:
10 | pagenums = int(pagenums[1])
11 | else:
12 | print("Error!")
13 | exit()
14 |
15 | def get_code(address=""):
16 | token_url = domain+"/address/%s#code"%address
17 | print("//spider token_url\t"+token_url+"\n")
18 |     html = requests.get(url=token_url).text  # pq(token_url) sometimes raises a timeout error, so fetch the page with requests instead
19 | html_dom = pq(html)
20 | token_code = html_dom("pre#editor").html()#.encode("utf8",'ignore')
21 | print("//parser token_url\t"+token_url+"\n")
22 |     if token_code:
23 | token_name = html_dom("#ContentPlaceHolder1_tr_tokeninfo > td:nth-child(2) > a").text().replace(" ","_")
24 | token_Transactions = html_dom("#ContentPlaceHolder1_divSummary > div:nth-child(1) > table > tr:nth-child(4) > td >span").text()
25 | token_price = html_dom("#balancelistbtn > span.pull-left").text().split(" ")
26 | token_price = token_price[1] if len(token_price)==2 else ""
27 | spider_time = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
28 |
29 | with open("./tokens_code/"+token_name+".sol",'w') as fp:
30 | fp.write("//token_name\t"+token_name+"\n")
31 | fp.write("//token_url\t"+token_url+"\n")
32 | fp.write("//spider_time\t"+spider_time+"\n")
33 | fp.write("//token_Transactions\t"+token_Transactions+"\n")
34 | fp.write("//token_price\t"+token_price+"\n\n")
35 | fp.write(token_code)
36 | print("write down\n")
37 | print("//token_name\t"+token_name+"\n")
38 | print("//spider_time\t"+spider_time+"\n")
39 | print("//token_Transactions\t"+token_Transactions+"\n")
40 | print("//token_price\t"+token_price+"\n")
41 | print("\n"+token_code+"\n")
42 |
43 |
44 |
45 |
46 | token_lists = []
47 | urls = [index_url.replace("1",str(page)) for page in range(1,pagenums+1)]
48 | for url in urls:
49 | print(url)
50 | html = requests.get(url=url,timeout=3).text
51 | html_dom = pq(html)
52 |     token_lists.extend(pq(a).attr("href").split("/")[-1] for a in html_dom("td.hidden-xs>a"))
53 |
54 | for token_addr in token_lists:
55 | get_code(token_addr)
56 |
57 | print("Finish")
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/MATRIX_tokens_spider.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from datetime import datetime
3 | from pyquery import PyQuery as pq
4 | # from threadpool import *  # not used: the script already runs fast enough on a VPS, and crawling too fast tends to trigger errors
5 | requests.adapters.DEFAULT_RETRIES = 5
6 | domain = "https://etherscan.io/"
7 | index_url = domain+"/contractsVerified/1"
8 | pagenums = pq(index_url)("body > div.wrapper > div.profile.container > div:nth-child(4) > div:nth-child(2) > p > span > b:nth-child(2)").text()
9 | pagenums = int(pagenums)
10 | # if len(pagenums) == 2:
11 | # pagenums = int(pagenums[1])
12 | # else:
13 | # print("Error!")
14 | # exit()
15 |
16 | def get_code(address=""):
17 | token_url = domain+"/address/%s"%address
18 | print("//spider token_url\t"+token_url+"\n")
19 |     html = requests.get(url=token_url).text  # pq(token_url) sometimes raises a timeout error, so fetch the page with requests instead
20 | html_dom = pq(html)
21 | token_code = html_dom("pre#editor").html()#.encode("utf8",'ignore')
22 | print("//parser token_url\t"+token_url+"\n")
23 |     if token_code:
24 | token_name = html_dom("#ContentPlaceHolder1_tr_tokeninfo > td:nth-child(2) > a").text().replace(" ","_")
25 | token_Transactions = html_dom("#ContentPlaceHolder1_divSummary > div:nth-child(1) > table > tr:nth-child(4) > td >span").text()
26 | token_price = html_dom("#balancelistbtn > span.pull-left").text().split(" ")
27 | token_price = token_price[1] if len(token_price)==2 else ""
28 | spider_time = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
29 | try:
30 | with open("./all_tokens_code/"+token_name+"_"+address+".sol",'w') as fp:
31 | fp.write("//token_name\t"+token_name+"\n")
32 | fp.write("//token_url\t"+token_url+"\n")
33 | fp.write("//spider_time\t"+spider_time+"\n")
34 | fp.write("//token_Transactions\t"+token_Transactions+"\n")
35 | fp.write("//token_price\t"+token_price+"\n\n")
36 | fp.write(token_code)
37 | print("write down\n")
38 | print("//token_name\t"+token_name+"\n")
39 | print("//spider_time\t"+spider_time+"\n")
40 | print("//token_Transactions\t"+token_Transactions+"\n")
41 | print("//token_price\t"+token_price+"\n")
42 | print("\n"+token_code+"\n")
43 | except Exception as e:
44 | print("Error")
45 | with open("Error.log",'a+') as fp:
46 | fp.write(address+"\t"+str(e)+"\n")
47 |
48 |
49 |
50 |
51 |
52 |
53 | token_lists = []
54 | urls = [index_url.replace("1",str(page)) for page in range(1,pagenums+1)]
55 | for url in urls:
56 | print(url)
57 | html = requests.get(url=url,timeout=3).text
58 | html_dom = pq(html)
59 |     token_lists.extend(pq(a).attr("href").split("/")[-1] for a in html_dom('body > div.wrapper > div.profile.container > div:nth-child(3) > div > div > div > table > tbody > tr > td:nth-child(1) > a'))
60 | print(token_lists)
61 |
62 |
63 | for token_addr in token_lists:
64 | get_code(token_addr)
65 |
66 | print("Finish")
67 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MATRIX_Etherscan
2 | This is a crawler that visits Etherscan and collects smart-contract-related information. The data will be used as the basis for MATRIX model analysis as well as AI-based data analysis.
3 |
4 | ### Updated on 21 August 2018
5 |
6 | Code redesigned to cope with Etherscan's anti-crawler policies.
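7 |
8 | ### How to run (sketch)
9 |
10 | A minimal sketch, assuming Python 3 with `scrapy`, `requests`, and `pyquery` installed. The Scrapy spider `etherscan1` is normally started through `begin.py`; the hypothetical helper below does the same thing programmatically with Scrapy's `CrawlerProcess`.
11 |
12 | ```python
13 | # run_etherscan1.py -- illustrative helper, not part of this repository
14 | import os
15 |
16 | from scrapy.crawler import CrawlerProcess
17 | from scrapy.utils.project import get_project_settings
18 |
19 | # etherscan1 writes the downloaded contracts into ./sol, so make sure it exists
20 | os.makedirs('sol', exist_ok=True)
21 |
22 | # get_project_settings() picks up demo1/settings.py via scrapy.cfg
23 | process = CrawlerProcess(get_project_settings())
24 | process.crawl('etherscan1')   # spider name defined in demo1/spiders/etherscan1.py
25 | process.start()               # blocks until the crawl finishes
26 | ```
27 |
28 | The standalone scripts `MATRIX_token_spider.py` and `MATRIX_tokens_spider.py` do not use Scrapy; they write into `tokens_code/` and `all_tokens_code/` respectively, and expect those directories to already exist.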
--------------------------------------------------------------------------------
/begin.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from scrapy.cmdline import execute
5 |
6 | # Entry point for setting breakpoints and debugging the spider .py files from an IDE
7 | # sys.path.append('D:\PyCharm\py_scrapyjobbole')
8 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
9 | print(os.path.dirname(os.path.abspath(__file__)))
10 |
11 | if not os.path.exists('sol'):
12 | os.mkdir('sol')
13 |
14 | execute(['scrapy', 'crawl', 'etherscan1'])
15 |
--------------------------------------------------------------------------------
/demo1/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MatrixAINetwork/MATRIX_Etherscan/4b0a6ac94a5f5df57e6f43cb07bb253ba66502e5/demo1/__init__.py
--------------------------------------------------------------------------------
/demo1/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class Demo1Item(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/demo1/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class Demo1SpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class Demo1DownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/demo1/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class Demo1Pipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/demo1/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for demo1 project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'demo1'
13 |
14 | SPIDER_MODULES = ['demo1.spiders']
15 | NEWSPIDER_MODULE = 'demo1.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'demo1 (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = True
22 | # ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | # CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 1
31 | # The download delay setting will honor only one of:
32 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | # CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | # COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | # TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | # DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | # }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'demo1.middlewares.Demo1SpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'demo1.middlewares.Demo1DownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | # ITEM_PIPELINES = {
68 | # 'demo1.pipelines.Demo1Pipeline': 300,
69 | # }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | DEFAULT_REQUEST_HEADERS = {
92 | 'Accept': 'text/html; charset=utf-8',
93 | 'Accept-Language': 'zh-CN,zh;q=0.8',
94 |     'Referer': 'https://etherscan.io/',
95 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
96 |
97 | }
98 | # DOWNLOADER_MIDDLEWARES = {
99 | # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
100 | # 'etherscan1Spider.middlewares.ThreatDefenceRedirectMiddleware': 600,
101 | # }
102 |
--------------------------------------------------------------------------------
/demo1/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/demo1/spiders/etherscan1.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 |
5 | # global sc_name1
6 |
7 | class etherscan1Spider(scrapy.Spider):
8 | name = "etherscan1"
9 |
10 | # sc_name="default.sol"
11 |
12 | def __init__(self, name="etherscan1", sc_name="default.sol", sc_content="test"):
13 | self.name = name
14 | self.sc_name = sc_name
15 | self.sc_content = sc_content
16 |
17 | def start_requests(self):
18 | pre_url = 'https://etherscan.io/contractsVerified'
19 | contract_start_page = 2
20 | contract_end_page = 8
21 | for i in range(contract_start_page, contract_end_page):
22 | url = '{}/{}'.format(pre_url, i)
23 | # print(url)
24 | print("Adding page %s" % i + " for downloading!")
25 | # yield scrapy.Request(url=url, callback=self.parse)
26 | yield scrapy.Request(url=url, meta={
27 | 'dont_redirect': True,
28 | 'handle_httpstatus_list': [302]
29 | }, callback=self.parse)
30 |
31 | # def parse(self, response):
32 | # for href in response.css("ul.directory.dir-col > li > a::attr('href')"):
33 | # url = response.urljoin('https://etherscan.io', href.extract())
34 | # print(url)
35 | # yield scrapy.Request(url, callback=self.parse_dir_contents)
36 |
37 | def parse(self, response):
38 | page = response.url.split("/")[-1]
39 | filename = 'quotes-%s.html' % page
40 | with open(filename, 'wb') as f:
41 | f.write(response.body)
42 | # self.log('Saved file %s' % filename)
43 |
44 | addr_list = response.xpath('//td[1]/a[1]/@href').extract()
45 | addl_list = response.xpath('//td[1]/a[1]/text()').extract()
46 | name_list = response.xpath('//td[2]/text()').extract()
47 |
48 | baseurl = 'https://etherscan.io'
49 |         # e.g. https://etherscan.io/address/0x3c200bf4ec426236f8b042f0f1380315aee4c7d1#code
50 | urllist = []
51 | i = 0
52 | for addr in addr_list:
53 | filename = "SC" + "_" + name_list[i] + "_P" + page + "_" + addl_list[i] + ".sol"
54 | self.log('save file %s' % filename)
55 | newurl = '{}{}'.format(baseurl, addr)
56 | # nonlocal sc_name
57 | self.sc_name = filename
58 | # global sc_name1
59 | # sc_name1=filename
60 | # yield response.follow(newurl, self.parse_sc)
61 | yield scrapy.Request(url=newurl, meta={
62 | 'dont_redirect': True,
63 | 'handle_httpstatus_list': [302]
64 | }, callback=self.parse_sc)
65 |
66 | urllist.append(newurl)
67 | i = i + 1
68 |
69 | #print(urllist)
70 |
71 | def parse_sc(self, response):
72 | def extract_with_css(query):
73 | return response.css(query).extract_first().strip()
74 |
75 | # print (response)
76 | sc_content = response.xpath('//div[@id=\'dividcode\']//pre[1]/text()').extract()
77 | # sc_content = response.xpath('//div[@id=\'dividcode\']').extract()
78 | # print (sc_content)
79 | sc_abstract = response.xpath('//pre[@id=\'js-copytextarea2\']/text()').extract()
80 | sc_name0 = response.xpath(
81 | '//div[@id=\'ContentPlaceHolder1_contractCodeDiv\']/div[2]/table/tr[1]/td[2]/text()').extract()
82 | # print(sc_name0)
83 | if (sc_name0 == []):
84 | print("error")
85 | sc_name = "err"
86 | else:
87 | sc_name = sc_name0[0].replace("\n", "")
88 | sc_addr = response.xpath('//*[@id="mainaddress"]/text()').extract()
89 |
90 | if (sc_addr == []):
91 | sc_addr0 = "erra"
92 | print("addr error")
93 | else:
94 | sc_addr0 = sc_addr[0]
95 |
96 | filename1 = "./sol/sc_" + sc_name + "_" + sc_addr0 + ".sol"
97 | filename2 = "./sol/sc_" + sc_name + "_" + sc_addr0 + ".ifsol"
98 | # if len(sc_content):
99 | with open(filename1, 'w') as f:
100 | if len(sc_content):
101 | f.write(sc_content[0])
102 | # f.write(sc_content[0])
103 |
104 | with open(filename2, 'w') as f:
105 | if len(sc_abstract):
106 | f.write(sc_abstract[0])
107 | # f.write(sc_abstract[0])
108 |
109 | self.log("writing " + filename1)
110 | # print(sc_addr,sc_name,sc_content,sc_abstract)
111 |
--------------------------------------------------------------------------------
/demo1/spiders/etherscan2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 | global sc_name1
5 |
6 |
7 | class etherscan2Spider(scrapy.Spider):
8 | name = "etherscan2"
9 |
10 | # sc_name="default.sol"
11 |
12 | def __init__(self, name="etherscan1", sc_name="default.sol", sc_content="test"):
13 | self.name = name
14 | self.sc_name = sc_name
15 | self.sc_content = sc_content
16 |
17 | def start_requests(self):
18 | pre_url = 'https://etherscan.io/contractsVerified'
19 | contract_page_amount = 2
20 | for i in range(int(contract_page_amount)):
21 | url = '{}/{}'.format(pre_url, i)
22 | print(url)
23 | yield scrapy.Request(url=url, callback=self.parse)
24 |
25 | def parse(self, response):
26 | page = response.url.split("/")[-1]
27 | filename = 'quotes-%s.html' % page
28 | with open(filename, 'wb') as f:
29 | f.write(response.body)
30 | self.log('Saved file %s' % filename)
31 |
32 | addr_list = response.xpath('//td[1]/a[1]/@href').extract()
33 | addl_list = response.xpath('//td[1]/a[1]/text()').extract()
34 | name_list = response.xpath('//td[2]/text()').extract()
35 |
36 | baseurl = 'https://etherscan.io'
37 |         # e.g. https://etherscan.io/address/0x3c200bf4ec426236f8b042f0f1380315aee4c7d1#code
38 | urllist = []
39 | i = 0
40 | for addr in addr_list:
41 | filename = "SC" + "_" + name_list[i] + "_P" + page + "_" + addl_list[i] + ".sol"
42 | self.log('save file %s' % filename)
43 | newurl = '{}{}'.format(baseurl, addr)
44 | # nonlocal sc_name
45 | self.sc_name = filename
46 | global sc_name1
47 | sc_name1 = filename
48 | yield response.follow(newurl, self.parse_sc)
49 |
50 | urllist.append(newurl)
51 | i = i + 1
52 |
53 | print(urllist)
54 |
55 | def parse_sc(self, response):
56 | def extract_with_css(query):
57 | return response.css(query).extract_first().strip()
58 |
59 | sc_content = response.xpath('//div[@id=\'dividcode\']/pre[1]/text()').extract()
60 | sc_abstract = response.xpath('//pre[@id=\'js-copytextarea2\']/text()').extract()
61 |
62 | print(sc_name1, sc_content, sc_abstract)
63 |
--------------------------------------------------------------------------------
/demo1/spiders/quotes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 |
4 |
5 | class QuotesSpider(scrapy.Spider):
6 | name = "quotes"
7 |
8 | def start_requests(self):
9 | urls = [
10 | 'http://quotes.toscrape.com/page/1/',
11 | 'http://quotes.toscrape.com/page/2/',
12 | ]
13 | for url in urls:
14 | yield scrapy.Request(url=url, callback=self.parse)
15 |
16 | def parse(self, response):
17 | page = response.url.split("/")[-2]
18 | filename = 'quotes-%s.html' % page
19 | with open(filename, 'wb') as f:
20 | f.write(response.body)
21 | self.log('Saved file %s' % filename)
22 |
--------------------------------------------------------------------------------
/quotes-0.html:
--------------------------------------------------------------------------------
[Saved Etherscan listing page; HTML markup not preserved in this dump. Recoverable text:]
Ethereum Contracts with Verified Source Codes
Contracts With Verified Source Codes Only
- Home
- Verified Contracts
A Total Of 37709 verified contract source codes found
--------------------------------------------------------------------------------
/quotes-1.html:
--------------------------------------------------------------------------------
[Saved Etherscan listing page; HTML markup not preserved in this dump. Recoverable text:]
Ethereum Contracts with Verified Source Codes
Contracts With Verified Source Codes Only
- Home
- Verified Contracts
A Total Of 37930 verified contract source codes found
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = demo1.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = demo1
12 |
--------------------------------------------------------------------------------