├── .gitignore
├── .github
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── argv2dict
│   └── __init__.py
├── README.md
├── node_crawler.py
├── v07_crawler.py
├── chord_crawler.py
└── nkn_crawler.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.rej
*.orig
*.patch
.DS_Store
idea/*
temp/*
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
---
name: Feature request
about: Suggest an idea for this project

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
--------------------------------------------------------------------------------
/argv2dict/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
Copyright (c) 2018 nkn.org. All Rights Reserved
:author: gdmmx
:date: 2018-08-13 10:52:21
:Usage Example: python argv2dict.py a=1 b=2 x=str1
"""

__version__ = '1.0.0'
__author__ = 'gdmmx '

def argv2dict(*lst):
    """
    :Functional: Convert a [k=v, x=y, ...] argument list into a Python dict.
                 Useful for turning sys.argv into a dict.
    :Example: argv2dict('a=1', 'b=2', 'x=str1')
    """
    if hasattr(lst, '__iter__'):  ### lst is iterable (always true for *args)
        return dict(map(lambda x: str(x).split('=', 1),
                        filter(lambda x: str(x).find('=') != -1, lst)
                    ))
    return {}
--------------------------------------------------------------------------------
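For quick reference, a minimal usage sketch of `argv2dict` above; the command line shown is made up:

```python
# Illustrative only: what argv2dict returns for a typical crawler command line.
from argv2dict import argv2dict

conf = argv2dict('seed=127.0.0.1:30003', 'thread=4', 'timeout=15', 'stray')
print(conf)  # {'seed': '127.0.0.1:30003', 'thread': '4', 'timeout': '15'} (key order may vary)
# Tokens without '=' (here 'stray') are dropped, and every value stays a
# string, which is why the crawler mains cast timeout to float themselves.
```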
/README.md:
--------------------------------------------------------------------------------
[![NKN](https://github.com/nknorg/nkn/wiki/img/nkn_logo.png)](https://nkn.org)

# nkn-crawler

A crawler for the NKN network that discovers online nodes.

## Prerequisites
> Python 2.7
* Module json
* Module requests
* Module gevent

## Node Crawler Usage

```
./node_crawler.py seed=${IP}[:${ChordPort}] [thread=$N] [timeout=$T]

:param IP:
    IP address of an online node. Both x.x.x.x and x.x.x.x:port are acceptable.
:param ChordPort:
    Chord port of the node. Overridden if the IP already carries a :port suffix.
    Default: 30000
:param thread:
    Run the crawler with N concurrent threads.
    Default: 1
:param timeout:
    Timeout threshold while waiting for a response or for new nodes.
    Default: 20
```
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
---
name: Bug report
about: Create a report to help us improve

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:
1. Go to '...'
2. Click on '....'
3. Scroll down to '....'
4. See error

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.

**Desktop (please complete the following information):**
 - OS: [e.g. iOS]
 - Browser [e.g. chrome, safari]
 - Version [e.g. 22]

**Smartphone (please complete the following information):**
 - Device: [e.g. iPhone6]
 - OS: [e.g. iOS8.1]
 - Browser [e.g. stock browser, safari]
 - Version [e.g. 22]

**Additional context**
Add any other context about the problem here.
--------------------------------------------------------------------------------
/node_crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import json

from argv2dict import argv2dict
from nkn_crawler import Crawler

class NodeCrawler(Crawler):
    def __init__(self, seed, port='30001', method='getneighbor', **kwargs):
        kwargs['method'] = method
        super(NodeCrawler, self).__init__(seed, port, **kwargs)

        ### Drop any :port suffix, then pack the IPv4 seed into a 16-byte address list
        ip16 = [0]*12 + [ int(b) for b in seed.split(':')[0].split('.') ]
        self.task_lst.put_nowait(dict(ID='', IpAddr=ip16, Port=port))

    def parse(self, resp):
        return resp.get('result')

    ### return ID, ip, port
    def info_from_task(self, task):
        port = task.get('Port', 30001)
        ### TODO: Support IPv6
        ip = '.'.join([ str(b) for b in task.get('IpAddr', [])[12:] ])
        return task.get('ID', ''), ip, int(port)+2

    def task_to_node(self, task):
        ip = '.'.join([ str(b) for b in task.pop('IpAddr', [])[12:] ])
        task['IpAddr'] = ip
        task.pop('Time', None)
        return task

if __name__ == "__main__":
    conf = argv2dict(*sys.argv[1:])
    if 'timeout' in conf:  ### keep timeout numeric, as the other crawler mains do
        conf['timeout'] = float(conf['timeout'])

    craw = NodeCrawler(**conf)
    craw.run(**conf)

    [ sys.stdout.write('%s\n' % json.dumps(n)) for n in craw.result.values() ]
--------------------------------------------------------------------------------
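A side note on the IpAddr encoding above: NodeCrawler stores the IPv4 seed in a zero-padded 16-byte list and recovers the dotted quad from the last four bytes. A minimal round-trip sketch; the helper names are illustrative, not functions from this repo:

```python
# Round-trip sketch of the zero-padded 16-byte IpAddr layout node_crawler uses.
# ipv4_to_ip16/ip16_to_ipv4 are hypothetical helpers for illustration only.
def ipv4_to_ip16(ip):
    return [0]*12 + [int(b) for b in ip.split('.')]

def ip16_to_ipv4(ip16):
    return '.'.join(str(b) for b in ip16[12:])

assert ipv4_to_ip16('192.168.1.9') == [0]*12 + [192, 168, 1, 9]
assert ip16_to_ipv4(ipv4_to_ip16('192.168.1.9')) == '192.168.1.9'
```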
/v07_crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import json

from argv2dict import argv2dict
from nkn_crawler import Crawler

class V07Crawler(Crawler):
    def __init__(self, seed, port='30003', method='getneighbor', **kwargs):
        kwargs['method'] = method
        super(V07Crawler, self).__init__(seed, port, **kwargs)

        ### Strip an optional scheme prefix, then drop any :port suffix.
        ### (seed.strip('http://') would eat arbitrary leading/trailing h/t/p/:/ chars.)
        host = seed.split('://')[-1].split(':')[0]
        self.task_lst.put_nowait(dict(id='', addr=host, jsonRpcPort=port))

    def parse(self, resp):
        return resp.get('result')

    ### return ID, ip, port
    def info_from_task(self, task):
        ### TODO: Support IPv6
        uri = task.get('addr', '')
        if uri.find('://') != -1:      ### if it has an 'any://' prefix
            uri = uri.split('://')[1]  ### strip it
        return task.get('id', ''), uri.split(':')[0], task.get('jsonRpcPort', 30003)

    def task_to_node(self, task):
        return task

if __name__ == "__main__":
    conf = argv2dict(*sys.argv[1:])
    if 'timeout' in conf:
        conf['timeout'] = float(conf['timeout'])

    craw = V07Crawler(**conf)
    craw.run(**conf)

    [ sys.stdout.write('%s\n' % json.dumps(n)) for n in craw.result.values() ]
--------------------------------------------------------------------------------
/chord_crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import time
import json
import traceback
from functools import reduce  ### builtin in Python 2, explicit for Python 3

from gevent.queue import Empty

from argv2dict import argv2dict
from nkn_crawler import Crawler

class ChordCrawler(Crawler):
    def __init__(self, seed, port='30003', method='getchordringinfo', **kwargs):
        kwargs['method'] = method
        super(ChordCrawler, self).__init__(seed, port, **kwargs)

        ### Strip an optional scheme prefix, then drop any :port suffix
        host = seed.split('://')[-1].split(':')[0]
        self.task_lst.put_nowait(dict(id='', addr=host, jsonRpcPort=port))

    def parse(self, resp):
        d = resp.get('result') or {}
        lnode = d.pop('localNode', {})
        lnode.update(d)
        return lnode

    ### return ID, ip, port
    def info_from_task(self, task):
        ### TODO: Support IPv6
        uri = task.get('addr', '')
        if uri.find('://') != -1:      ### if it has an 'any://' prefix
            uri = uri.split('://')[1]  ### strip it
        return task.get('id', ''), uri.split(':')[0], task.get('jsonRpcPort', 30003)

    def task_to_node(self, node):
        ### Flatten successors, predecessors and every fingerTable bucket
        ### into one list of neighbor dicts
        ret = list(node.get('successors') or [])
        ret += node.get('predecessors') or []
        return reduce(lambda x, y: x + y, node.get('fingerTable', {}).values(), ret)

    def worker(self, timeout=20):
        while True:
            try:
                t = self.task_lst.get(timeout=timeout)
                Id, ip, port = self.info_from_task(t)

                if ip and port and Id not in self.probed:  ### valid task, not crawled yet
                    if Id: self.probed.add(Id)  ### mark as crawled; an empty Id means the task came from sys.argv
                    new_node = self.parse(self.req(ip, port, **self.conf))
                    Id = new_node.get('id')
                    if Id: self.result[Id] = new_node  ### add to crawl results
                    [ self.task_lst.put_nowait(n) for n in self.task_to_node(new_node) if n.get('id') not in self.probed ]  ### enqueue newly discovered neighbors
            except Empty as e:
                sys.stderr.write('%s: worker exit due to err %s\n' % (time.strftime('%F %T'), type(e)))
                break
            except Exception as e:
                sys.stderr.write('%s: worker req %s met err %s\n' % (time.strftime('%F %T'), str(t), type(e)))
                sys.stderr.write(traceback.format_exc())  ### stays in for debugging
                continue

if __name__ == "__main__":
    conf = argv2dict(*sys.argv[1:])
    if 'timeout' in conf:
        conf['timeout'] = float(conf['timeout'])

    craw = ChordCrawler(**conf)
    craw.run(**conf)

    [ sys.stdout.write('%s\n' % json.dumps(n)) for n in craw.result.values() ]
--------------------------------------------------------------------------------
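To make the flattening in ChordCrawler.task_to_node above concrete, here is the same logic run on a made-up node dict; the ids are invented and only the shape mirrors a parsed getchordringinfo result:

```python
# Made-up node dict; field values are invented for illustration.
from functools import reduce  # builtin in Python 2

node = {
    'id': 'aa11',
    'successors':   [{'id': 'bb22'}],
    'predecessors': [{'id': 'cc33'}],
    'fingerTable':  {'0': [{'id': 'dd44'}], '1': [{'id': 'ee55'}]},
}
ret = list(node.get('successors') or [])
ret += node.get('predecessors') or []
ret = reduce(lambda x, y: x + y, node.get('fingerTable', {}).values(), ret)
# -> [{'id': 'bb22'}, {'id': 'cc33'}, {'id': 'dd44'}, {'id': 'ee55'}]
#    (fingerTable bucket order may vary)
```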
/nkn_crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from gevent import monkey; monkey.patch_all()
import sys
import time
import json
import requests
import traceback
from argv2dict import argv2dict

import gevent
from gevent.pool import Group
from gevent.queue import Queue
from gevent.queue import Empty

class Crawler(object):
    def __init__(self, seed, port='30000', **kwargs):
        self.result = {}
        self.probed = set()
        self.task_lst = Queue()
        self.pool = Group()
        self.conf = kwargs

        addr = seed.split(':')  ### validate the seed address
        if len(addr) == 1:      ### seed has no port; use conf['port'] or the default
            addr += [port]
        elif len(addr) > 2:
            ### TODO: print usage
            sys.stderr.write('Invalid seed %s\n' % seed)
            sys.exit(22)

        self.task_lst.put_nowait(dict(Host=':'.join(addr)))  ### seed the queue so the base crawler can run standalone; subclasses skip Host-style tasks
        sys.stderr.write('Crawler start from %s\n' % (':'.join(addr)))

    def req(self, ip, port=30003, apiId='1', apiVer='3.0', method='getchordringinfo', params={}, timeout=20, **kwargs):
        r = ''
        ret = {}
        d = dict(id=apiId, jsonrpc=apiVer, method=method, params=params)
        try:
            r = requests.post('http://%s:%s' % (ip, port), json=d, headers={'Content-Type': 'application/json'}, timeout=timeout)
            ret = json.loads(r.text)
        except Exception as e:
            sys.stderr.write('%s: met Error [%s] when request [%s] from %s:%s resp [%s]\n' % (
                time.strftime('%F %T'), str(e), method, ip, port, r.text if r else ''
            ))
            raise e
        return ret

    def parse(self, resp):
        succ_lst = []
        for vn in resp.get('result', {}).get('Vnodes', []):
            ### newly discovered neighbors
            succ_lst += [ n for n in vn.pop('Successors', []) if n ]
            succ_lst += [ n for n in vn.pop('Finger', []) if n ]
            succ_lst += [ vn.pop('Predecessor', None) or {} ]
        return succ_lst

    def info_from_task(self, task):
        ip, port = task.get('Host', '').split(':')
        return task.get('Id', ''), ip, int(port)+3

    def task_to_node(self, task):
        return task

    def worker(self, timeout=20):
        while True:
            try:
                t = self.task_lst.get(timeout=timeout)
                Id, ip, port = self.info_from_task(t)

                if ip and port and Id not in self.probed:  ### valid task, not crawled yet
                    self.probed.add(Id)  ### mark as crawled, whether it succeeds or fails
                    new_nodes = self.parse(self.req(ip, port, **self.conf))
                    self.result[Id] = self.task_to_node(t)  ### add to crawl results
                    [ self.task_lst.put_nowait(n) for n in new_nodes ]  ### enqueue new_nodes as tasks
            except Empty as e:
                sys.stderr.write('%s: worker exit due to err %s\n' % (time.strftime('%F %T'), type(e)))
                break
            except Exception as e:
                sys.stderr.write('%s: worker req %s met err %s\n' % (time.strftime('%F %T'), str(t), type(e)))
                # print traceback.format_exc()  ### stays for debugging
                continue

    def debug(self, interval=5):
        while True:
            sys.stderr.write('Craw results %d\n' % len(self.result))
            gevent.sleep(interval)

    def run(self, timeout=20, thread=1, **kwargs):
        gevent.spawn(self.debug, 5)
        self.pool.map(self.worker, [timeout]*int(thread))
        self.pool.join()
        sys.stderr.write('Total: %d Nodes\n' % len(self.result))

if __name__ == "__main__":
    conf = argv2dict(*sys.argv[1:])
    if 'timeout' in conf:
        conf['timeout'] = float(conf['timeout'])

    craw = Crawler(**conf)
    craw.run(**conf)

    [ sys.stdout.write('%s\n' % json.dumps(n)) for n in craw.result.values() ]
--------------------------------------------------------------------------------
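For manual testing, the JSON-RPC body that Crawler.req() above assembles with its defaults can be reproduced in a few lines; the host and port are placeholders for a reachable node:

```python
# Equivalent of the request Crawler.req() above sends with its default
# parameters; 127.0.0.1:30003 is a placeholder.
import json
import requests

body = {'id': '1', 'jsonrpc': '3.0', 'method': 'getchordringinfo', 'params': {}}
r = requests.post('http://127.0.0.1:30003', json=body,
                  headers={'Content-Type': 'application/json'}, timeout=20)
print(json.dumps(json.loads(r.text), indent=2))
```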