├── .github
│   ├── dependabot.yml
│   └── workflows
│       ├── pip-audit.yml
│       ├── pythonpublish.yml
│       └── tests.yml
├── LICENSE
├── README.md
├── fluxture
│   ├── __init__.py
│   ├── __main__.py
│   ├── async_utils.py
│   ├── bitcoin.py
│   ├── blockchain.py
│   ├── crawl_schema.py
│   ├── crawler.py
│   ├── db.py
│   ├── fluxture.py
│   ├── geolocation.py
│   ├── kml.py
│   ├── messaging.py
│   ├── serialization.py
│   ├── shodan.py
│   ├── statistics.py
│   ├── structures.py
│   └── topology.py
├── setup.py
└── test
    ├── __init__.py
    ├── test_async_utils.py
    ├── test_bitcoin.py
    ├── test_db.py
    ├── test_statistics.py
    └── test_types.py

--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | 
3 | updates:
4 |   - package-ecosystem: github-actions
5 |     directory: /
6 |     schedule:
7 |       interval: daily

--------------------------------------------------------------------------------
/.github/workflows/pip-audit.yml:
--------------------------------------------------------------------------------
1 | name: Scan dependencies for vulnerabilities with pip-audit
2 | 
3 | on:
4 |   push:
5 |     branches: [ "master" ]
6 |   pull_request:
7 |     branches: [ "master" ]
8 |   schedule:
9 |     - cron: "0 12 * * *"
10 | 
11 | jobs:
12 |   pip-audit:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       - name: Checkout repository
17 |         uses: actions/checkout@v3
18 | 
19 |       - name: Install Python
20 |         uses: actions/setup-python@v4
21 |         with:
22 |           python-version: "3.x"
23 | 
24 |       - name: Install project
25 |         run: |
26 |           python -m venv --upgrade-deps /tmp/pip-audit-env
27 |           source /tmp/pip-audit-env/bin/activate
28 | 
29 |           python -m pip install --upgrade wheel
30 |           python -m pip install .[dev]
31 | 
32 | 
33 |       - name: Run pip-audit
34 |         uses: pypa/gh-action-pip-audit@v1.0.8
35 |         with:
36 |           virtual-environment: /tmp/pip-audit-env
37 | 
38 | 

--------------------------------------------------------------------------------
/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Upload Python Package
5 | 
6 | on:
7 |   release:
8 |     types: [published]
9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v3
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v4
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*

--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Tests
5 | 
6 | on:
7 |   push:
8 |     branches: [ master ]
9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.7", 
"3.8", "3.9", "3.10"] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install setuptools wheel 30 | pip install .[dev] 31 | - name: Lint with flake8 32 | run: | 33 | pip install flake8 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 fluxture test --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 fluxture test --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with pytest 39 | run: | 40 | pip install pytest 41 | pytest 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2019 Trail of Bits 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fluxture
2 | 
3 | [![PyPI version](https://badge.fury.io/py/fluxture.svg)](https://badge.fury.io/py/fluxture)
4 | [![Tests](https://github.com/trailofbits/fluxture/workflows/Tests/badge.svg)](https://github.com/trailofbits/fluxture/actions)
5 | [![Slack Status](https://slack.empirehacking.nyc/badge.svg)](https://slack.empirehacking.nyc)
6 | 
7 | Fluxture is a lightweight crawler for peer-to-peer networks like blockchains. It currently supports the latest version
8 | of the Bitcoin protocol: 70015. It implements the minimum subset of the Bitcoin protocol necessary to collect geographic
9 | and topological information.
10 | 
11 | ## Quickstart
12 | 
13 | ```commandline
14 | pip3 install fluxture
15 | ```
16 | 
17 | Or, to install from source (_e.g._, for development):
18 | 
19 | ```commandline
20 | $ git clone https://github.com/trailofbits/fluxture
21 | $ cd fluxture
22 | $ pip3 install -e '.[dev]'
23 | ```
24 | 
25 | ## Usage
26 | 
27 | To crawl the Bitcoin network, run:
28 | 
29 | ```commandline
30 | fluxture crawl bitcoin --database crawl.db
31 | ```
32 | 
33 | The crawl database is a SQLite database that can be reused between crawls.
34 | 
35 | ## Geolocation
36 | 
37 | Fluxture uses the MaxMind GeoLite2 City database for geolocating nodes based upon their IP address. Various Fluxture
38 | commands will either require a path to the database, or a MaxMind license key (which will be used to automatically
39 | download the database). You can sign up for a free MaxMind license key
40 | [here](https://www.maxmind.com/en/geolite2/signup).
41 | 
42 | A KML file (which can be imported to Google Maps or Google Earth) can be generated from a crawl using:
43 | 
44 | ```commandline
45 | fluxture kml --group-by ip crawl.db output.kml
46 | ```
47 | 
48 | The geolocation database can be updated from MaxMind by running:
49 | 
50 | ```commandline
51 | fluxture update-geo-db
52 | ```
53 | 
54 | An existing crawl database can be re-analyzed for missing or updated geolocations (_e.g._, from an updated MaxMind database) by running:
55 | 
56 | ```commandline
57 | fluxture geolocate crawl.db
58 | ```
59 | 
60 | ## Topological Analysis
61 | 
62 | Fluxture can calculate topological statistics about the centrality of a crawled network by running:
63 | 
64 | ```commandline
65 | fluxture topology crawl.db
66 | ```
67 | 
68 | ## Programmatic Interface
69 | 
70 | ```python
71 | from fluxture.crawl_schema import CrawlDatabase
72 | 
73 | with CrawlDatabase("crawl.db") as db:
74 |     for node in db.nodes:
75 |         print(f"Node {node.ip}:{node.port} {node.state!s}")
76 |         location = node.get_location()
77 |         if location is not None:
78 |             print(f"\tLocation:\t{location.continent_code}\t{location.country_code}\t{location.city}")
79 |         else:
80 |             print("\tLocation:\t?")
81 |         version = node.get_version()
82 |         if version is not None:
83 |             print(f"\tVersion:\t{version.version!s}")
84 |         else:
85 |             print("\tVersion:\t?")
86 |         print(f"\tOut-Edges:\t{', '.join(str(neighbor.ip) for neighbor in node.get_latest_edges())}")
87 | ```
88 | 
89 | ## License and Acknowledgements
90 | 
91 | This research was developed by [Trail of Bits](https://www.trailofbits.com/) based upon work supported by DARPA under
92 | Contract No. HR001120C0084. 
Any opinions, findings and conclusions or recommendations expressed in this material are 93 | those of the authors and do not necessarily reflect the views of the United States Government or DARPA. 94 | It is licensed under the [Apache 2.0 license](LICENSE). © 2020–2021, Trail of Bits. 95 | -------------------------------------------------------------------------------- /fluxture/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | from inspect import isclass 3 | from pathlib import Path 4 | from pkgutil import iter_modules 5 | 6 | # Automatically load all modules in the `fluxture` package, 7 | # so all Fluxture plugins will auto-register themselves: 8 | package_dir = Path(__file__).resolve().parent 9 | for (_, module_name, _) in iter_modules([str(package_dir)]): 10 | # import the module and iterate through its attributes 11 | module = import_module(f"{__name__}.{module_name}") 12 | for attribute_name in dir(module): 13 | attribute = getattr(module, attribute_name) 14 | 15 | if isclass(attribute): 16 | # Add the class to this package's variables 17 | globals()[attribute_name] = attribute 18 | -------------------------------------------------------------------------------- /fluxture/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from typing import Union 4 | 5 | from .fluxture import add_command_subparsers 6 | 7 | 8 | def get_root_logger() -> logging.Logger: 9 | l = logging.getLogger(__name__) 10 | while l.parent: 11 | l = l.parent 12 | return l 13 | 14 | 15 | def setLevel(level: Union[int, str]): 16 | get_root_logger().setLevel(level) 17 | 18 | 19 | def main(): 20 | parser = argparse.ArgumentParser( 21 | description="Fluxture: a peer-to-peer network crawler" 22 | ) 23 | parser.add_argument( 24 | "--debug", action="store_true", help="set the log level to debug" 25 | ) 26 | 27 | add_command_subparsers(parser) 28 | 29 | args = parser.parse_args() 30 | 31 | if args.debug: 32 | setLevel(logging.DEBUG) 33 | else: 34 | setLevel(logging.INFO) 35 | 36 | return args.func(args) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /fluxture/async_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections import deque 3 | from functools import partial, wraps 4 | from threading import Condition, Lock, Thread 5 | from typing import (Any, AsyncIterator, Callable, Coroutine, Deque, Dict, 6 | Generic, Iterable, Iterator, Optional, Tuple, TypeVar) 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | class SyncIteratorWrapper(Generic[T]): 12 | def __init__( 13 | self, 14 | to_wrap: Callable[..., Iterator[T]], 15 | args: Iterable[Any] = (), 16 | kwargs: Dict[str, Any] = {}, 17 | poll_interval: float = 0.5, 18 | ): 19 | self.wrapped: Callable[..., Iterator[T]] = to_wrap 20 | self.args: Tuple[Any, ...] 
= tuple(args) 21 | self.kwargs: Dict[str, Any] = kwargs 22 | self.thread: Optional[Thread] = None 23 | self.condition: Optional[Condition] = None 24 | self.result_queue: Deque[T] = deque() 25 | self.poll_interval: float = poll_interval 26 | 27 | def __getattr__(self, item): 28 | return getattr(self.wrapped, item) 29 | 30 | def __iter__(self): 31 | return self.wrapped(*self.args, **self.kwargs) 32 | 33 | def _run(self): 34 | for result in self.wrapped(*self.args, **self.kwargs): 35 | with self.condition: 36 | self.result_queue.append(result) 37 | 38 | def __aiter__(self): 39 | if self.thread is None: 40 | self.condition: Condition = Condition(Lock()) 41 | self.thread = Thread(target=self._run) 42 | self.thread.start() 43 | return self 44 | 45 | async def __anext__(self): 46 | while True: 47 | with self.condition: 48 | if self.result_queue: 49 | return self.result_queue.popleft() 50 | elif self.thread is None or not self.thread.is_alive(): 51 | # The thread finished and there are no more results 52 | self.thread = None 53 | raise StopAsyncIteration() 54 | await asyncio.sleep(self.poll_interval) 55 | 56 | 57 | def iterator_to_async( 58 | to_wrap: Optional[Callable[..., Iterator[T]]] = None, *, poll_interval: float = 0.5 59 | ): 60 | """Decorator to automatically convert a synchronous function that returns an iterator to be asynchronous""" 61 | if to_wrap is None: 62 | # this will happen if the user optionally passes a `poll_interval` argument 63 | return partial(iterator_to_async, poll_interval=poll_interval) 64 | 65 | @wraps(to_wrap) 66 | def wrapper(*args, **kwargs): 67 | return SyncIteratorWrapper(to_wrap, args, kwargs, poll_interval=poll_interval) 68 | 69 | return wrapper 70 | 71 | 72 | def sync_to_async( 73 | to_wrap: Optional[Callable[..., T]] = None, *, poll_interval: float = 0.5 74 | ) -> Callable[..., Coroutine[Any, Any, T]]: 75 | """Decorator to automatically convert a synchronous function to be asynchronous""" 76 | if to_wrap is None: 77 | # this will happen if the user optionally passes a `poll_interval` argument 78 | return partial(sync_to_async, poll_interval=poll_interval) 79 | 80 | class WrapperThread(Thread): 81 | def __init__(self, *args, **kwargs): 82 | super().__init__() 83 | self.args = args 84 | self.kwargs = kwargs 85 | self.result: Optional[T] = None 86 | 87 | def run(self): 88 | self.result = to_wrap(*self.args, **self.kwargs) 89 | 90 | @wraps(to_wrap) 91 | async def wrapper(*args, **kwargs) -> T: 92 | thread = WrapperThread(*args, **kwargs) 93 | thread.start() 94 | while thread.is_alive(): 95 | await asyncio.sleep(poll_interval) 96 | return thread.result 97 | 98 | return wrapper 99 | -------------------------------------------------------------------------------- /fluxture/bitcoin.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import socket 3 | import sys 4 | from abc import ABC 5 | from hashlib import sha256 6 | from ipaddress import IPv4Address, IPv6Address 7 | from logging import getLogger 8 | from time import time as current_time 9 | from typing import (AsyncIterator, Dict, FrozenSet, Generic, KeysView, List, 10 | Optional, Set, Tuple, Type, TypeVar, Union) 11 | 12 | import fluxture.structures 13 | 14 | from . 
import serialization
15 | from .blockchain import (Blockchain, BlockchainError, Miner, Node, Version,
16 |                          get_public_ip)
17 | from .messaging import BinaryMessage
18 | from .serialization import ByteOrder, P, UnpackError
19 | from .shodan import SearchQuery, ShodanResult, get_api
20 | 
21 | log = getLogger(__file__)
22 | 
23 | BITCOIN_MAINNET_MAGIC = b"\xf9\xbe\xb4\xd9"
24 | 
25 | 
26 | NODE_QUERY = SearchQuery.register(
27 |     name="BitcoinNode", query='port:8333 product:"/Satoshi:*/"'
28 | )
29 | MINER_QUERY = SearchQuery.register(name="BitcoinMiner", query="antminer")
30 | 
31 | 
32 | B = TypeVar("B", bound="BitcoinMessage")
33 | 
34 | 
35 | class BitcoinError(BlockchainError):
36 |     pass
37 | 
38 | 
39 | class BitcoinMessageHeader(BinaryMessage):
40 |     non_serialized = "byte_order"
41 |     byte_order = ByteOrder.LITTLE
42 | 
43 |     magic: serialization.SizedByteArray[4]
44 |     command: serialization.SizedByteArray[12]
45 |     length: serialization.UInt32
46 |     checksum: serialization.SizedByteArray[4]
47 | 
48 |     @property
49 |     def decoded_command(self) -> str:
50 |         decoded = self.command.decode("utf-8")
51 |         first_null_byte = decoded.find("\0")
52 |         if any(c != "\0" for c in decoded[first_null_byte:]):
53 |             raise ValueError(
54 |                 f"Command name {self.command!r} includes bytes after the null terminator!"
55 |             )
56 |         return decoded[:first_null_byte]
57 | 
58 |     @classmethod
59 |     async def next_message(
60 |         cls, reader: asyncio.StreamReader
61 |     ) -> Optional["BitcoinMessageHeader"]:
62 |         data = await reader.read(4 + 12 + serialization.UInt32.BYTES + 4)
63 |         if not data:
64 |             return None
65 |         return cls.deserialize(data)
66 | 
67 |     def __repr__(self):
68 |         return (
69 |             f"{self.__class__.__name__}(magic={self.magic!r}, command={self.decoded_command!r}, "
70 |             f"length={self.length!r}, checksum={self.checksum!r})"
71 |         )
72 | 
73 | 
74 | MESSAGES_BY_COMMAND: Dict[str, Type["BitcoinMessage"]] = {}
75 | 
76 | 
77 | def bitcoin_checksum(payload: bytes) -> bytes:
78 |     return sha256(sha256(payload).digest()).digest()[:4]
79 | 
80 | 
81 | class BitcoinMessage(BinaryMessage, ABC):
82 |     non_serialized = "byte_order", "command"
83 |     byte_order = ByteOrder.LITTLE
84 |     command: Optional[str] = None
85 | 
86 |     def __init_subclass__(cls, **kwargs):
87 |         if cls.command is None:
88 |             raise TypeError(
89 |                 f"{cls.__name__} extends BitcoinMessage but does not specify a command string!"
90 |             )
91 |         elif cls.command in MESSAGES_BY_COMMAND:
92 |             raise TypeError(
93 |                 f"The command {cls.command} is already registered to message class "
94 |                 f"{MESSAGES_BY_COMMAND[cls.command]}"
95 |             )
96 |         MESSAGES_BY_COMMAND[cls.command] = cls
97 | 
98 |     def serialize(self) -> bytes:
99 |         payload = super().serialize()
100 |         return (
101 |             BitcoinMessageHeader(
102 |                 magic=BITCOIN_MAINNET_MAGIC,
103 |                 command=self.command.encode("utf-8"),
104 |                 length=len(payload),
105 |                 checksum=bitcoin_checksum(payload),
106 |             ).serialize()
107 |             + payload
108 |         )
109 | 
110 |     @classmethod
111 |     def deserialize_partial(
112 |         cls, data: bytes, header: Optional[BitcoinMessageHeader] = None
113 |     ) -> Tuple["BitcoinMessage", bytes]:
114 |         if header is None:
115 |             header, payload = BitcoinMessageHeader.unpack_partial(
116 |                 data, byte_order=BitcoinMessageHeader.byte_order
117 |             )
118 |         else:
119 |             payload = data
120 |         if header.magic != BITCOIN_MAINNET_MAGIC:
121 |             raise ValueError(
122 |                 f"Message header magic was {header.magic}, but expected {BITCOIN_MAINNET_MAGIC!r} "
123 |                 "for Bitcoin mainnet!"
124 |             )
125 |         elif header.length > len(payload):
126 |             raise ValueError(
127 |                 f"Invalid payload length of {len(payload)}; expected at least {header.length} bytes"
128 |             )
129 |         elif header.decoded_command not in MESSAGES_BY_COMMAND:
130 |             raise NotImplementedError(
131 |                 f'TODO: Implement Bitcoin command "{header.command}"'
132 |             )
133 |         payload, remainder = payload[: header.length], payload[header.length :]
134 |         decoded_command = header.decoded_command
135 |         expected_checksum = bitcoin_checksum(payload)
136 |         if header.checksum != expected_checksum:
137 |             raise ValueError(
138 |                 f"Invalid message checksum; got {header.checksum!r} but expected {expected_checksum!r}"
139 |             )
140 |         return (
141 |             MESSAGES_BY_COMMAND[decoded_command].unpack(
142 |                 payload, MESSAGES_BY_COMMAND[decoded_command].byte_order
143 |             ),
144 |             remainder,
145 |         )
146 | 
147 |     @classmethod
148 |     def deserialize(cls, data: bytes) -> "BitcoinMessage":
149 |         message, remainder = cls.deserialize_partial(data)
150 |         if remainder:
151 |             raise ValueError(f"Unexpected bytes trailing message: {remainder!r}")
152 |         return message
153 | 
154 |     @classmethod
155 |     async def next_message(
156 |         cls, reader: asyncio.StreamReader
157 |     ) -> Optional["BitcoinMessage"]:
158 |         header = await BitcoinMessageHeader.next_message(reader)
159 |         if header is None:
160 |             return None
161 |         try:
162 |             payload = await reader.readexactly(header.length)
163 |         except asyncio.IncompleteReadError:
164 |             raise ValueError(
165 |                 f"Expected {header.length} bytes when reading the message with header {header!r}"
166 |             )
167 |         msg, remainder = cls.deserialize_partial(payload, header=header)
168 |         assert len(remainder) == 0
169 |         return msg
170 | 
171 | 
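# Editor's note (added example; not part of the original module): VarInt below
# implements Bitcoin's variable-length "CompactSize" integer encoding, in which
# a one-byte tag selects the payload width. Assuming the little-endian default
# used throughout this module, a few round-trip values illustrate the format:
#
#     VarInt(0xFC).pack()   == b"\xfc"                  # values < 0xFD fit in the tag byte
#     VarInt(0xFFFF).pack() == b"\xfd\xff\xff"          # 0xFD tag + uint16 payload
#     VarInt(70015).pack()  == b"\xfe\x7f\x11\x01\x00"  # 0xFE tag + uint32 payload
#
#     VarInt.unpack_partial(b"\xfd\xff\xff" + b"rest")  == (VarInt(0xFFFF), b"rest")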
172 | class VarInt(int, serialization.AbstractPackable):
173 |     def __new__(cls, value: int):
174 |         return int.__new__(cls, value)
175 | 
176 |     def pack(
177 |         self, byte_order: serialization.ByteOrder = serialization.ByteOrder.LITTLE
178 |     ) -> bytes:
179 |         value = int(self)
180 |         if value < 0xFD:
181 |             return bytes([value])
182 |         elif value <= 0xFFFF:
183 |             return b"\xFD" + serialization.UInt16(value).pack(byte_order)
184 |         elif value <= 0xFFFFFFFF:
185 |             return b"\xFE" + serialization.UInt32(value).pack(byte_order)
186 |         elif value <= serialization.UInt64.MAX_VALUE:
187 |             return b"\xFF" + serialization.UInt64(value).pack(byte_order)
188 |         else:
189 |             raise ValueError(
190 |                 f"Value {value} must be less than {serialization.UInt64.MAX_VALUE}"
191 |             )
192 | 
193 |     @classmethod
194 |     def unpack_partial(
195 |         cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.LITTLE
196 |     ) -> Tuple[P, bytes]:
197 |         if data[0] < 0xFD:
198 |             return cls(data[0]), data[1:]
199 |         elif data[0] == 0xFD:
200 |             return (
201 |                 cls(serialization.UInt16.unpack(data[1:3], byte_order=byte_order)),
202 |                 data[3:],
203 |             )
204 |         elif data[0] == 0xFE:
205 |             return (
206 |                 cls(serialization.UInt32.unpack(data[1:5], byte_order=byte_order)),
207 |                 data[5:],
208 |             )
209 |         elif data[0] == 0xFF:
210 |             return (
211 |                 cls(serialization.UInt64.unpack(data[1:9], byte_order=byte_order)),
212 |                 data[9:],
213 |             )
214 |         else:
215 |             raise UnpackError(f"Unexpected data: {data!r}")
216 | 
217 |     @classmethod
218 |     async def read(
219 |         cls: Type[P],
220 |         reader: asyncio.StreamReader,
221 |         byte_order: ByteOrder = ByteOrder.NETWORK,
222 |     ) -> P:
223 |         first_byte = await reader.read(1)
224 |         if len(first_byte) < 1:
225 |             raise BitcoinError()
226 |         elif first_byte[0] < 0xFD:
227 |             return cls(first_byte[0])
228 |         elif first_byte[0] == 0xFD:
229 |             data_type = 
serialization.UInt16 230 | elif first_byte[0] == 0xFE: 231 | data_type = serialization.UInt32 232 | elif first_byte[0] == 0xFF: 233 | data_type = serialization.UInt64 234 | else: 235 | raise BitcoinError() 236 | return cls(await data_type.read(reader, byte_order=byte_order)) 237 | 238 | 239 | class VarStr(bytes, serialization.AbstractPackable): 240 | def __new__(cls, value: bytes): 241 | return bytes.__new__(cls, value) 242 | 243 | def pack( 244 | self, byte_order: serialization.ByteOrder = serialization.ByteOrder.LITTLE 245 | ) -> bytes: 246 | return VarInt(len(self)).pack(byte_order) + self 247 | 248 | @classmethod 249 | def unpack_partial( 250 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.LITTLE 251 | ) -> Tuple[P, bytes]: 252 | length, remainder = VarInt.unpack_partial(data, byte_order=byte_order) 253 | if len(remainder) < length: 254 | raise UnpackError( 255 | f"Expected a byte sequence of length {length} but instead got {remainder!r}" 256 | ) 257 | return remainder[:length], remainder[length:] 258 | 259 | @classmethod 260 | async def read( 261 | cls: Type[P], 262 | reader: asyncio.StreamReader, 263 | byte_order: ByteOrder = ByteOrder.NETWORK, 264 | ) -> P: 265 | length = await VarInt.read(reader, byte_order=byte_order) 266 | string = await reader.read(length) 267 | if len(string) < length: 268 | raise UnpackError( 269 | f"Expected a byte sequence of length {length} but instead got {string!r}" 270 | ) 271 | return string 272 | 273 | 274 | class NetAddr(fluxture.structures.PackableStruct): 275 | services: serialization.UInt64 276 | ip: serialization.BigEndian[serialization.IPv6Address] 277 | port: serialization.BigEndian[serialization.UInt16] 278 | 279 | def __init__( 280 | self, 281 | services: int = 0, 282 | ip: Optional[Union[serialization.IPv6Address, str, bytes]] = None, 283 | port: int = 8333, 284 | ): 285 | if ip is None: 286 | ip = get_public_ip() 287 | if not isinstance(ip, serialization.IPv6Address): 288 | # IP is big-endian in Bitcoin 289 | ip = serialization.IPv6Address(ip) 290 | super().__init__(services=services, ip=ip, port=port) 291 | 292 | 293 | class NetIP(fluxture.structures.PackableStruct): 294 | time: serialization.UInt32 295 | addr: NetAddr 296 | 297 | def __init__(self, time: Optional[int] = None, addr: Optional[NetAddr] = None): 298 | if time is None: 299 | time = int(current_time()) 300 | if addr is None: 301 | addr = NetAddr() 302 | super().__init__(time=time, addr=addr) 303 | 304 | 305 | class VerackMessage(BitcoinMessage): 306 | command = "verack" 307 | 308 | 309 | class SendHeaders(BitcoinMessage): 310 | command = "sendheaders" 311 | 312 | 313 | class SendCmpct(BitcoinMessage): 314 | command = "sendcmpct" 315 | 316 | announce: serialization.Bool 317 | version: serialization.UInt64 318 | 319 | 320 | class Ping(BitcoinMessage): 321 | command = "ping" 322 | 323 | nonce: serialization.UInt64 324 | 325 | 326 | class Pong(BitcoinMessage): 327 | command = "pong" 328 | 329 | nonce: serialization.UInt64 330 | 331 | 332 | class VersionMessage(BitcoinMessage): 333 | command = "version" 334 | 335 | version: serialization.Int32 336 | services: serialization.UInt64 337 | timestamp: serialization.Int64 338 | addr_recv: NetAddr 339 | addr_from: NetAddr 340 | nonce: serialization.UInt64 341 | user_agent: VarStr 342 | start_height: serialization.Int32 343 | relay: serialization.Bool 344 | 345 | def __str__(self): 346 | try: 347 | s = self.user_agent.decode("utf-8") 348 | except UnicodeDecodeError: 349 | s = repr(self.user_agent) 350 | return 
f"{int(self.version)} {s}" 351 | 352 | 353 | class FeeFilter(BitcoinMessage): 354 | command = "feefilter" 355 | 356 | feerate: serialization.UInt64 357 | 358 | 359 | class GetAddrMessage(BitcoinMessage): 360 | command = "getaddr" 361 | 362 | 363 | class AbstractList(list, Generic[P], List[P], serialization.AbstractPackable, ABC): 364 | ELEMENT_TYPE: Type[P] 365 | 366 | def __new__(cls, *args, **kwargs): 367 | return list.__new__(cls, *args, **kwargs) 368 | 369 | def pack(self, byte_order: ByteOrder = ByteOrder.NETWORK) -> bytes: 370 | return VarInt(len(self)).pack(byte_order) + b"".join( 371 | element.pack(byte_order) for element in self 372 | ) 373 | 374 | @classmethod 375 | def unpack_partial( 376 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 377 | ) -> Tuple[P, bytes]: 378 | length, remainder = VarInt.unpack_partial(data, byte_order) 379 | num_bytes = length * cls.ELEMENT_TYPE.num_bytes 380 | if num_bytes > len(remainder): 381 | raise UnpackError(f"Expected {num_bytes} bytes, but got {remainder!r}") 382 | iters = [iter(remainder[:num_bytes])] * cls.ELEMENT_TYPE.num_bytes 383 | return ( 384 | cls( 385 | cls.ELEMENT_TYPE.unpack(bytes(data), byte_order=byte_order) 386 | for data in zip(*iters) 387 | ), 388 | remainder[num_bytes:], 389 | ) 390 | 391 | @classmethod 392 | async def read( 393 | cls: Type[P], 394 | reader: asyncio.StreamReader, 395 | byte_order: ByteOrder = ByteOrder.NETWORK, 396 | ) -> P: 397 | length = await VarInt.read(reader, byte_order) 398 | return cls( 399 | cls.ELEMENT_TYPE.read(reader, byte_order=byte_order) for _ in range(length) 400 | ) 401 | 402 | 403 | class AddressList(AbstractList[NetIP]): 404 | ELEMENT_TYPE = NetIP 405 | 406 | 407 | class Identifier(serialization.UInt32): 408 | MSG_TX = serialization.UInt32(1) 409 | MSG_BLOCK = serialization.UInt32(2) 410 | MSG_FILTERED_BLOCK = serialization.UInt32(3) 411 | MSG_CMPCT_BLOCK = serialization.UInt32(4) 412 | MSG_WITNESS_TX = serialization.UInt32((1 << 30) | 1) 413 | MSG_WITNESS_BLOCK = serialization.UInt32((1 << 30) | 2) 414 | MSG_FILTERED_WITNESS_BLOCK = serialization.UInt32((1 << 30) | 3) 415 | 416 | 417 | class Inventory(fluxture.structures.PackableStruct): 418 | identifier: Identifier 419 | hash: serialization.SizedByteArray[32] 420 | 421 | 422 | class Inventories(AbstractList[Inventory]): 423 | ELEMENT_TYPE = Inventory 424 | 425 | 426 | class InvMessage(BitcoinMessage): 427 | command = "inv" 428 | 429 | inventories: Inventories 430 | 431 | 432 | class AddrMessage(BitcoinMessage): 433 | command = "addr" 434 | 435 | addresses: AddressList 436 | 437 | 438 | class BitcoinNode(Node): 439 | def __init__( 440 | self, 441 | address: Union[str, IPv4Address, IPv6Address], 442 | port: int = 8333, 443 | source: str = "peer", 444 | ): 445 | super().__init__(address, port, source) 446 | self.connected: bool = False 447 | self.connecting: bool = False 448 | self.version: Optional[VersionMessage] = None 449 | 450 | async def receive_message(self) -> Optional["BitcoinMessage"]: 451 | return await BitcoinMessage.next_message(await self.reader) 452 | 453 | async def connect(self): 454 | if self.connected or self.connecting: 455 | return 456 | await super().connect() 457 | self.connecting = True 458 | t = int(current_time()) 459 | await self.send_message( 460 | VersionMessage( 461 | version=70015, 462 | services=0, 463 | timestamp=t, 464 | addr_recv=NetAddr(ip=self.address, port=self.port), 465 | addr_from=NetAddr(ip="::ffff:127.0.0.1", port=8333), 466 | nonce=0, 467 | user_agent=b"fluxture", 468 | 
start_height=0, 469 | relay=True, 470 | ) 471 | ) 472 | async for reply in self.run(): 473 | if isinstance(reply, VerackMessage): 474 | self.connected = True 475 | break 476 | if not self.connected: 477 | raise BitcoinError( 478 | f"Did not receive a Verack message from client {self.address}:{self.port}" 479 | ) 480 | self.connecting = False 481 | 482 | async def get_neighbors(self) -> AddrMessage: 483 | async with self: 484 | await self.send_message(GetAddrMessage()) 485 | async for msg in self.run(): 486 | if isinstance(msg, AddrMessage): 487 | return msg 488 | raise BitcoinError( 489 | f"Node {self.address}:{self.port} closed the connection before replying to our " 490 | "GetAddr message" 491 | ) 492 | 493 | async def get_version(self) -> VersionMessage: 494 | if self.version is not None: 495 | return self.version 496 | async with self: 497 | async for _ in self.run(): 498 | if self.version is not None: 499 | return self.version 500 | raise BitcoinError( 501 | f"Node {self.address}:{self.port} closed the connection before sending us a VersionMessage" 502 | ) 503 | 504 | async def run(self) -> AsyncIterator["BitcoinMessage"]: 505 | async with self: 506 | await self.connect() 507 | while True: 508 | done, pending = await asyncio.wait( 509 | [self.join(), self.receive_message()], 510 | return_when=asyncio.FIRST_COMPLETED, 511 | ) 512 | gather = asyncio.gather(*pending) 513 | gather.cancel() 514 | try: 515 | await gather 516 | except asyncio.CancelledError: 517 | pass 518 | got_message = False 519 | for result in done: 520 | try: 521 | message = result.result() 522 | except (NotImplementedError, ValueError, UnpackError) as e: 523 | sys.stderr.write(f"Warning: {e!s}") 524 | continue 525 | if not isinstance(message, BitcoinMessage): 526 | continue 527 | got_message = True 528 | if self.is_running: 529 | # print(f"{self.address}:{self.port} {message}") 530 | if isinstance(message, VersionMessage): 531 | self.version = message 532 | await self.send_message(VerackMessage()) 533 | elif isinstance(message, Ping): 534 | await self.send_message(Pong(nonce=message.nonce)) 535 | yield message 536 | if not got_message: 537 | break 538 | 539 | 540 | async def collect_addresses(url: str, port: int = 8333) -> Tuple[BitcoinNode, ...]: 541 | return tuple( 542 | BitcoinNode(addr[4][0], source="seed") 543 | for addr in await asyncio.get_running_loop().getaddrinfo( 544 | url, port, proto=socket.IPPROTO_TCP 545 | ) 546 | ) 547 | 548 | 549 | async def collect_defaults( 550 | *args: Union[Tuple[str], Tuple[str, int]], use_shodan: bool = True 551 | ) -> AsyncIterator[BitcoinNode]: 552 | yielded: Set[IPv6Address] = set() 553 | futures = [asyncio.ensure_future(collect_addresses(*arg)) for arg in args] 554 | if use_shodan: 555 | shodan_iterator: Optional[AsyncIterator[ShodanResult]] = NODE_QUERY.run_async( 556 | get_api() 557 | ).__aiter__() 558 | futures.append(asyncio.ensure_future(shodan_iterator.__anext__())) 559 | else: 560 | shodan_iterator = None 561 | shodan_results = 0 562 | bitcoin_seeds = 0 563 | while futures: 564 | done, pending = await asyncio.wait(futures, return_when=asyncio.FIRST_COMPLETED) 565 | futures = list(pending) 566 | for result in await asyncio.gather(*done, return_exceptions=True): 567 | if isinstance(result, StopAsyncIteration): 568 | shodan_iterator = None 569 | continue 570 | elif isinstance(result, ShodanResult) and shodan_iterator is not None: 571 | if result.ip not in yielded: 572 | yield BitcoinNode(result.ip, source="shodan") 573 | yielded.add(result.ip) 574 | shodan_results += 1 
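                # Editor's note (added comment): re-arm the Shodan iterator by
                # scheduling its next __anext__() so that results keep streaming
                # in concurrently with the DNS-seed lookups.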
575 | futures.append(asyncio.ensure_future(shodan_iterator.__anext__())) 576 | elif isinstance(result, Exception): 577 | sys.stderr.write(f"{result!s}\n") 578 | continue 579 | else: 580 | # this should be an iterable of BitcoinNodes 581 | for node in result: # type: ignore 582 | assert isinstance(node, BitcoinNode) 583 | bitcoin_seeds += 1 584 | if node.address not in yielded: 585 | yield node 586 | yielded.add(node.address) 587 | sys.stderr.write("Got ") 588 | if use_shodan: 589 | sys.stderr.write(f"{shodan_results} seed nodes from Shodan and ") 590 | sys.stderr.write(f"{bitcoin_seeds} official Bitcoin seed nodes\n") 591 | 592 | 593 | class Bitcoin(Blockchain[BitcoinNode]): 594 | name = "bitcoin" 595 | node_type = BitcoinNode 596 | _miner_query_lock: Optional[asyncio.Lock] = None 597 | _miners: Optional[Dict[IPv6Address, ShodanResult]] = None 598 | _finished_miners_query: bool = False 599 | 600 | @classmethod 601 | async def default_seeds(cls) -> AsyncIterator[BitcoinNode]: 602 | return collect_defaults( 603 | ("dnsseed.bitcoin.dashjr.org",), 604 | ("dnsseed.bluematt.me",), 605 | ("seed.bitcoin.jonasschnelli.ch",), 606 | ("seed.bitcoin.sipa.be",), 607 | ("seed.bitcoinstats.com",), 608 | ("seed.btc.petertodd.org",), 609 | ) 610 | 611 | async def get_version(self, node: BitcoinNode) -> Optional[Version]: 612 | try: 613 | version = await node.get_version() 614 | return Version(str(version), version.timestamp) 615 | except BitcoinError: 616 | return None 617 | 618 | async def get_neighbors(self, node: BitcoinNode) -> FrozenSet[BitcoinNode]: 619 | assert node.is_running 620 | neighbor_addrs = await node.get_neighbors() 621 | return frozenset( 622 | BitcoinNode(addr.addr.ip, addr.addr.port) 623 | for addr in neighbor_addrs.addresses 624 | if addr.addr.ip != node.address or addr.addr.port != node.port 625 | ) 626 | 627 | async def get_miner_ips(self) -> KeysView[IPv6Address]: 628 | if self._miner_query_lock is None: 629 | self._miner_query_lock = asyncio.Lock() 630 | await self._miner_query_lock.acquire() 631 | if self._miners is None: 632 | self._miners = {} 633 | self._miner_query_lock.release() 634 | async for miner in MINER_QUERY.run_async(get_api()): 635 | async with self._miner_query_lock: 636 | self._miners[miner.ip] = miner 637 | async with self._miner_query_lock: 638 | self._finished_miners_query = True 639 | else: 640 | self._miner_query_lock.release() 641 | return self._miners.keys() 642 | 643 | async def get_miners(self) -> FrozenSet[BitcoinNode]: 644 | return frozenset(BitcoinNode(ip) for ip in await self.get_miner_ips()) 645 | 646 | async def is_miner(self, node: BitcoinNode) -> Miner: 647 | if self._miner_query_lock is None: 648 | self._miner_query_lock = asyncio.Lock() 649 | async with self._miner_query_lock: 650 | is_miner = ( 651 | self._miners is not None 652 | and self._finished_miners_query 653 | and node.address in self._miners 654 | ) 655 | if is_miner or node.address in await self.get_miner_ips(): 656 | return Miner.MINER 657 | else: 658 | return Miner.UNKNOWN 659 | -------------------------------------------------------------------------------- /fluxture/blockchain.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import socket 3 | from abc import ABCMeta, abstractmethod 4 | from ipaddress import IPv4Address, IPv6Address, ip_address 5 | from typing import (AsyncIterator, Dict, FrozenSet, Generic, Optional, Tuple, 6 | Type, TypeVar, Union) 7 | 8 | from . 
import serialization 9 | from .messaging import Message 10 | 11 | 12 | class BlockchainError(RuntimeError): 13 | pass 14 | 15 | 16 | class Miner(serialization.IntEnum): 17 | UNKNOWN = 0 18 | MINER = 1 19 | NOT_MINER = 2 20 | 21 | 22 | def get_public_ip() -> Union[IPv4Address, IPv6Address]: 23 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 24 | s.connect(("8.8.8.8", 80)) 25 | try: 26 | return ip_address(s.getsockname()[0]) 27 | finally: 28 | s.close() 29 | 30 | 31 | class Version: 32 | def __init__(self, version: str, timestamp: int): 33 | self.version: str = version 34 | self.timestamp: int = timestamp 35 | 36 | 37 | class Node(metaclass=ABCMeta): 38 | def __init__( 39 | self, 40 | address: Union[str, bytes, IPv4Address, IPv6Address], 41 | port: int, 42 | source: str = "peer", 43 | ): 44 | if not isinstance(address, IPv6Address): 45 | self.address: IPv6Address = serialization.IPv6Address(address) 46 | else: 47 | self.address = address 48 | self.port: int = port 49 | self.source: str = source 50 | self._reader: Optional[asyncio.StreamReader] = None 51 | self._writer: Optional[asyncio.StreamWriter] = None 52 | self._entries: int = 0 53 | self._stop: Optional[asyncio.Event] = None 54 | 55 | @property 56 | def is_running(self) -> bool: 57 | return ( 58 | self._reader is not None 59 | and self._stop is not None 60 | and not self._stop.is_set() 61 | ) 62 | 63 | def terminate(self): 64 | if self._stop is not None: 65 | self._stop.set() 66 | 67 | async def join(self): 68 | if self._stop is not None: 69 | await self._stop.wait() 70 | 71 | @property 72 | async def reader(self) -> asyncio.StreamReader: 73 | if self._reader is None: 74 | await self.connect() 75 | return self._reader 76 | 77 | @property 78 | async def writer(self) -> asyncio.StreamWriter: 79 | if self._writer is None: 80 | await self.connect() 81 | return self._writer 82 | 83 | async def connect(self): 84 | if self._reader is None: 85 | self._reader, self._writer = await asyncio.open_connection( 86 | str(self.address), 87 | self.port, 88 | happy_eyeballs_delay=0.25, # this causes IPv4 and IPv6 attempts to be interleaved 89 | ) 90 | if self._stop is None: 91 | self._stop = asyncio.Event() 92 | elif self._stop.is_set(): 93 | self._stop.clear() 94 | 95 | async def close(self): 96 | if self._writer is not None: 97 | self._writer.close() 98 | try: 99 | await self._writer.wait_closed() 100 | except BrokenPipeError: 101 | # this is expected 102 | pass 103 | self._reader = None 104 | self._writer = None 105 | if not self._stop.is_set(): 106 | self._stop.set() 107 | 108 | async def __aenter__(self): 109 | self._entries += 1 110 | if self._entries == 1 and self._reader is None: 111 | await self.connect() 112 | 113 | async def __aexit__(self, exc_type, exc_val, exc_tb): 114 | self._entries -= 1 115 | if self._entries == 0 and self._reader is not None: 116 | await self.close() 117 | 118 | async def send_message(self, message: Message): 119 | writer = await self.writer 120 | writer.write(message.serialize()) 121 | await writer.drain() 122 | 123 | def __hash__(self): 124 | return hash((self.address, self.port)) 125 | 126 | def __eq__(self, other): 127 | return ( 128 | isinstance(other, Node) 129 | and other.address == self.address 130 | and other.port == self.port 131 | ) 132 | 133 | def __repr__(self): 134 | return ( 135 | f"{self.__class__.__name__}(address={self.address!r}, port={self.port!r})" 136 | ) 137 | 138 | @abstractmethod 139 | async def run(self) -> AsyncIterator[Message]: 140 | raise NotImplementedError() 141 | 142 | 143 | N = 
TypeVar("N", bound=Node) 144 | 145 | 146 | BLOCKCHAINS: Dict[str, Type["Blockchain[Node]"]] = {} 147 | 148 | 149 | class Blockchain(Generic[N], metaclass=ABCMeta): 150 | DEFAULT_SEEDS: Tuple[N, ...] = () 151 | name: str 152 | node_type: Type[N] 153 | 154 | def __init_subclass__(cls, **kwargs): 155 | if not hasattr(cls, "name") or cls.name is None: 156 | raise TypeError("Subclasses of `Blockchain` must define a `name`") 157 | if not hasattr(cls, "node_type") or cls.node_type is None: 158 | raise TypeError("Subclasses of `Blockchain` must define a `node_type`") 159 | BLOCKCHAINS[cls.name] = cls 160 | 161 | @classmethod 162 | @abstractmethod 163 | async def default_seeds(cls) -> AsyncIterator[N]: 164 | raise NotImplementedError() 165 | 166 | @abstractmethod 167 | async def get_neighbors(self, node: N) -> FrozenSet[N]: 168 | raise NotImplementedError() 169 | 170 | @abstractmethod 171 | async def get_version(self, node: N) -> Optional[Version]: 172 | raise NotImplementedError() 173 | 174 | @abstractmethod 175 | async def is_miner(self, node: N) -> Miner: 176 | raise NotImplementedError() 177 | 178 | @abstractmethod 179 | async def get_miners(self) -> FrozenSet[N]: 180 | raise NotImplementedError() 181 | -------------------------------------------------------------------------------- /fluxture/crawl_schema.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from abc import abstractmethod 3 | from ipaddress import IPv4Address 4 | from ipaddress import IPv6Address as IPv6AddressPython 5 | from typing import (Callable, FrozenSet, Generic, Optional, Set, Sized, 6 | TypeVar, Union) 7 | 8 | from .blockchain import Miner, Node, Version 9 | from .db import Cursor, Database, ForeignKey, Model, Table 10 | from .geolocation import Geolocation 11 | from .serialization import DateTime, IntFlag, IPv6Address 12 | 13 | N = TypeVar("N", bound=Node) 14 | 15 | 16 | class HostInfo(Model): 17 | ip: IPv6Address 18 | isp: str 19 | os: str 20 | timestamp: DateTime 21 | 22 | def __hash__(self): 23 | return hash(self.ip) 24 | 25 | 26 | class CrawlState(IntFlag): 27 | UNKNOWN = 0 28 | DISCOVERED = 1 29 | GEOLOCATED = 2 30 | ATTEMPTED_CONNECTION = DISCOVERED | 4 31 | CONNECTION_FAILED = ATTEMPTED_CONNECTION | 8 32 | CONNECTED = ATTEMPTED_CONNECTION | 16 33 | CONNECTION_RESET = CONNECTED | 32 34 | REQUESTED_NEIGHBORS = CONNECTED | 64 35 | GOT_NEIGHBORS = REQUESTED_NEIGHBORS | 128 36 | REQUESTED_VERSION = CONNECTED | 256 37 | GOT_VERSION = REQUESTED_VERSION | 512 38 | 39 | 40 | class CrawledNode(Model["CrawlDatabase"]): 41 | ip: IPv6Address 42 | port: int 43 | is_miner: Miner 44 | state: CrawlState 45 | source: str 46 | 47 | def __hash__(self): 48 | return hash((self.ip, self.port)) 49 | 50 | def get_events(self) -> Cursor["CrawlEvent"]: 51 | return self.db.events.select( 52 | node=self.rowid, order_by="timestamp", order_direction="DESC" 53 | ) 54 | 55 | def get_version(self) -> Optional[Version]: 56 | for version_event in self.db.events.select( 57 | node=self.rowid, 58 | order_by="timestamp", 59 | order_direction="DESC", 60 | limit=1, 61 | event="version", 62 | ): 63 | return Version(version_event.description, version_event.timestamp) 64 | return None 65 | 66 | def get_location(self) -> Optional[Geolocation]: 67 | return self.db.locations.select( 68 | ip=self.ip, order_by="timestamp DESC", limit=1 69 | ).fetchone() 70 | 71 | def last_crawled(self) -> Optional[DateTime]: 72 | max_edge = Cursor( 73 | self.db.edges, 74 | "SELECT a.* FROM edges a LEFT OUTER JOIN edges b ON 
a.from_node = b.from_node AND a.timestamp < b.timestamp "
75 |             "WHERE b.rowid is NULL AND a.from_node = ? LIMIT 1",
76 |             (self.rowid,),
77 |         ).fetchone()
78 |         if max_edge is None:
79 |             return None
80 |         return max_edge.timestamp
81 | 
82 |     def get_latest_edges(self) -> Set["CrawledNode"]:
83 |         return {
84 |             edge.to_node.row
85 |             for edge in Cursor(
86 |                 self.db.edges,
87 |                 "SELECT a.* FROM edges a LEFT OUTER JOIN edges b ON a.from_node = b.from_node AND a.timestamp < b.timestamp "
88 |                 "WHERE b.rowid is NULL AND a.from_node = ?",
89 |                 (self.rowid,),
90 |             )
91 |         }
92 | 
93 |     def out_degree(self) -> int:
94 |         cur = self.db.con.cursor()
95 |         try:
96 |             result = cur.execute(
97 |                 "SELECT count(*) FROM edges a "
98 |                 "LEFT OUTER JOIN edges b ON a.from_node = b.from_node AND a.timestamp < b.timestamp "
99 |                 "WHERE b.rowid is NULL AND a.from_node = ?",
100 |                 (self.rowid,),
101 |             )
102 |             return result.fetchone()[0]
103 |         finally:
104 |             cur.close()
105 | 
106 | 
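# Editor's note (added commentary; not part of the original module): the three
# queries above all use SQL's "greatest-per-group" anti-join idiom: each edge
# row `a` is left-joined to any row `b` for the same from_node with a strictly
# later timestamp, so the rows for which no such `b` exists (b.rowid IS NULL)
# are exactly the edges recorded by the most recent crawl of that node. As a
# standalone sketch:
#
#     SELECT a.* FROM edges a
#     LEFT OUTER JOIN edges b
#         ON a.from_node = b.from_node AND a.timestamp < b.timestamp
#     WHERE b.rowid IS NULL AND a.from_node = ?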
Callable[ 202 | [Union[str, IPv4Address, IPv6AddressPython], int], N 203 | ] = constructor 204 | self.db: CrawlDatabase = db 205 | 206 | def __contains__(self, node: N) -> bool: 207 | return self.db.nodes.select(ip=node.ip, port=node.port).fetchone() is not None 208 | 209 | def __getitem__(self, node: N) -> CrawledNode: 210 | try: 211 | return next(iter(self.db.nodes.select(ip=node.address, port=node.port))) 212 | except StopIteration: 213 | pass 214 | raise KeyError(node) 215 | 216 | def commit(self): 217 | self.db.con.commit() 218 | 219 | def get_node(self, node: N) -> CrawledNode: 220 | try: 221 | return self[node] 222 | except KeyError: 223 | # this is a new node 224 | pass 225 | ret = CrawledNode(ip=node.address, port=node.port, source=node.source) 226 | self.db.nodes.append(ret) 227 | return ret 228 | 229 | def update_node(self, node: CrawledNode): 230 | with self.db: 231 | self.db.nodes.update(node) 232 | 233 | def add_event( 234 | self, 235 | node: CrawledNode, 236 | event: str, 237 | description: str, 238 | timestamp: Optional[DateTime] = None, 239 | ): 240 | with self.db: 241 | if timestamp is None: 242 | timestamp = DateTime() 243 | self.db.events.append( 244 | CrawlEvent( 245 | node=node.rowid, 246 | event=event, 247 | description=description, 248 | timestamp=timestamp, 249 | ) 250 | ) 251 | 252 | def get_neighbors(self, node: N) -> FrozenSet[N]: 253 | return frozenset( 254 | { 255 | self.constructor(neighbor.ip, neighbor.port) 256 | for neighbor in self.get_node(node).get_latest_edges() 257 | } 258 | ) 259 | 260 | def set_neighbors(self, node: N, neighbors: FrozenSet[N]): 261 | with self.db: 262 | crawled_node = self.get_node(node) 263 | timestamp = DateTime() 264 | self.db.edges.extend( 265 | [ 266 | Edge( 267 | from_node=crawled_node, 268 | to_node=self.get_node(neighbor), 269 | timestamp=timestamp, 270 | ) 271 | for neighbor in neighbors 272 | ] 273 | ) 274 | self.add_state(node, CrawlState.GOT_NEIGHBORS) 275 | for neighbor in neighbors: 276 | # Make sure we record that we discovered the neighbor 277 | _ = self.get_node(neighbor) 278 | # (simply getting the node for the neighbor will ensure that its state's "discovered" flag is set) 279 | 280 | def set_location(self, ip: IPv6Address, location: Geolocation): 281 | with self.db: 282 | self.db.locations.append(location) 283 | 284 | def set_miner(self, node: N, miner: Miner): 285 | with self.db: 286 | crawled_node = self.get_node(node) 287 | crawled_node.is_miner = miner 288 | self.db.nodes.update(crawled_node) 289 | 290 | def set_host_info(self, host_info: HostInfo): 291 | with self.db: 292 | self.db.hosts.append(host_info) 293 | 294 | def add_state(self, node: Union[N, CrawledNode], state: CrawlState): 295 | with self.db: 296 | if isinstance(node, CrawledNode): 297 | crawled_node = node 298 | else: 299 | crawled_node = self.get_node(node) 300 | if crawled_node.state & state != state: 301 | crawled_node.state = crawled_node.state | state 302 | self.db.nodes.update(crawled_node) 303 | 304 | def __len__(self) -> int: 305 | return len(self.db.nodes) 306 | -------------------------------------------------------------------------------- /fluxture/crawler.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import errno 3 | import resource 4 | import sys 5 | import traceback 6 | from abc import ABCMeta 7 | from argparse import ArgumentParser, Namespace 8 | from asyncio import Future, ensure_future 9 | from collections import deque 10 | from inspect import isabstract 11 | from 
typing import (Any, AsyncIterator, Coroutine, Deque, Dict, FrozenSet, 12 | Generic, Iterable, List, Optional, Union) 13 | 14 | from geoip2.errors import AddressNotFoundError 15 | from tqdm import tqdm 16 | 17 | from .blockchain import BLOCKCHAINS, Blockchain, BlockchainError, Miner, Node 18 | from .crawl_schema import (Crawl, CrawlDatabase, CrawlState, DatabaseCrawl, 19 | DateTime, N) 20 | from .fluxture import Command 21 | from .geolocation import (GeoIP2Error, GeoIP2Locator, Geolocator, 22 | download_maxmind_db) 23 | 24 | CRAWL_LISTENERS: List["CrawlListener"] = [] 25 | 26 | 27 | class CrawlListener: 28 | has_on_crawl_node: bool = False 29 | has_on_miner: bool = False 30 | has_on_complete: bool = False 31 | 32 | async def on_crawl_node(self, crawler: "Crawler", node: Node): 33 | pass 34 | 35 | async def on_miner(self, crawler: "Crawler", node: Node, miner: Miner): 36 | pass 37 | 38 | async def on_complete(self, crawler: "Crawler"): 39 | pass 40 | 41 | def __init_subclass__(cls, **kwargs): 42 | if not isabstract(cls): 43 | for func in dir(cls): 44 | if func.startswith("on_") and hasattr(CrawlListener, func): 45 | setattr( 46 | cls, 47 | f"has_{func}", 48 | getattr(cls, func) != getattr(CrawlListener, func), 49 | ) 50 | CRAWL_LISTENERS.append(cls()) 51 | 52 | 53 | class MinerTask(CrawlListener): 54 | async def on_crawl_node(self, crawler: "Crawler", node: Node): 55 | is_miner = await crawler.blockchain.is_miner(node) 56 | crawler.crawl.set_miner(node, is_miner) 57 | crawler.add_tasks( 58 | *( 59 | listener.on_miner(crawler, node, is_miner) 60 | for listener in CRAWL_LISTENERS 61 | if listener.has_on_miner 62 | ) 63 | ) 64 | if is_miner == Miner.MINER: 65 | print(f"Node {node} is a miner") 66 | elif is_miner == Miner.NOT_MINER: 67 | print(f"Node {node} is not a miner") 68 | 69 | 70 | class Crawler(Generic[N], metaclass=ABCMeta): 71 | def __init__( 72 | self, 73 | blockchain: Blockchain[N], 74 | crawl: Crawl[N], 75 | geolocator: Optional[Geolocator] = None, 76 | max_connections: Optional[int] = None, 77 | ): 78 | self.blockchain: Blockchain[N] = blockchain 79 | self.crawl: Crawl[N] = crawl 80 | self.geolocator: Optional[Geolocator] = geolocator 81 | self.nodes: Dict[N, N] = {} 82 | if max_connections is None: 83 | max_connections = resource.getrlimit(resource.RLIMIT_NOFILE)[0] // 3 * 2 84 | max_connections = max(max_connections, 1) 85 | self.max_connections: int = max_connections 86 | self.listener_tasks: List[Future] = [] 87 | 88 | async def _crawl_node(self, node: N) -> FrozenSet[N]: 89 | crawled_node = self.crawl.get_node(node) 90 | if ( 91 | self.geolocator is not None 92 | and crawled_node.state & CrawlState.GEOLOCATED != CrawlState.GEOLOCATED 93 | ): 94 | try: 95 | self.crawl.set_location( 96 | node.address, self.geolocator.locate(node.address) 97 | ) 98 | self.crawl.add_state(crawled_node, CrawlState.GEOLOCATED) 99 | except AddressNotFoundError: 100 | pass 101 | if ( 102 | crawled_node.state & CrawlState.ATTEMPTED_CONNECTION 103 | == CrawlState.ATTEMPTED_CONNECTION 104 | ): 105 | raise ValueError(f"Node {node} was already crawled!") 106 | self.crawl.add_state(crawled_node, CrawlState.ATTEMPTED_CONNECTION) 107 | try: 108 | async with node: 109 | self.crawl.add_state(crawled_node, CrawlState.CONNECTED) 110 | neighbors = [] 111 | new_neighbors = set() 112 | self.crawl.add_state(crawled_node, CrawlState.REQUESTED_NEIGHBORS) 113 | for neighbor in await self.blockchain.get_neighbors(node): 114 | if neighbor in self.nodes: 115 | # we have already seen this node 116 | 
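# (`self.nodes` maps every node to its canonical instance: a peer reported
# by several different neighbors is represented by a single object, and the
# lookup below swaps the freshly parsed neighbor for the instance that is
# already being tracked)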
neighbors.append(self.nodes[neighbor]) 117 | else: 118 | self.nodes[neighbor] = neighbor 119 | neighbors.append(neighbor) 120 | new_neighbors.add(neighbor) 121 | self.crawl.set_neighbors(node, frozenset(neighbors)) 122 | self.crawl.add_state( 123 | crawled_node, 124 | CrawlState.GOT_NEIGHBORS | CrawlState.REQUESTED_VERSION, 125 | ) 126 | version = await self.blockchain.get_version(node) 127 | if version is not None: 128 | self.crawl.add_state(crawled_node, CrawlState.GOT_VERSION) 129 | crawled_node = self.crawl.get_node(node) 130 | self.crawl.add_event( 131 | crawled_node, 132 | event="version", 133 | description=version.version, 134 | timestamp=DateTime(version.timestamp), 135 | ) 136 | return frozenset(new_neighbors) 137 | except BrokenPipeError: 138 | self.crawl.add_state(crawled_node, CrawlState.CONNECTION_RESET) 139 | raise 140 | except OSError as e: 141 | if e.errno in ( 142 | errno.ETIMEDOUT, 143 | errno.ECONNREFUSED, 144 | errno.EHOSTDOWN, 145 | errno.EHOSTUNREACH, 146 | ): 147 | # Connection failed 148 | self.crawl.add_state(crawled_node, CrawlState.CONNECTION_FAILED) 149 | else: 150 | # Something happened after we connected (e.g., connection reset by peer) 151 | self.crawl.add_state(crawled_node, CrawlState.CONNECTION_RESET) 152 | raise 153 | finally: 154 | await node.close() 155 | 156 | def add_tasks(self, *tasks: Union[Future, Coroutine[Any, Any, None]]): 157 | for task in tasks: 158 | if isinstance(task, Coroutine): 159 | self.listener_tasks.append(ensure_future(task)) 160 | else: 161 | self.listener_tasks.append(task) 162 | 163 | async def _check_miner(self, node: N): 164 | is_miner = await self.blockchain.is_miner(node) 165 | self.crawl.set_miner(node, is_miner) 166 | return node, is_miner 167 | 168 | async def _crawl(self, seeds: Optional[Iterable[N]] = None): 169 | if seeds is None: 170 | seed_iter: Optional[ 171 | AsyncIterator[N] 172 | ] = await self.blockchain.default_seeds() 173 | queue: Deque[N] = deque() 174 | futures: List[Future] = [ensure_future(seed_iter.__anext__())] 175 | num_seeds = 0 176 | else: 177 | seed_iter = None 178 | queue = deque(seeds) 179 | futures: List[Future] = [] 180 | num_seeds = len(seeds) 181 | num_connected_to = 0 182 | while futures or queue or self.listener_tasks: 183 | print( 184 | f"Discovered {len(self.nodes)} nodes ({num_seeds} seeds); crawled {num_connected_to}; " 185 | f"crawling {len(futures)}; waiting to crawl {len(queue)}..." 
186 | ) 187 | if futures: 188 | waiting_on = futures 189 | done, pending = await asyncio.wait( 190 | waiting_on, return_when=asyncio.FIRST_COMPLETED 191 | ) 192 | futures = list(pending) 193 | for result in await asyncio.gather(*done, return_exceptions=True): 194 | # iterate over all of the new neighbors of the node 195 | if isinstance(result, StopAsyncIteration) and seed_iter is not None: 196 | seed_iter = None 197 | elif isinstance(result, Exception): 198 | # TODO: Save the exception to the database 199 | # self.crawl.add_event(node, event="Exception", description=str(result)) 200 | if isinstance( 201 | result, 202 | ( 203 | ConnectionError, 204 | OSError, 205 | BrokenPipeError, 206 | BlockchainError, 207 | ), 208 | ): 209 | print(str(result)) 210 | else: 211 | traceback.print_tb(result.__traceback__) 212 | print(result) 213 | elif seed_iter is not None and isinstance(result, Node): 214 | # This is a seed 215 | crawled_node = self.crawl.get_node(result) 216 | if crawled_node.source != result.source: 217 | # this means we already organically encountered this node from another peer 218 | # so update its source to be the seed 219 | crawled_node.source = result.source 220 | self.crawl.update_node(crawled_node) 221 | self.crawl.add_state(crawled_node, CrawlState.DISCOVERED) 222 | # Check if we have already encountered this node 223 | queue.append(result) 224 | num_seeds += 1 225 | futures.append(ensure_future(seed_iter.__anext__())) 226 | else: 227 | num_connected_to += 1 228 | queue.extend(result) 229 | if self.listener_tasks: 230 | waiting_on = self.listener_tasks 231 | done, pending = await asyncio.wait( 232 | waiting_on, return_when=asyncio.FIRST_COMPLETED, timeout=0.5 233 | ) 234 | for result in await asyncio.gather(*done, return_exceptions=True): 235 | if isinstance(result, Exception): 236 | # TODO: Save the exception to the database 237 | # self.crawl.add_event(node, event="Exception", description=str(result)) 238 | traceback.print_tb(result.__traceback__) 239 | print(result) 240 | self.listener_tasks = list(pending) 241 | new_nodes_to_crawl = min(self.max_connections - len(futures), len(queue)) 242 | if new_nodes_to_crawl: 243 | nodes_to_crawl = [] 244 | for i in range(new_nodes_to_crawl): 245 | node = queue.popleft() 246 | if node in self.nodes: 247 | nodes_to_crawl.append(self.nodes[node]) 248 | else: 249 | nodes_to_crawl.append(node) 250 | self.nodes[node] = node 251 | futures.extend( 252 | ensure_future(self._crawl_node(node)) for node in nodes_to_crawl 253 | ) 254 | self.add_tasks( 255 | *( 256 | listener.on_crawl_node(crawler=self, node=node) 257 | for node in nodes_to_crawl 258 | for listener in CRAWL_LISTENERS 259 | if listener.has_on_crawl_node 260 | ) 261 | ) 262 | self.crawl.commit() 263 | 264 | for miner in await self.blockchain.get_miners(): 265 | self.crawl.set_miner(miner, Miner.MINER) 266 | 267 | for node in self.nodes.values(): 268 | if node.is_running: 269 | node.terminate() 270 | await node.join() 271 | 272 | self.add_tasks( 273 | *( 274 | listener.on_complete(crawler=self) 275 | for listener in CRAWL_LISTENERS 276 | if listener.has_on_complete 277 | ) 278 | ) 279 | 280 | # wait for the on_complete tasks to finish: 281 | while self.listener_tasks: 282 | waiting_on = self.listener_tasks 283 | done, pending = await asyncio.wait( 284 | waiting_on, return_when=asyncio.FIRST_COMPLETED, timeout=0.5 285 | ) 286 | for result in await asyncio.gather(*done, return_exceptions=True): 287 | if isinstance(result, Exception): 288 | # TODO: Save the exception to the database 289 | 
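# (one possible approach, sketched rather than implemented here: persist the
# failure as a crawl event so it can be queried later, along the lines of the
# commented-out call below; doing so would require resolving which node
# produced the failed future, which is not currently threaded through)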
# self.crawl.add_event(node, event="Exception", description=str(result)) 290 | traceback.print_tb(result.__traceback__) 291 | print(result) 292 | self.listener_tasks = list(pending) 293 | 294 | def do_crawl(self, seeds: Optional[Iterable[N]] = None): 295 | asyncio.run(self._crawl(seeds)) 296 | 297 | def crawl_node(self, node: N) -> FrozenSet[N]: 298 | """Return the neighbors for a single node""" 299 | return asyncio.run(self._crawl_node(node)) 300 | 301 | 302 | CITY_DB_PARSER: ArgumentParser = ArgumentParser(add_help=False) 303 | 304 | CITY_DB_PARSER.add_argument( 305 | "--city-db-path", 306 | "-c", 307 | type=str, 308 | default=None, 309 | help="path to a MaxMind GeoLite2 City database (default is " 310 | "`~/.config/fluxture/geolite2/GeoLite2-City.mmdb`); " 311 | "if omitted and `--maxmind-license-key` is provided, the latest database will be " 312 | "downloaded and saved to the default location; " 313 | "if both options are omitted, then geolocation will not be performed", 314 | ) 315 | CITY_DB_PARSER.add_argument( 316 | "--maxmind-license-key", 317 | type=str, 318 | default=None, 319 | help="License key for automatically downloading a GeoLite2 City database; you can generate " 320 | "a free license key by registering at https://www.maxmind.com/en/geolite2/signup", 321 | ) 322 | 323 | 324 | class UpdateMaxmindDBCommand(Command): 325 | name = "update-geo-db" 326 | help = "download the latest MaxMind GeoLite2 database" 327 | parent_parsers = (CITY_DB_PARSER,) 328 | 329 | def run(self, args: Namespace): 330 | if args.maxmind_license_key is None: 331 | sys.stderr.write("Error: --maxmind-license-key must be provided\n\n") 332 | sys.exit(1) 333 | save_path = download_maxmind_db(args.maxmind_license_key, args.city_db_path) 334 | print(f"Geolocation database saved to {save_path}") 335 | 336 | 337 | class NodeCommand(Command): 338 | name = "node" 339 | help = "connect to and interrogate a specific node" 340 | 341 | def __init_arguments__(self, parser: ArgumentParser): 342 | parser.add_argument( 343 | "BLOCKCHAIN_NAME", 344 | type=str, 345 | help="the name of the blockchain to crawl", 346 | choices=BLOCKCHAINS.keys(), 347 | ) 348 | parser.add_argument( 349 | "IP_ADDRESS", type=str, help="IP address of the node to interrogate" 350 | ) 351 | 352 | def run(self, args: Namespace): 353 | blockchain_type = BLOCKCHAINS[args.BLOCKCHAIN_NAME] 354 | with CrawlDatabase() as db: 355 | for neighbor in sorted( 356 | str(n.address) 357 | for n in Crawler( 358 | blockchain=blockchain_type(), 359 | crawl=DatabaseCrawl(blockchain_type.node_type, db), 360 | ).crawl_node(blockchain_type.node_type(args.IP_ADDRESS)) 361 | ): 362 | print(neighbor) 363 | 364 | 365 | class GeolocateCommand(Command): 366 | name = "geolocate" 367 | help = "re-run geolocation for already crawled nodes (e.g., after a call to the `update-geo-db` command)" 368 | parent_parsers = (CITY_DB_PARSER,) 369 | 370 | def __init_arguments__(self, parser: ArgumentParser): 371 | parser.add_argument( 372 | "CRAWL_DATABASE", type=str, help="path to the crawl database to update" 373 | ) 374 | parser.add_argument( 375 | "--process-all", 376 | "-a", 377 | action="store_true", 378 | help="by default, this command only geolocates " 379 | "nodes that do not already have a " 380 | "location; this option will re-process " 381 | "all nodes", 382 | ) 383 | 384 | def run(self, args: Namespace): 385 | geo = GeoIP2Locator(args.city_db_path, args.maxmind_license_key) 386 | 387 | with CrawlDatabase(args.CRAWL_DATABASE) as db: 388 | added = 0 389 | updated = 0 390 | with 
tqdm(db.nodes, leave=False, desc="geolocating", unit=" nodes") as t: 391 | for node in t: 392 | old_location = node.get_location() 393 | was_none = old_location is None 394 | if not args.process_all and not was_none: 395 | continue 396 | try: 397 | new_location = geo.locate(node.ip) 398 | except AddressNotFoundError: 399 | continue 400 | if new_location is not None: 401 | if was_none: 402 | db.locations.append(new_location) 403 | added += 1 404 | elif any( 405 | a != b 406 | for (field_name_a, a), (field_name_b, b) in zip( 407 | new_location.items(), old_location.items() 408 | ) 409 | if ( 410 | field_name_a != "rowid" 411 | and field_name_b != "rowid" 412 | and field_name_a != "timestamp" 413 | and field_name_b != "timestamp" 414 | ) 415 | ): 416 | # the location was updated 417 | new_location.rowid = old_location.rowid 418 | new_location.db = db 419 | db.locations.update(new_location) 420 | updated += 1 421 | else: 422 | continue 423 | t.desc = f"geolocating ({added} added, {updated} updated)" 424 | print(f"Added {added} new locations and updated {updated} existing ones") 425 | 426 | 427 | class CrawlCommand(Command): 428 | name = "crawl" 429 | help = "crawl a blockchain" 430 | parent_parsers = (CITY_DB_PARSER,) 431 | 432 | def __init_arguments__(self, parser: ArgumentParser): 433 | parser.add_argument( 434 | "--database", 435 | "-db", 436 | type=str, 437 | default=":memory:", 438 | help="path to the crawl database (default is to run in memory)", 439 | ) 440 | max_file_descriptors, _ = resource.getrlimit(resource.RLIMIT_NOFILE) 441 | parser.add_argument( 442 | "--max-connections", 443 | "-m", 444 | type=int, 445 | default=None, 446 | help="the maximum number of connections to open at once during the crawl, capped at " 447 | f"⅔ of `ulimit -n` = {max(max_file_descriptors // 3 * 2, 1)} (default is to use the " 448 | "maximum possible)", 449 | ) 450 | parser.add_argument( 451 | "BLOCKCHAIN_NAME", 452 | type=str, 453 | help="the name of the blockchain to crawl", 454 | choices=BLOCKCHAINS.keys(), 455 | ) 456 | 457 | def run(self, args: Namespace): 458 | try: 459 | geo = GeoIP2Locator(args.city_db_path, args.maxmind_license_key) 460 | except GeoIP2Error as e: 461 | sys.stderr.write(f"Warning: {e}\nCrawl IPs will not be geolocated!\n") 462 | geo = None 463 | 464 | if args.database == ":memory:": 465 | sys.stderr.write( 466 | "Warning: Using an in-memory crawl database. Results will not be saved!\n" 467 | "Run with `--database` to set a path for the database to be saved.\n" 468 | ) 469 | 470 | blockchain_type = BLOCKCHAINS[args.BLOCKCHAIN_NAME] 471 | 472 | if args.max_connections is None: 473 | max_file_handles, _ = resource.getrlimit(resource.RLIMIT_NOFILE) 474 | if sys.stderr.isatty() and sys.stdin.isatty(): 475 | if max_file_handles < 1024: 476 | while True: 477 | sys.stderr.write( 478 | f"`ulimit -n` is {max_file_handles}, which is low and will cause the crawl to " 479 | "be very slow.\nWould you like to increase this value to 32768? 
[Yn] " 480 | ) 481 | choice = input("") 482 | if choice.lower() == "y" or len(choice.strip()) == 0: 483 | resource.setrlimit( 484 | resource.RLIMIT_NOFILE, (32768, resource.RLIM_INFINITY) 485 | ) 486 | max_file_handles, _ = resource.getrlimit( 487 | resource.RLIMIT_NOFILE 488 | ) 489 | break 490 | elif choice.lower() == "n": 491 | break 492 | max_connections = max(max_file_handles // 3 * 2, 1) 493 | else: 494 | max_connections = args.max_connections 495 | 496 | def crawl(): 497 | with CrawlDatabase(args.database) as db: 498 | Crawler( 499 | blockchain=blockchain_type(), 500 | crawl=DatabaseCrawl(blockchain_type.node_type, db), 501 | geolocator=geo, 502 | max_connections=max_connections, 503 | ).do_crawl() 504 | 505 | if geo is None: 506 | crawl() 507 | else: 508 | with geo: 509 | crawl() 510 | -------------------------------------------------------------------------------- /fluxture/db.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from typing import (Any, Dict, Generic, Iterable, Iterator, List, Optional, 3 | OrderedDict, Tuple, Type, TypeVar, Union, cast) 4 | 5 | from .serialization import Packable 6 | from .structures import Struct, StructMeta 7 | 8 | FieldType = Union[bool, int, float, str, bytes, Packable, "ForeignKey"] 9 | 10 | T = TypeVar("T", bound=FieldType) 11 | D = TypeVar("D") 12 | 13 | 14 | class AutoIncrement(int): 15 | initialized: bool = False 16 | 17 | def __new__(cls, *args): 18 | if args and ( 19 | len(args) > 1 20 | or not isinstance(args[0], AutoIncrement) 21 | or args[0].initialized 22 | ): 23 | retval = int.__new__(cls, *args) 24 | setattr(retval, "initialized", True) 25 | else: 26 | retval = int.__new__(cls, 0) 27 | setattr(retval, "initialized", False) 28 | return retval 29 | 30 | def __repr__(self): 31 | if self.initialized: 32 | return f"{self.__class__.__name__}({int(self)})" 33 | else: 34 | return f"{self.__class__.__name__}()" 35 | 36 | 37 | class RowId(int): 38 | initialized: bool = False 39 | 40 | def __new__(cls, *args): 41 | if args and ( 42 | len(args) > 1 or not isinstance(args[0], RowId) or args[0].initialized 43 | ): 44 | retval = int.__new__(cls, *args) 45 | setattr(retval, "initialized", True) 46 | else: 47 | retval = int.__new__(cls, 0) 48 | setattr(retval, "initialized", False) 49 | return retval 50 | 51 | def __repr__(self): 52 | if self.initialized: 53 | return f"{self.__class__.__name__}({int(self)})" 54 | else: 55 | return f"{self.__class__.__name__}()" 56 | 57 | def __eq__(self, other): 58 | return isinstance(other, RowId) and ( 59 | not self.initialized or not other.initialized or int(self) == int(other) 60 | ) 61 | 62 | 63 | class ColumnOptions: 64 | def __init__( 65 | self, 66 | primary_key: Optional[bool] = None, 67 | unique: Optional[bool] = None, 68 | not_null: Optional[bool] = None, 69 | default: Optional[FieldType] = None, 70 | auto_increment: Optional[bool] = None, 71 | ): 72 | self.primary_key: Optional[bool] = primary_key 73 | self.unique: Optional[bool] = unique 74 | self.not_null: Optional[bool] = not_null 75 | self.default: Optional[FieldType] = default 76 | self.auto_increment: Optional[bool] = auto_increment 77 | if self.auto_increment and not self.default: 78 | self.default = AutoIncrement() 79 | 80 | def is_set(self, option_name: str): 81 | a = getattr(self, option_name) 82 | return a is not None and not callable(a) 83 | 84 | def set_options(self) -> Iterator[str]: 85 | return iter( 86 | key_name 87 | for key_name in dir(self) 88 | if not key_name.startswith("_") 
and self.is_set(key_name) 89 | ) 90 | 91 | def items(self) -> Iterator[Tuple[str, Any]]: 92 | return iter( 93 | (key_name, getattr(self, key_name)) for key_name in self.set_options() 94 | ) 95 | 96 | def __or__(self, other: "ColumnOptions") -> "ColumnOptions": 97 | new_options = ColumnOptions(**dict(other.items())) 98 | for key_name, value in self.items(): 99 | if not other.is_set(key_name): 100 | setattr(new_options, key_name, value) 101 | return new_options 102 | 103 | def __sub__(self, other: "ColumnOptions") -> "ColumnOptions": 104 | return ColumnOptions( 105 | **{ 106 | key_name: value 107 | for key_name, value in self.items() 108 | if not other.is_set(key_name) 109 | } 110 | ) 111 | 112 | def type_suffix(self) -> str: 113 | return "".join( 114 | [ 115 | f"{''.join(key.capitalize() for key in key_name.split('_'))}" 116 | f"{''.join(val.capitalize() for val in str(value).replace('(', '').replace(')', '').split(' '))}" 117 | for key_name, value in self.items() 118 | ] 119 | ) 120 | 121 | def sql_modifiers(self) -> str: 122 | modifiers = [] 123 | if self.primary_key: 124 | modifiers.append("PRIMARY KEY") 125 | if self.auto_increment: 126 | modifiers.append("AUTOINCREMENT") 127 | if self.unique: 128 | modifiers.append("UNIQUE") 129 | if self.not_null: 130 | modifiers.append("NOT NULL") 131 | if self.default is not None and not isinstance(self.default, AutoIncrement): 132 | modifiers.append(f"DEFAULT {self.default}") 133 | return " ".join(modifiers) 134 | 135 | def __repr__(self): 136 | args = [f"{key}={value!r}" for key, value in self.items()] 137 | return f"{self.__class__.__name__}({', '.join(args)})" 138 | 139 | 140 | def column_options(ty: Type[T], options: ColumnOptions) -> Type[T]: 141 | if hasattr(ty, "column_options") and ty.column_options is not None: 142 | options = ty.column_options | options 143 | type_suffix = (options - ty.column_options).type_suffix() 144 | else: 145 | type_suffix = options.type_suffix() 146 | return cast( 147 | Type[T], type(f"{ty.__name__}{type_suffix}", (ty,), {"column_options": options}) 148 | ) 149 | 150 | 151 | def primary_key(ty: Type[T]) -> Type[T]: 152 | return column_options(ty, ColumnOptions(primary_key=True)) 153 | 154 | 155 | def unique(ty: Type[T]) -> Type[T]: 156 | return column_options(ty, ColumnOptions(unique=True)) 157 | 158 | 159 | def not_null(ty: Type[T]) -> Type[T]: 160 | return column_options(ty, ColumnOptions(not_null=True)) 161 | 162 | 163 | def default(ty: Type[T], default_value: FieldType) -> Type[T]: 164 | return column_options(ty, ColumnOptions(default=default_value)) 165 | 166 | 167 | COLUMN_TYPES: List[Type[Any]] = [int, str, bytes, float, Packable] 168 | 169 | 170 | S = TypeVar("S") 171 | D = TypeVar("D", bound="Database") 172 | 173 | 174 | class Model(Struct[FieldType], Generic[D]): 175 | non_serialized = "primary_key_name", "_db", "rowid" 176 | primary_key_name: str = "rowid" 177 | _db: Optional[D] = None 178 | rowid: RowId 179 | 180 | @staticmethod 181 | def is_primary_key(cls) -> bool: 182 | return ( 183 | hasattr(cls, "column_options") 184 | and cls.column_options is not None 185 | and cls.column_options.primary_key 186 | ) 187 | 188 | @classmethod 189 | def validate_fields(cls, fields: OrderedDict[str, Type[FieldType]]): 190 | primary_name = None 191 | for field_name, field_type in fields.items(): 192 | if not issubclass(field_type, ForeignKey): 193 | for valid_type in COLUMN_TYPES: 194 | if issubclass(field_type, valid_type): 195 | break 196 | else: 197 | raise TypeError( 198 | f"Database field {field_name} of 
{cls.__name__} is type {field_type!r}, but " 199 | f"must be one of {COLUMN_TYPES!r}" 200 | ) 201 | if Model.is_primary_key(field_type): 202 | if primary_name is not None: 203 | raise TypeError( 204 | f"A model can have at most one primary key, but both {primary_name} and " 205 | f"{field_name} were specified in {cls.__name__}" 206 | ) 207 | primary_name = field_name 208 | if primary_name is not None: 209 | cls.primary_key_name = primary_name 210 | if "rowid" not in fields: 211 | fields["rowid"] = default(RowId, RowId()) 212 | 213 | @property 214 | def in_db(self) -> bool: 215 | return self.rowid.initialized and self._db is not None 216 | 217 | def uninitialized_auto_increments(self) -> Iterator[Tuple[str, AutoIncrement]]: 218 | for key in self.keys(): 219 | value = getattr(self, key) 220 | if isinstance(value, AutoIncrement) and not value.initialized: 221 | yield key, value 222 | 223 | @property 224 | def db(self) -> D: 225 | if self._db is None: 226 | raise ValueError(f"Model {self!r} has not yet been added to a database") 227 | return self._db 228 | 229 | @db.setter 230 | def db(self, db: D): 231 | if self._db is not None: 232 | if self._db != db: 233 | raise ValueError( 234 | f"Model {self!r} is already associated with a different database: {db!r}" 235 | ) 236 | else: 237 | self._db = db 238 | 239 | def to_row(self) -> Iterator[FieldType]: 240 | for key in self.keys(): 241 | value = getattr(self, key) 242 | if ( 243 | isinstance(value, AutoIncrement) or isinstance(value, RowId) 244 | ) and not value.initialized: 245 | yield None 246 | else: 247 | yield getattr(self, key) 248 | 249 | 250 | M = TypeVar("M", bound=Model) 251 | 252 | 253 | def sql_format( 254 | param: Optional[FieldType], expected_type: Optional[Type[FieldType]] = None 255 | ) -> Optional[Union[str, bytes, int, float]]: 256 | if param is None: 257 | if ( 258 | expected_type is not None 259 | and hasattr(expected_type, "column_options") 260 | and expected_type.column_options is not None 261 | and expected_type.column_options.not_null 262 | ): 263 | raise ValueError(f"Field {expected_type!r} cannot be NULL") 264 | return None 265 | elif isinstance(param, Model) and expected_type is not None: 266 | if not issubclass(expected_type, ForeignKey): 267 | raise ValueError( 268 | f"Model {param!r} was expected to be of type {expected_type!r}" 269 | ) 270 | return getattr(param, expected_type.key) 271 | elif isinstance(param, int): 272 | return int(param) 273 | elif isinstance(param, float): 274 | return float(param) 275 | elif isinstance(param, str): 276 | return str(param) 277 | elif isinstance(param, bytes): 278 | return bytes(param) 279 | elif isinstance(param, ForeignKey): 280 | return sql_format(param.key, expected_type) 281 | elif isinstance(param, Packable): 282 | return param.pack() 283 | else: 284 | raise ValueError(f"Unsupported parameter type: {param!r}") 285 | 286 | 287 | class DatabaseConnection(sqlite3.Connection): 288 | def __init__(self, *args, rollback_on_exception: bool = False, **kwargs): 289 | super().__init__(*args, **kwargs) 290 | self.rollback_on_exception: bool = rollback_on_exception 291 | 292 | def execute(self, sql: str, *parameters: COLUMN_TYPES) -> sqlite3.Cursor: 293 | params = [sql_format(p) for p in parameters] 294 | try: 295 | return super().execute(sql, params) 296 | except sqlite3.Error as e: 297 | raise ValueError( 298 | f"Error executing SQL {sql!r} with parameters {params!r}: {e!r}" 299 | ) 300 | 301 | def __enter__(self) -> "DatabaseConnection": 302 | return self 303 | 304 | def __exit__(self, 
exc_type, exc_val, exc_tb): 305 | if exc_type is None or not self.rollback_on_exception: 306 | # no exception occurred 307 | self.commit() 308 | else: 309 | # an exception occurred 310 | self.rollback() 311 | 312 | 313 | class Cursor(Generic[M]): 314 | def __init__( 315 | self, 316 | table: "Table[M]", 317 | sql: str, 318 | params: Iterable[Union[str, int, float, bytes, Packable]] = (), 319 | ): 320 | self.table: Table[M] = table 321 | self.sql: str = sql 322 | self.params: List[Union[str, int, float, bytes]] = [] 323 | for i, p in enumerate(params): 324 | if ( 325 | isinstance(p, str) 326 | or isinstance(p, int) 327 | or isinstance(p, float) 328 | or isinstance(p, bytes) 329 | ): 330 | self.params.append(p) 331 | elif isinstance(p, Packable): 332 | self.params.append(p.pack()) 333 | else: 334 | raise ValueError( 335 | f"Unsupported SQL parameter #{i+1}, {p!r}, when running {sql!r}" 336 | ) 337 | self._item_iter: Optional[Iterator[M]] = None 338 | 339 | def __iter__(self) -> Iterator[M]: 340 | if self._item_iter is None: 341 | self._item_iter = self._iter() 342 | yield from self._item_iter 343 | 344 | def _iter(self) -> Iterator[M]: 345 | with self.table.db: 346 | cur = self.table.db.con.cursor() 347 | try: 348 | for row in cur.execute(self.sql, self.params): 349 | r = self.table.model_type(*row) 350 | r.db = self.table.db 351 | for field_name, field_type in self.table.model_type.FIELDS.items(): 352 | if issubclass(field_type, ForeignKey): 353 | getattr(r, field_name).table = self.table 354 | yield r 355 | finally: 356 | cur.close() 357 | 358 | def fetchone(self) -> Optional[M]: 359 | try: 360 | return next(iter(self)) 361 | except StopIteration: 362 | return None 363 | 364 | def fetchall(self) -> Iterator[M]: 365 | yield from iter(self) 366 | 367 | 368 | class Table(Generic[M]): 369 | model_type: Optional[Type[M]] = None 370 | 371 | def __init__(self, db: "Database", name: str): 372 | if self.model_type is None: 373 | raise TypeError( 374 | f"A Table must be instantiated by subclassing it with a model: `Table[ModelType]`" 375 | ) 376 | self.db: Database = db 377 | self.model_type: Type[M] = self.model_type 378 | self.name: str = name 379 | for field_type in self.model_type.FIELDS.values(): 380 | if isinstance(field_type, ForeignKey): 381 | field_type.table = self 382 | 383 | def __class_getitem__(cls, model_type: Type[M]) -> Type["Table[M]"]: 384 | if isinstance(model_type, TypeVar) or isinstance(model_type, str): 385 | return cls 386 | return cast( 387 | Type[Table[M]], 388 | type( 389 | f"{cls.__name__}{model_type.__name__}", 390 | (cls,), 391 | {"model_type": model_type}, 392 | ), 393 | ) 394 | 395 | def __iter__(self) -> Iterator[M]: 396 | yield from iter(self.select()) 397 | 398 | def select( 399 | self, 400 | distinct: bool = False, 401 | limit: Optional[int] = None, 402 | order_by: Optional[str] = None, 403 | order_direction: str = "ASC", 404 | **kwargs, 405 | ) -> Cursor[M]: 406 | params = [] 407 | where_clauses = [] 408 | for col_name, value in kwargs.items(): 409 | if not isinstance(value, AutoIncrement) or value.initialized: 410 | where_clauses.append(f"{col_name}=?") 411 | params.append(value) 412 | if where_clauses: 413 | clauses = [f"WHERE {' AND '.join(where_clauses)}"] 414 | else: 415 | clauses = [] 416 | if order_by is not None: 417 | clauses.append(f" ORDER BY ? 
{order_direction}") 418 | params.append(order_by) 419 | if limit is not None: 420 | clauses.append(" LIMIT ?") 421 | params.append(limit) 422 | clauses = "".join(clauses) 423 | if clauses: 424 | clauses = " " + clauses 425 | if distinct: 426 | distinct_clause = " DISTINCT" 427 | else: 428 | distinct_clause = "" 429 | return Cursor( 430 | self, f"SELECT{distinct_clause} *, rowid from {self.name}{clauses}", params 431 | ) 432 | 433 | def __len__(self): 434 | with self.db: 435 | cur = self.db.con.cursor() 436 | try: 437 | result = cur.execute(f"SELECT COUNT(*) from {self.name}") 438 | return result.fetchone()[0] 439 | finally: 440 | cur.close() 441 | 442 | def _finalize_added_row(self, row: M): 443 | to_update = [key for key, _ in row.uninitialized_auto_increments()] 444 | if to_update: 445 | try: 446 | obj_in_db = next(iter(self.select(**{"rowid": row.rowid}))) 447 | except StopIteration: 448 | raise ValueError( 449 | f"Row {row} was expected to be in the database in table {self.name}, but was not" 450 | ) 451 | for key in to_update: 452 | setattr(row, key, getattr(obj_in_db, key)) 453 | row.db = self.db 454 | 455 | def append(self, row: M): 456 | self.extend((row,)) 457 | 458 | def extend(self, rows: Iterable[M]): 459 | with self.db: 460 | rows = list(rows) 461 | if not rows: 462 | return 463 | cur = self.db.con.cursor() 464 | try: 465 | # we have to add each row individually so we can set its rowid 466 | for row in rows: 467 | if row.in_db: 468 | raise ValueError(f"Row {row!r} is already in the database!") 469 | result = cur.execute( 470 | f"INSERT INTO {self.name} ({','.join(self.model_type.FIELDS.keys())}) " 471 | f"VALUES ({','.join(['?']*len(self.model_type.FIELDS))})", 472 | tuple( 473 | sql_format(param, expected_type) 474 | for param, expected_type in zip( 475 | row.to_row(), self.model_type.FIELDS.values() 476 | ) 477 | ), 478 | ) 479 | setattr(row, "rowid", RowId(result.lastrowid)) 480 | self._finalize_added_row(row) 481 | finally: 482 | cur.close() 483 | 484 | def update(self, row: M): 485 | if not row.in_db: 486 | raise ValueError(f"Row {row!r} is not yet in the database!") 487 | with self.db: 488 | set_argument = ",".join( 489 | [ 490 | f"{field_name} = ?" 491 | for field_name in self.model_type.FIELDS.keys() 492 | if field_name != "rowid" 493 | ] 494 | ) 495 | new_values = tuple( 496 | sql_format(param, expected_type) 497 | for param, (field_name, expected_type) in zip( 498 | row.to_row(), self.model_type.FIELDS.items() 499 | ) 500 | if field_name != "rowid" 501 | ) 502 | cur = self.db.con.cursor() 503 | try: 504 | cur.execute( 505 | f"UPDATE {self.name} SET {set_argument} WHERE rowid=?", 506 | new_values + (int(row.rowid),), 507 | ) 508 | finally: 509 | cur.close() 510 | 511 | 512 | class ForeignKey(Generic[M]): 513 | row_type: Type[M] 514 | foreign_table_name: str 515 | foreign_col_name: str 516 | table: Optional[Table[M]] = None 517 | 518 | def __init__( 519 | self, key: Union[int, str, bytes, float, M], table: Optional[Table[M]] = None 520 | ): 521 | self._row: Optional[M] = None 522 | if table is not None: 523 | self.table = table 524 | if isinstance(key, Model): 525 | if not hasattr(self, "foreign_col_name"): 526 | raise ValueError( 527 | f"Foreign key {self!r} has not yet been assigned to a table!" 
528 | ) 529 | elif not isinstance(key, self.row_type): 530 | raise ValueError( 531 | f"Foreign key {self!r} was expected to be passed a value of type {self.row_type!r} " 532 | f"but was instead passed {key!r}" 533 | ) 534 | self.key: Union[int, str, bytes, float] = getattr( 535 | key, self.foreign_col_name 536 | ) 537 | else: 538 | self.key = key 539 | 540 | def __class_getitem__( 541 | cls, 542 | arguments: Union[ 543 | TypeVar, Tuple[str, Type[M]], Tuple[str, Type[M], str], Table[M] 544 | ], 545 | ) -> "ForeignKey[M]": 546 | if isinstance(arguments, TypeVar): 547 | return cls 548 | elif isinstance(arguments, Table): 549 | if not hasattr(cls, "foreign_table_name") or not cls.foreign_col_name: 550 | raise ValueError( 551 | "A table can only be passed to a ForeignKey that already has a `foreign_table_name`" 552 | ) 553 | return cast( 554 | ForeignKey[M], 555 | type( 556 | f"{cls.__name__}{arguments.model_type.__name__.capitalize()}" 557 | f"{cls.foreign_col_name.replace('_', '').capitalize()}", 558 | (cls,), 559 | {"table": arguments}, 560 | ), 561 | ) 562 | else: 563 | if ( 564 | not isinstance(arguments, tuple) 565 | or not (2 <= len(arguments) <= 3) 566 | or not isinstance(arguments[0], str) 567 | or not issubclass(arguments[1], Model) 568 | or (len(arguments) == 3 and not isinstance(arguments[2], str)) 569 | ): 570 | raise TypeError( 571 | f"Invalid ForeignKey arguments: {list(arguments)!r}. Expected either two or three " 572 | "arguments: (1) a string for the foreign table name; (2) the `Model` type for that " 573 | "table; and, optionally, (3) the foreign column name. If (3) is omitted, the primary " 574 | "key for the foreign table is used." 575 | ) 576 | if len(arguments) == 3: 577 | table_name, model, row_name = arguments 578 | else: 579 | table_name, model = arguments 580 | row_name = model.primary_key_name 581 | return cast( 582 | ForeignKey[M], 583 | type( 584 | f"{cls.__name__}{model.__name__.capitalize()}{row_name.replace('_', '').capitalize()}", 585 | (cls,), 586 | { 587 | "row_type": model, 588 | "foreign_col_name": row_name, 589 | "foreign_table_name": table_name, 590 | }, 591 | ), 592 | ) 593 | 594 | @classmethod 595 | def key_type(cls) -> Type[Union[int, float, str, bytes, Packable]]: 596 | foreign_type = cls.row_type.FIELDS[cls.foreign_col_name] 597 | if hasattr(cls, "column_options"): 598 | options = {"column_options": cls.column_options} 599 | else: 600 | options = {} 601 | return cast( 602 | Type[Union[int, float, str, bytes, Packable]], 603 | type(f"{foreign_type.__name__}ForeignKey", (foreign_type,), options), 604 | ) 605 | 606 | @property 607 | def row(self) -> M: 608 | if self._row is None: 609 | if self.table is None: 610 | raise ValueError(f"{self.__class__.__name__} must have a `table` set") 611 | foreign_table = getattr(self.table.db, self.foreign_table_name) 612 | self._row = next( 613 | iter(foreign_table.select(**{self.foreign_col_name: self.key})) 614 | ) 615 | return self._row 616 | 617 | def __getattr__(self, item): 618 | return getattr(self.row, item) 619 | 620 | def __eq__(self, other): 621 | if isinstance(other, ForeignKey): 622 | return self.key == other.key 623 | else: 624 | return self.row == other 625 | 626 | def __ne__(self, other): 627 | return not (self == other) 628 | 629 | def __lt__(self, other): 630 | if isinstance(other, ForeignKey): 631 | return self.key < other.key 632 | else: 633 | return self.row < other 634 | 635 | def __hash__(self): 636 | return hash(self.key) 637 | 638 | def __repr__(self): 639 | return 
f"{self.__class__.__name__}(key={self.key!r})" 640 | 641 | 642 | class Database(metaclass=StructMeta[Model]): 643 | def __init__(self, path: str = ":memory:", rollback_on_exception: bool = False): 644 | self.path: str = path 645 | self.con = DatabaseConnection( 646 | self.path, rollback_on_exception=rollback_on_exception 647 | ) 648 | self.tables: Dict[str, Table] = {} 649 | if self.FIELDS: 650 | with self: 651 | for table_name, table_type in self.FIELDS.items(): 652 | setattr(self, table_name, self.create_table(table_name, table_type)) 653 | 654 | @classmethod 655 | def validate_fields(cls, fields: OrderedDict[str, Type[FieldType]]): 656 | for field_name, field_type in fields.items(): 657 | if not issubclass(field_type, Table): 658 | raise TypeError( 659 | f"Database {cls!r} table `{field_name}` was expected to be of type `Table` but " 660 | f"was instead {field_type!r}" 661 | ) 662 | 663 | def __enter__(self: D) -> D: 664 | # self.con.__enter__() 665 | return self 666 | 667 | def __exit__(self, exc_type, exc_val, exc_tb): 668 | # self.con.__exit__(exc_type, exc_val, exc_tb) 669 | self.con.commit() 670 | 671 | def create_table(self, table_name: str, table_type: Type[Table[M]]) -> Table[M]: 672 | columns = [] 673 | table = table_type(self, table_name) 674 | model_type = table_type.model_type 675 | for field_name, field_type in model_type.FIELDS.items(): 676 | if issubclass(field_type, RowId): 677 | continue 678 | elif issubclass(field_type, ForeignKey): 679 | old_field_type = field_type 680 | field_type = field_type.key_type() 681 | if hasattr(old_field_type, "column_options"): 682 | setattr(field_type, "column_options", old_field_type.column_options) 683 | else: 684 | setattr(field_type, "column_options", None) 685 | if issubclass(field_type, int): 686 | data_type = "INTEGER" 687 | elif issubclass(field_type, float): 688 | data_type = "REAL" 689 | elif issubclass(field_type, str): 690 | data_type = "TEXT" 691 | elif issubclass(field_type, bytes) or isinstance(field_type, Packable): 692 | data_type = "BLOB" 693 | else: 694 | raise TypeError( 695 | f"Column {field_name} is of unsupported type {field_type!r}; it must be one of " 696 | f"{COLUMN_TYPES}" 697 | ) 698 | if ( 699 | hasattr(field_type, "column_options") 700 | and field_type.column_options is not None 701 | ): 702 | modifiers = field_type.column_options.sql_modifiers() 703 | if modifiers: 704 | modifiers = f" {modifiers}" 705 | else: 706 | modifiers = "" 707 | columns.append(f"{field_name} {data_type}{modifiers}") 708 | column_constraints = ",\n ".join(columns) 709 | if len(columns) > 1: 710 | column_constraints = "\n " + column_constraints + "\n" 711 | with self: 712 | self.con.execute( 713 | f"CREATE TABLE IF NOT EXISTS {table.name} ({column_constraints});" 714 | ) 715 | self.tables[table_name] = table 716 | return table 717 | -------------------------------------------------------------------------------- /fluxture/fluxture.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, ABCMeta, abstractmethod 2 | from argparse import ArgumentParser, Namespace 3 | from inspect import isabstract 4 | from typing import Dict, Tuple, Type 5 | 6 | PLUGINS: Dict[str, Type["Plugin"]] = {} 7 | COMMANDS: Dict[str, Type["Command"]] = {} 8 | 9 | 10 | class PluginMeta(ABCMeta): 11 | def __init__(cls, name, bases, clsdict): 12 | super().__init__(name, bases, clsdict) 13 | if not isabstract(cls) and name not in ("Plugin", "Command"): 14 | if "name" not in clsdict: 15 | raise TypeError(f"Fluxture 
plugin {name} does not define a name") 16 | elif clsdict["name"] in PLUGINS: 17 | raise TypeError( 18 | f"Cannot instaitiate class {cls.__name__} because a plugin named {name} already exists," 19 | f" implemented by class {PLUGINS[clsdict['name']]}" 20 | ) 21 | PLUGINS[clsdict["name"]] = cls 22 | if issubclass(cls, Command): 23 | if "help" not in clsdict: 24 | raise TypeError( 25 | f"Fluxture command {name} does not define a help string" 26 | ) 27 | COMMANDS[clsdict["name"]] = cls 28 | 29 | 30 | class Plugin(ABC, metaclass=PluginMeta): 31 | name: str 32 | 33 | 34 | class Command(Plugin): 35 | help: str 36 | parent_parsers: Tuple[ArgumentParser, ...] = () 37 | 38 | def __init__(self, argument_parser: ArgumentParser): 39 | self.__init_arguments__(argument_parser) 40 | 41 | def __init_arguments__(self, parser: ArgumentParser): 42 | pass 43 | 44 | @abstractmethod 45 | def run(self, args: Namespace): 46 | raise NotImplementedError() 47 | 48 | 49 | def add_command_subparsers(parser: ArgumentParser): 50 | subparsers = parser.add_subparsers( 51 | title="command", 52 | description="valid fluxture commands", 53 | help="run `fluxture command --help` for help on a specific command", 54 | ) 55 | for name, command_type in COMMANDS.items(): 56 | p = subparsers.add_parser( 57 | name, parents=command_type.parent_parsers, help=command_type.help 58 | ) 59 | p.set_defaults(func=command_type(p).run) 60 | -------------------------------------------------------------------------------- /fluxture/geolocation.py: -------------------------------------------------------------------------------- 1 | import tarfile 2 | import urllib.request 3 | from ipaddress import IPv4Address as PythonIPv4 4 | from ipaddress import IPv6Address as PythonIPv6 5 | from pathlib import Path 6 | from tempfile import NamedTemporaryFile 7 | from typing import Iterator, Optional, Tuple, Union 8 | 9 | try: 10 | from typing import Protocol 11 | except ImportError: 12 | from typing_extensions import Protocol 13 | 14 | import geoip2.database 15 | import great_circle_calculator.great_circle_calculator as gcc 16 | from geoip2.errors import AddressNotFoundError 17 | 18 | from .db import Model 19 | from .serialization import DateTime, IPv6Address 20 | 21 | 22 | class Location: 23 | lat: float 24 | lon: float 25 | 26 | def path_to( 27 | self, destination: "Location", intermediate_points: int = 20 28 | ) -> Iterator[Tuple[int, int]]: 29 | p1, p2 = (self.lon, self.lat), (destination.lon, destination.lat) 30 | yield self.lon, self.lat 31 | for i in range(intermediate_points): 32 | try: 33 | yield gcc.intermediate_point( 34 | p1, p2, (i + 1) / (intermediate_points + 2) 35 | ) 36 | except ZeroDivisionError: 37 | # this probably means p1 == p2 38 | yield self.lon, self.lat 39 | yield destination.lon, destination.lat 40 | 41 | def distance_to(self, destination: "Location", unit: str = "meters"): 42 | return gcc.distance_between_points( 43 | (self.lon, self.lat), (destination.lon, destination.lat), unit=unit 44 | ) 45 | 46 | 47 | class Geolocation(Model, Location): 48 | ip: IPv6Address 49 | city: str 50 | country_code: str 51 | continent_code: str 52 | lat: float 53 | lon: float 54 | timestamp: DateTime 55 | 56 | def __hash__(self): 57 | return hash(self.ip) 58 | 59 | 60 | class Geolocator(Protocol): 61 | def locate( 62 | self, ip: Union[IPv6Address, str, PythonIPv4, PythonIPv6] 63 | ) -> Geolocation: 64 | ... 
65 | 66 | 67 | class GeoIP2Error(RuntimeError): 68 | pass 69 | 70 | 71 | def download_maxmind_db( 72 | maxmind_license_key: str, city_db_path: Optional[str] = None, overwrite: bool = True 73 | ) -> str: 74 | """ 75 | Downloads the latest MaxMind GeoLite2 database returning the path to which it was saved. 76 | 77 | If the path is omitted, the default path is used and returned. 78 | 79 | """ 80 | if city_db_path is None: 81 | city_db_path = ( 82 | Path.home() / ".config" / "fluxture" / "geolite2" / "GeoLite2-City.mmdb" 83 | ) 84 | else: 85 | city_db_path = Path(city_db_path) 86 | if overwrite or not city_db_path.exists(): 87 | if maxmind_license_key is None: 88 | raise GeoIP2Error( 89 | "No MaxMind GeoLite2 database provided; need a `maxmind_license_key` to download it. " 90 | "Sign up for GeoLite2 for free here: https://www.maxmind.com/en/geolite2/signup " 91 | "then, after logging in, generate a license key under the Services menu." 92 | ) 93 | db_dir = city_db_path.parent 94 | if not db_dir.exists(): 95 | db_dir.mkdir(parents=True) 96 | tmpfile = NamedTemporaryFile(mode="wb", delete=False) 97 | try: 98 | with urllib.request.urlopen( 99 | r"https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&" 100 | f"license_key={maxmind_license_key}&suffix=tar.gz" 101 | ) as response: 102 | # We have to write this to a temp file because the gzip decompression requires seekability 103 | while True: 104 | chunk = response.read(1024**2) 105 | if not chunk: 106 | break 107 | tmpfile.write(chunk) 108 | tmpfile.close() 109 | with tarfile.open(tmpfile.name, mode="r:gz") as tar: 110 | geolite_dir = None 111 | for tarinfo in tar: 112 | if tarinfo.isdir(): 113 | geolite_dir = tarinfo.name 114 | if geolite_dir is None: 115 | raise GeoIP2Error("Unexpected GeoLite2 database format") 116 | tar.extractall(str(city_db_path.parent)) 117 | latest_dir = db_dir / "GeoLite2-City_latest" 118 | latest_dir.unlink(missing_ok=True) 119 | latest_dir.symlink_to(db_dir / geolite_dir) 120 | finally: 121 | Path(tmpfile.name).unlink(missing_ok=True) 122 | city_db_path.unlink(missing_ok=True) 123 | city_db_path.symlink_to(latest_dir / "GeoLite2-City.mmdb") 124 | return str(city_db_path) 125 | 126 | 127 | class GeoIP2Locator: 128 | def __init__( 129 | self, 130 | city_db_path: Optional[str] = None, 131 | maxmind_license_key: Optional[str] = None, 132 | ): 133 | self.city_db_path: Path = download_maxmind_db( 134 | maxmind_license_key, city_db_path, overwrite=False 135 | ) # type: ignore 136 | self._geoip: Optional[geoip2.database.Reader] = None 137 | self._entries: int = 0 138 | 139 | def __enter__(self): 140 | if self._entries == 0: 141 | assert self._geoip is None 142 | self._geoip = geoip2.database.Reader(str(self.city_db_path)).__enter__() 143 | self._entries += 1 144 | return self 145 | 146 | def __exit__(self, exc_type, exc_val, exc_tb): 147 | if self._entries == 1: 148 | assert self._geoip is not None 149 | self._geoip.__exit__(exc_type, exc_val, exc_tb) 150 | self._geoip = None 151 | self._entries = max(0, self._entries - 1) 152 | 153 | def locate( 154 | self, ip: Union[IPv6Address, str, PythonIPv4, PythonIPv6] 155 | ) -> Geolocation: 156 | with self: 157 | ipv6 = IPv6Address(ip) 158 | city = self._geoip.city(str(ipv6)) 159 | if city.location.latitude is None or city.location.longitude is None: 160 | raise AddressNotFoundError(str(ip)) 161 | return Geolocation( 162 | ip=ipv6, 163 | city=city.city.name, 164 | country_code=city.country.iso_code, 165 | continent_code=city.continent.code, 166 | 
lat=city.location.latitude, 167 | lon=city.location.longitude, 168 | timestamp=DateTime(), 169 | ) 170 | -------------------------------------------------------------------------------- /fluxture/kml.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from argparse import ArgumentParser, FileType 3 | from collections import defaultdict 4 | from math import pi, sin 5 | from typing import Dict, Iterable, List, Optional, OrderedDict, Set 6 | 7 | from fastkml import IconStyle, LineStyle, Placemark, Style, kml 8 | from fastkml.geometry import Geometry, LineString, Point 9 | from shapely.geometry import MultiPoint 10 | from tqdm import tqdm 11 | 12 | from .blockchain import Miner 13 | from .crawl_schema import CrawledNode 14 | from .crawler import CrawlDatabase 15 | from .fluxture import Command 16 | from .geolocation import Geolocation, Location 17 | from .topology import CrawlGraph, ProbabilisticWeightedCrawlGraph 18 | 19 | KML_NS = "{http://www.opengis.net/kml/2.2}" 20 | EARTH_CIRCUMFERENCE = 40075000.0 21 | EARTH_DIAMETER = 12742000.0 22 | 23 | 24 | class KMLGraphNode(ABC, Location): 25 | @abstractmethod 26 | def neighbors(self) -> Iterable["KMLGraphNode"]: 27 | raise NotImplementedError() 28 | 29 | @abstractmethod 30 | def to_placemark(self, style: Optional[Style] = None) -> Placemark: 31 | raise NotImplementedError() 32 | 33 | @abstractmethod 34 | def description(self) -> str: 35 | raise NotImplementedError() 36 | 37 | @abstractmethod 38 | def uid(self) -> str: 39 | raise NotImplementedError() 40 | 41 | def __eq__(self, other): 42 | return isinstance(other, KMLGraphNode) and self.uid() == other.uid() 43 | 44 | def __ne__(self, other): 45 | return not (self == other) 46 | 47 | def __hash__(self): 48 | return hash(self.uid()) 49 | 50 | def __str__(self): 51 | return self.uid() 52 | 53 | 54 | class KMLGeolocation(KMLGraphNode): 55 | def __init__( 56 | self, location: Geolocation, db: CrawlDatabase, is_miner: bool = False 57 | ): 58 | self.location: Geolocation = location 59 | self.db: CrawlDatabase = db 60 | self.lat = location.lat 61 | self.lon = location.lon 62 | self.is_miner: bool = is_miner 63 | 64 | def __eq__(self, other): 65 | return ( 66 | isinstance(other, KMLGeolocation) and other.location.ip == self.location.ip 67 | ) or super().__eq__(other) 68 | 69 | def uid(self) -> str: 70 | return str(self.location.ip) 71 | 72 | def neighbors(self) -> Iterable["KMLGeolocation"]: 73 | possible_nodes_by_port: Dict[int, CrawledNode] = {} 74 | for possible_node in self.db.nodes.select(ip=self.location.ip): 75 | port = possible_node.port 76 | if port in possible_nodes_by_port: 77 | # choose the version that was crawled most recently 78 | if ( 79 | possible_nodes_by_port[port].last_crawled() 80 | >= possible_node.last_crawled() 81 | ): 82 | continue 83 | possible_nodes_by_port[port] = possible_node 84 | locations: Dict[Geolocation, Set[CrawledNode]] = defaultdict(set) 85 | for node in possible_nodes_by_port.values(): 86 | for neighbor in node.get_latest_edges(): 87 | neighbor_location = self.db.locations.select( 88 | ip=neighbor.ip, 89 | limit=1, 90 | order_by="timestamp", 91 | order_direction="DESC", 92 | ).fetchone() 93 | if neighbor_location is None: 94 | continue 95 | locations[neighbor_location].add(neighbor) 96 | return ( 97 | KMLGeolocation( 98 | loc, self.db, is_miner=any(n.is_miner == Miner.MINER for n in nodes) 99 | ) 100 | for loc, nodes in locations.items() 101 | if loc is not None 102 | ) 103 | 104 | @property 
105 | def ip_str(self) -> str: 106 | if self.location.ip.ipv4_mapped is not None: 107 | return str(self.location.ip.ipv4_mapped) 108 | else: 109 | return str(self.location.ip) 110 | 111 | def description(self) -> str: 112 | if self.is_miner: 113 | miner_str = f"Likely a Miner " 114 | else: 115 | miner_str = "" 116 | return ( 117 | f"{miner_str}{self.ip_str}: {self.location.city} ({self.lat}°N, {self.lon}°E) @ " 118 | f"{self.location.timestamp!s}" 119 | ) 120 | 121 | def to_placemark(self, style: Optional[Style] = None) -> kml.Placemark: 122 | if style is None: 123 | style = Style(KML_NS, styles=[IconStyle(KML_NS, id="ip")]) 124 | p = kml.Placemark(KML_NS, self.uid(), self.ip_str, self.description()) 125 | p.append_style(style) 126 | p.geometry = Point(self.lon, self.lat) 127 | return p 128 | 129 | 130 | class ScaledKMLGraphNode(KMLGraphNode): 131 | def __init__(self, wrapped: KMLGraphNode, scale: float): 132 | self.node: KMLGraphNode = wrapped 133 | self.scale: float = scale 134 | self.lat = wrapped.lat 135 | self.lon = wrapped.lon 136 | 137 | def neighbors(self) -> Iterable["KMLGraphNode"]: 138 | return self.node.neighbors() 139 | 140 | def to_placemark(self, style: Optional[Style] = None) -> Placemark: 141 | scaled_style = Style( 142 | KML_NS, styles=[IconStyle(KML_NS, id="ip", scale=self.scale)] 143 | ) 144 | if style is not None: 145 | scaled_style.append_style(style) 146 | p = self.node.to_placemark(style=scaled_style) 147 | return p 148 | 149 | def description(self) -> str: 150 | return self.node.description() 151 | 152 | def uid(self) -> str: 153 | return self.node.uid() 154 | 155 | 156 | class KMLGraphNodeCollection(KMLGraphNode): 157 | def __init__( 158 | self, 159 | name: str, 160 | subnodes: Iterable[KMLGraphNode] = (), 161 | neighbors: Iterable[KMLGraphNode] = (), 162 | ): 163 | self.name: str = name 164 | self._neighbors: List[KMLGraphNode] = list(neighbors) 165 | self._subnodes: List[KMLGraphNode] = [] 166 | self.subnodes = subnodes 167 | 168 | def set_neighbors(self, neighbors: Iterable[KMLGraphNode]): 169 | self._neighbors = list(neighbors) 170 | assert self not in self._neighbors 171 | 172 | def uid(self) -> str: 173 | return self.name 174 | 175 | @property 176 | def subnodes(self) -> List[KMLGraphNode]: 177 | return self._subnodes 178 | 179 | @subnodes.setter 180 | def subnodes(self, nodes: Iterable[KMLGraphNode]): 181 | self._subnodes = list(nodes) 182 | if self._subnodes: 183 | points = MultiPoint([(node.lon, node.lat) for node in self.subnodes]) 184 | centroid = points.convex_hull.centroid 185 | self.lon = centroid.x 186 | self.lat = centroid.y 187 | 188 | def neighbors(self) -> Iterable["KMLGraphNode"]: 189 | return self._neighbors 190 | 191 | def description(self) -> str: 192 | return "\n".join(n.description() for n in self.subnodes) 193 | 194 | def to_placemark(self, style: Optional[Style] = None) -> kml.Placemark: 195 | if style is None: 196 | style = Style(KML_NS, styles=[IconStyle(KML_NS, id="ip")]) 197 | p = kml.Placemark(KML_NS, self.uid(), self.name, self.description()) 198 | p.append_style(style) 199 | p.geometry = Point(self.lon, self.lat) 200 | return p 201 | 202 | 203 | def to_kml( 204 | locations: Iterable[KMLGraphNode], 205 | doc_id: str, 206 | doc_name: str, 207 | doc_description: str, 208 | max_altitude: float = EARTH_DIAMETER / 4.0, 209 | ) -> kml.KML: 210 | k = kml.KML() 211 | d = kml.Document(KML_NS, doc_id, doc_name, doc_description) 212 | k.append(d) 213 | f = kml.Folder(KML_NS, "ips", "IPs", "Geolocalized IPs") 214 | d.append(f) 215 | 
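# (the generated document contains two folders: "IPs", with one placemark per
# geolocated address, and "Topology", with one great-circle arc per edge; the
# arc's peak altitude is scaled by the distance between its endpoints, so
# longer links rise higher off the globe)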
edge_folder = kml.Folder( 216 | KML_NS, "topology", "Topology", "The network topology discovered in the crawl" 217 | ) 218 | d.append(edge_folder) 219 | edge_color = (0, 255, 0) 220 | edge_hex_color = "7f%02x%02x%02x" % tuple(reversed(edge_color)) 221 | edge_style = Style( 222 | KML_NS, styles=[LineStyle(KML_NS, id="edge", color=edge_hex_color, width=3)] 223 | ) 224 | for geolocation in tqdm( 225 | locations, leave=False, desc="Generating KML", unit=" locations" 226 | ): 227 | f.append(geolocation.to_placemark()) 228 | for neighbor in geolocation.neighbors(): 229 | if neighbor is None or geolocation == neighbor: 230 | continue 231 | p = kml.Placemark( 232 | KML_NS, 233 | f"{geolocation!s}->{neighbor!s}", 234 | f"{geolocation!s}->{neighbor!s}", 235 | f"Edge between {geolocation!s} and {neighbor!s}", 236 | ) 237 | p.append_style(edge_style) 238 | num_segments = 20 239 | distance = geolocation.distance_to(neighbor) 240 | peak_altitude = max_altitude * distance / (EARTH_CIRCUMFERENCE / 2.0) 241 | p.geometry = Geometry( 242 | geometry=LineString( 243 | [ 244 | (lon, lat, sin(i / (num_segments - 1) * pi) * peak_altitude) 245 | for i, (lon, lat) in enumerate( 246 | geolocation.path_to( 247 | neighbor, intermediate_points=num_segments - 2 248 | ) 249 | ) 250 | ] 251 | ), 252 | tessellate=False, 253 | extrude=False, 254 | altitude_mode="relativeToGround", 255 | ) 256 | edge_folder.append(p) 257 | return k 258 | 259 | 260 | def calculate_rank( 261 | loc: KMLGraphNode, pr: OrderedDict[CrawledNode, float], db: CrawlDatabase 262 | ) -> float: 263 | if isinstance(loc, KMLGeolocation): 264 | nodes = db.nodes.select(ip=loc.location.ip) 265 | elif isinstance(loc, KMLGraphNodeCollection): 266 | return sum(calculate_rank(subnode, pr, db) for subnode in loc.subnodes) 267 | else: 268 | raise NotImplementedError(f"Add support for locations of type {type(loc)}") 269 | return sum(pr[node] for node in nodes if node in pr) 270 | 271 | 272 | class ToKML(Command): 273 | name = "kml" 274 | help = "export a KML file visualizing the crawled data" 275 | 276 | def __init_arguments__(self, parser: ArgumentParser): 277 | parser.add_argument( 278 | "CRAWL_DB_FILE", type=str, help="path to the crawl database" 279 | ) 280 | parser.add_argument( 281 | "KML_FILE", 282 | type=FileType("w"), 283 | help="path to which to save the KML, or '-' for STDOUT (the default)", 284 | ) 285 | parser.add_argument( 286 | "--no-pagerank", 287 | action="store_true", 288 | help="do not scale the placemarks by their pagerank in the network topology", 289 | ) 290 | parser.add_argument( 291 | "--group-by", 292 | "-g", 293 | default="ip", 294 | choices=["ip", "city", "country", "continent"], 295 | help="grouping of pins (default: %(default)s)", 296 | ) 297 | 298 | def run(self, args): 299 | with CrawlDatabase(args.CRAWL_DB_FILE) as db: 300 | locations = (KMLGeolocation(loc, db) for loc in db.locations) 301 | if args.group_by != "ip": 302 | if args.group_by == "city": 303 | 304 | def grouper(loc: KMLGeolocation) -> str: 305 | return loc.location.city 306 | 307 | elif args.group_by == "country": 308 | 309 | def grouper(loc: KMLGeolocation) -> str: 310 | return loc.location.country_code 311 | 312 | elif args.group_by == "continent": 313 | 314 | def grouper(loc: KMLGeolocation) -> str: 315 | return loc.location.continent_code 316 | 317 | else: 318 | raise NotImplementedError( 319 | f"TODO: Implement support for --group-by={args.group_by}" 320 | ) 321 | groups: Dict[str, List[KMLGeolocation]] = defaultdict(list) 322 | for loc in locations: 323 | 
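# bucket each geolocation under its group key (city, country, or continent):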
groups[grouper(loc)].append(loc) 324 | collections: Dict[str, KMLGraphNodeCollection] = { 325 | group: KMLGraphNodeCollection(group, subnodes=subnodes) 326 | for group, subnodes in groups.items() 327 | } 328 | for group, c in collections.items(): 329 | all_neighbors = set() 330 | for member in groups[group]: 331 | all_neighbors |= { 332 | grouper(neighbor) for neighbor in member.neighbors() 333 | } 334 | all_neighbors -= {group} 335 | c.set_neighbors( 336 | ( 337 | collections[neighbor_group] 338 | for neighbor_group in all_neighbors 339 | ) 340 | ) 341 | locations = collections.values() 342 | if not args.no_pagerank: 343 | graph = CrawlGraph.load(db) 344 | graph.prune() 345 | pr = ProbabilisticWeightedCrawlGraph(graph).pagerank() 346 | max_rank = max(pr.values()) 347 | if max_rank == 0.0: 348 | max_rank = 1.0 349 | new_locations = [] 350 | for loc in locations: 351 | scale = 1.0 + calculate_rank(loc, pr, db) / max_rank * 4.0 352 | new_locations.append(ScaledKMLGraphNode(loc, scale)) 353 | locations = new_locations 354 | args.KML_FILE.write( 355 | to_kml( 356 | locations=locations, 357 | doc_id=f"{args.CRAWL_DB_FILE}_IPs", 358 | doc_name=f"{args.CRAWL_DB_FILE} IPs", 359 | doc_description=f"Geolocalized IPs from crawl {args.CRAWL_DB_FILE}", 360 | ).to_string(prettyprint=True) 361 | ) 362 | -------------------------------------------------------------------------------- /fluxture/messaging.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from abc import ABC, abstractmethod 3 | from typing import Optional, Type, TypeVar 4 | 5 | from fluxture.structures import PackableStruct 6 | 7 | from .serialization import ByteOrder 8 | 9 | M = TypeVar("M", bound="Message") 10 | B = TypeVar("B", bound="BinaryMessage") 11 | 12 | 13 | class Message(ABC): 14 | @abstractmethod 15 | def serialize(self) -> bytes: 16 | raise NotImplementedError() 17 | 18 | @classmethod 19 | @abstractmethod 20 | def deserialize(cls: Type[M], data: bytes) -> M: 21 | raise NotImplementedError() 22 | 23 | @classmethod 24 | @abstractmethod 25 | async def next_message(cls: Type[M], reader: asyncio.StreamReader) -> Optional[M]: 26 | raise NotImplementedError() 27 | 28 | 29 | class BinaryMessage(PackableStruct, Message): 30 | non_serialized = ("byte_order",) 31 | byte_order: ByteOrder = ByteOrder.NETWORK 32 | 33 | def serialize(self) -> bytes: 34 | return self.pack(self.byte_order) 35 | 36 | @classmethod 37 | def deserialize(cls: Type[B], data: bytes) -> B: 38 | return cls.unpack(data, cls.byte_order) 39 | 40 | @classmethod 41 | async def next_message(cls: Type[B], reader: asyncio.StreamReader) -> Optional[B]: 42 | return await cls.read(reader, cls.byte_order) # read() is a coroutine, so it must be awaited 43 | -------------------------------------------------------------------------------- /fluxture/serialization.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import datetime 3 | import ipaddress 4 | import struct 5 | import time 6 | from abc import ABC, ABCMeta, abstractmethod 7 | from collections import OrderedDict 8 | from enum import Enum as PythonEnum 9 | from inspect import isabstract 10 | from typing import Dict, Generic, Iterator 11 | from typing import OrderedDict as OrderedDictType 12 | from typing import Tuple, Type, TypeVar, Union 13 | 14 | try: 15 | from typing import Protocol, runtime_checkable 16 | except ImportError: 17 | from typing_extensions import Protocol, runtime_checkable 18 | 19 | 20 | P = TypeVar("P") 21 | 22 | 23 | class ByteOrder(PythonEnum): 24 | NATIVE = "@" 25 | LITTLE = 
"<" 26 | BIG = ">" 27 | NETWORK = "!" 28 | 29 | 30 | class UnpackError(RuntimeError): 31 | pass 32 | 33 | 34 | @runtime_checkable 35 | class Packable(Protocol): 36 | def pack(self, byte_order: ByteOrder = ByteOrder.NETWORK) -> bytes: 37 | ... 38 | 39 | @classmethod 40 | def unpack( 41 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 42 | ) -> P: 43 | ... 44 | 45 | @classmethod 46 | def unpack_partial( 47 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 48 | ) -> Tuple[P, bytes]: 49 | ... 50 | 51 | @classmethod 52 | async def read( 53 | cls: Type[P], 54 | reader: asyncio.StreamReader, 55 | byte_order: ByteOrder = ByteOrder.NETWORK, 56 | ) -> P: 57 | ... 58 | 59 | 60 | class BigEndian: 61 | def __class_getitem__(cls, item: Type[Packable]): 62 | def big_endian_pack(self, byte_order: ByteOrder = ByteOrder.BIG) -> bytes: 63 | return item.pack(self, byte_order=ByteOrder.BIG) 64 | 65 | def big_endian_unpack(data: bytes, byte_order: ByteOrder = ByteOrder.BIG): 66 | return item.unpack(data, byte_order=ByteOrder.BIG) 67 | 68 | def big_endian_unpack_partial( 69 | data: bytes, byte_order: ByteOrder = ByteOrder.BIG 70 | ): 71 | return item.unpack_partial(data, byte_order=ByteOrder.BIG) 72 | 73 | async def big_endian_read( 74 | reader: asyncio.StreamReader, byte_order: ByteOrder = ByteOrder.BIG 75 | ): 76 | return item.read(reader, byte_order=ByteOrder.BIG) 77 | 78 | return type( 79 | f"{item.__name__}BigEndian", 80 | (item,), 81 | { 82 | "pack": big_endian_pack, 83 | "unpack": big_endian_unpack, 84 | "unpack_partial": big_endian_unpack_partial, 85 | "read": big_endian_read, 86 | }, 87 | ) 88 | 89 | 90 | class LittleEndian: 91 | def __class_getitem__(cls, item: Type[Packable]): 92 | def little_endian_pack(self, byte_order: ByteOrder = ByteOrder.LITTLE) -> bytes: 93 | return item.pack(self, byte_order=ByteOrder.LITTLE) 94 | 95 | def little_endian_unpack(data: bytes, byte_order: ByteOrder = ByteOrder.LITTLE): 96 | return item.unpack(data, byte_order=ByteOrder.LITTLE) 97 | 98 | def little_endian_unpack_partial( 99 | data: bytes, byte_order: ByteOrder = ByteOrder.LITTLE 100 | ): 101 | return item.unpack_partial(data, byte_order=ByteOrder.LITTLE) 102 | 103 | async def little_endian_read( 104 | reader: asyncio.StreamReader, byte_order: ByteOrder = ByteOrder.LITTLE 105 | ): 106 | return item.read(reader, byte_order=ByteOrder.LITTLE) 107 | 108 | return type( 109 | f"{item.__name__}LittleEndian", 110 | (item,), 111 | { 112 | "pack": little_endian_pack, 113 | "unpack": little_endian_unpack, 114 | "unpack_partial": little_endian_unpack_partial, 115 | "read": little_endian_read, 116 | }, 117 | ) 118 | 119 | 120 | class AbstractPackable(ABC): 121 | @abstractmethod 122 | def pack(self, byte_order: ByteOrder = ByteOrder.NETWORK) -> bytes: 123 | raise NotImplementedError() 124 | 125 | @classmethod 126 | @abstractmethod 127 | def unpack_partial( 128 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 129 | ) -> Tuple[P, bytes]: 130 | raise NotImplementedError() 131 | 132 | @classmethod 133 | def unpack( 134 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 135 | ) -> P: 136 | ret, remaining = cls.unpack_partial(data, byte_order) 137 | if remaining: 138 | raise ValueError(f"Unexpected trailing bytes: {remaining!r}") 139 | return ret 140 | 141 | @classmethod 142 | @abstractmethod 143 | async def read( 144 | cls: Type[P], 145 | reader: asyncio.StreamReader, 146 | byte_order: ByteOrder = ByteOrder.NETWORK, 147 | ) -> P: 148 | raise 
NotImplementedError() 149 | 150 | 151 | @runtime_checkable 152 | class FixedSize(Protocol): 153 | num_bytes: int 154 | 155 | 156 | E = TypeVar("E") 157 | 158 | 159 | class IntEnumMeta(ABCMeta, Generic[E]): 160 | __members__: OrderedDictType[str, E] 161 | min_value: int 162 | max_value: int 163 | 164 | def __init__(cls, name, bases, clsdict): 165 | super().__init__(name, bases, clsdict) 166 | cls.__members__ = OrderedDict() 167 | if ( 168 | not isabstract(cls) 169 | and name != "IntEnum" 170 | and name != "IntFlag" 171 | and name != "AbstractIntEnum" 172 | ): 173 | values: Dict[int, str] = {} 174 | for v_name, value in clsdict.items(): 175 | if v_name.startswith("_") or v_name == "DEFAULT": 176 | continue 177 | elif not isinstance(value, int): 178 | raise TypeError( 179 | f"{name}.{v_name} must be of type `int`, not {type(value)}" 180 | ) 181 | elif value in values: 182 | raise TypeError( 183 | f"{name}.{v_name} has the same value as {name}.{values[value]}" 184 | ) 185 | if not values: 186 | # this is the first value 187 | cls.min_value = value 188 | cls.max_value = value 189 | else: 190 | cls.min_value = min(cls.min_value, value) 191 | cls.max_value = max(cls.max_value, value) 192 | values[value] = v_name 193 | int_enum = cls(value, name=v_name) 194 | cls.__members__[v_name] = int_enum 195 | setattr(int_enum, "name", v_name) 196 | setattr(cls, v_name, int_enum) 197 | 198 | if "DEFAULT" in clsdict: 199 | if clsdict["DEFAULT"] not in cls.__members__: 200 | raise TypeError( 201 | f"Invalid default value {name}.DEFAULT = {clsdict['DEFAULT']!r}" 202 | ) 203 | setattr(cls, "DEFAULT", cls.__members__[clsdict["DEFAULT"]]) 204 | else: 205 | setattr(cls, "DEFAULT", next(iter(cls.__members__.values()))) 206 | # call get_type() to ensure that all of the values are within range 207 | getattr(cls, "DEFAULT").get_type() 208 | 209 | def __iter__(cls) -> Iterator[E]: 210 | return iter(cls.__members__.values()) 211 | 212 | def get(cls, name: str) -> E: 213 | return cls.__members__[name] 214 | 215 | 216 | class AbstractIntEnum(int, AbstractPackable, Generic[E], metaclass=IntEnumMeta[E]): 217 | name: str 218 | DEFAULT: E 219 | 220 | def value(self) -> int: 221 | return int(self) 222 | 223 | def __str__(self): 224 | return f"{self.__class__.__name__}.{self.name}" 225 | 226 | def __repr__(self): 227 | return f"<{self!s}: {self.value()}>" 228 | 229 | @classmethod 230 | def get_type(cls: IntEnumMeta[E]) -> "Type[SizedInteger]": 231 | for int_type in (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64): 232 | if ( 233 | cls.min_value >= int_type.MIN_VALUE 234 | and cls.max_value <= int_type.MAX_VALUE 235 | ): 236 | return int_type 237 | raise TypeError( 238 | "There is no SizedInteger type that can represent enum " 239 | f"{cls.__name__} on the range [{cls.min_value}, {cls.max_value}]" 240 | ) 241 | 242 | def pack(self, byte_order: ByteOrder = ByteOrder.NETWORK) -> bytes: 243 | int_type = self.get_type()(self.value()) 244 | return int_type.pack(byte_order) 245 | 246 | @classmethod 247 | def unpack_partial( 248 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 249 | ) -> Tuple[P, bytes]: 250 | int_type, remainder = cls.get_type().unpack_partial(data, byte_order) 251 | return cls(int(int_type)), remainder 252 | 253 | @classmethod 254 | async def read( 255 | cls: Type[P], 256 | reader: asyncio.StreamReader, 257 | byte_order: ByteOrder = ByteOrder.NETWORK, 258 | ) -> P: 259 | int_type = await cls.get_type().read(reader, byte_order) 260 | return cls(int_type) 261 | 262 | def __or__(self, other) -> 
E: 263 | return self.__class__(int(self) | int(other)) 264 | 265 | __ror__ = __or__ 266 | 267 | def __and__(self, other) -> E: 268 | return self.__class__(int(self) & int(other)) 269 | 270 | __rand__ = __and__ 271 | 272 | def __neg__(self) -> E: 273 | return self.__class__( 274 | int(self) ^ ((1 << self.__class__.max_value.bit_length()) - 1) 275 | ) 276 | 277 | def __xor__(self, other) -> E: 278 | return self.__class__(int(self) ^ int(other)) 279 | 280 | __rxor__ = __xor__ 281 | 282 | 283 | class IntFlag(AbstractIntEnum["IntFlag"]): 284 | def __new__(cls, *args, **kwargs): 285 | if "name" in kwargs: 286 | name = kwargs["name"] 287 | del kwargs["name"] 288 | if args: 289 | value = args[0] 290 | args = args[1:] 291 | result = int.__new__(cls, value, *args, **kwargs) 292 | elif name in cls.__members__: 293 | return cls.__members__[name] 294 | else: 295 | raise ValueError( 296 | f"Invalid enum name {name!r}; possibilities are {list(cls.__members__.keys())!r}" 297 | ) 298 | elif not args: 299 | return cls.DEFAULT 300 | else: 301 | result = int.__new__(cls, *args, **kwargs) 302 | setattr(result, "name", None) 303 | return result 304 | 305 | @property 306 | def names(self) -> Iterator[str]: 307 | if self.name is not None: 308 | yield self.name 309 | else: 310 | yielded = False 311 | zero_value = "" 312 | int_value = int(self) 313 | for member_name, value in self.__class__.__members__.items(): 314 | if int(value) & int_value == int(value): 315 | yield member_name 316 | yielded = True 317 | elif value == 0: 318 | zero_value = member_name 319 | if not yielded and zero_value: 320 | yield zero_value 321 | 322 | def __str__(self): 323 | return f"{self.__class__.__name__}.{'|'.join(self.names)}" 324 | 325 | 326 | class IntEnum(AbstractIntEnum["IntEnum"]): 327 | def __new__(cls, *args, **kwargs): 328 | if "name" in kwargs: 329 | name = kwargs["name"] 330 | del kwargs["name"] 331 | if args: 332 | value = args[0] 333 | args = args[1:] 334 | elif name in cls.__members__: 335 | return cls.__members__[name] 336 | else: 337 | raise ValueError( 338 | f"Invalid enum name {name!r}; possibilities are {list(cls.__members__.keys())!r}" 339 | ) 340 | elif not args: 341 | return cls.DEFAULT 342 | else: 343 | for member_name, value in cls.__members__.items(): 344 | if value == args[0]: 345 | return value 346 | raise ValueError( 347 | f'Invalid enum value "{args[0]}"; possibilities are ' 348 | f"{list(cls.__members__.values())!r}" 349 | ) 350 | result = int.__new__(cls, value, *args, **kwargs) 351 | setattr(result, "name", name) 352 | return result 353 | 354 | 355 | class IPv6Address(ipaddress.IPv6Address, AbstractPackable): 356 | num_bytes: int = 16 357 | 358 | def __init__( 359 | self, 360 | address: Union[str, bytes, int, ipaddress.IPv6Address, ipaddress.IPv4Address], 361 | ): 362 | if ( 363 | isinstance(address, str) 364 | or isinstance(address, bytes) 365 | or isinstance(address, int) 366 | ): 367 | address = ipaddress.ip_address(address) 368 | if isinstance(address, ipaddress.IPv4Address): 369 | # convert ip4 to rfc 3056 IPv6 6to4 address 370 | # http://tools.ietf.org/html/rfc3056#section-2 371 | prefix6to4 = int(ipaddress.IPv6Address("2002::")) 372 | ipv4 = address 373 | address = ipaddress.IPv6Address(prefix6to4 | (int(ipv4) << 80)) 374 | assert address.sixtofour == ipv4 375 | super().__init__(address.packed) 376 | 377 | def pack(self, byte_order: ByteOrder = ByteOrder.BIG) -> bytes: 378 | if byte_order == ByteOrder.BIG: 379 | return self.packed 380 | else: 381 | return bytes(reversed(self.packed)) 382 | 383 | 
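# A round-trip sketch (illustrative values; any address behaves the same):
#     addr = IPv6Address("10.0.0.1")   # stored as an RFC 3056 6to4 address
#     data = addr.pack()               # 16 bytes, big-endian by default
#     same, rest = IPv6Address.unpack_partial(data)
#     assert same == addr and rest == b""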
@classmethod 384 | def unpack_partial( 385 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 386 | ) -> Tuple[P, bytes]: 387 | if byte_order == ByteOrder.BIG: 388 | return cls(data[:16]), data[16:] 389 | else: 390 | return cls(bytes(reversed(data[:16]))), data[16:] 391 | 392 | @classmethod 393 | async def read( 394 | cls: Type[P], 395 | reader: asyncio.StreamReader, 396 | byte_order: ByteOrder = ByteOrder.NETWORK, 397 | ) -> P: 398 | return cls.unpack( 399 | await reader.readexactly(cls.num_bytes), byte_order=byte_order 400 | ) 401 | 402 | def __str__(self): 403 | if self.sixtofour is not None: 404 | return str(self.sixtofour) 405 | elif self.ipv4_mapped is not None: 406 | return str(self.ipv4_mapped) 407 | else: 408 | return super().__str__() 409 | 410 | 411 | class SizeMeta(type): 412 | num_bytes_is_defined: bool = False 413 | dependent_type_is_defined: bool = False 414 | 415 | @property 416 | def num_bytes(cls) -> int: 417 | if not cls.num_bytes_is_defined: 418 | raise TypeError( 419 | f"{cls.__name__} must be subscripted with its size when used in a Struct! " 420 | f"(E.g., {cls.__name__}[1024] will specify that it is 1024 bytes.)" 421 | ) 422 | return cls._num_bytes 423 | 424 | @property 425 | def size_field_name(cls) -> str: 426 | if not cls.dependent_type_is_defined: 427 | raise TypeError( 428 | f"{cls.__name__} must be subscripted with the name of its size field when used in a Struct!" 429 | f' (E.g., {cls.__name__}["length"] will specify that its length is specified by the ' 430 | "`length` field.)" 431 | ) 432 | return cls._size_field_name 433 | 434 | def __getitem__(self, item): 435 | if isinstance(item, int): 436 | if item < 0: 437 | raise ValueError(f"Fixed size {item} must be non-negative") 438 | typename = f"{self.__name__}{item}" 439 | return type( 440 | typename, (self,), {"_num_bytes": item, "num_bytes_is_defined": True} 441 | ) 442 | elif isinstance(item, str): 443 | typename = f"{self.__name__}{item}" 444 | return type( 445 | typename, 446 | (self,), 447 | {"_size_field_name": item, "dependent_type_is_defined": True}, 448 | ) 449 | else: 450 | raise KeyError(item) 451 | 452 | 453 | class Sized(metaclass=SizeMeta): 454 | num_bytes_is_defined: bool = False 455 | dependent_type_is_defined: bool = False 456 | 457 | @property 458 | def num_bytes(self) -> int: 459 | if self.num_bytes_is_defined: 460 | return super().num_bytes 461 | elif self.dependent_type_is_defined: 462 | raise NotImplementedError() 463 | else: 464 | raise ValueError(f"{self} does not have its size set!") 465 | 466 | @property 467 | def has_size(self) -> bool: 468 | return self.num_bytes_is_defined or self.dependent_type_is_defined 469 | 470 | 471 | class SizedByteArray(bytes, Sized): 472 | @property 473 | def num_bytes(self) -> int: 474 | if not self.has_size: 475 | return len(self) 476 | else: 477 | return super().num_bytes 478 | 479 | def __new__(cls, value: bytes, pad: bool = True): 480 | if cls.num_bytes_is_defined and cls.num_bytes < len(value): 481 | raise ValueError( 482 | f"{cls.__name__} can hold at most {cls.num_bytes} bytes, but {value!r} is longer!" 
483 | ) 484 | elif cls.num_bytes_is_defined and cls.num_bytes > len(value): 485 | value = value + b"\0" * (cls.num_bytes - len(value)) 486 | return bytes.__new__(cls, value) 487 | 488 | def pack(self, byte_order: ByteOrder = ByteOrder.NETWORK) -> bytes: 489 | return self 490 | 491 | @classmethod 492 | def unpack( 493 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 494 | ) -> P: 495 | return cls(data) 496 | 497 | @classmethod 498 | def unpack_partial( 499 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 500 | ) -> Tuple[P, bytes]: 501 | return cls(data[: cls.num_bytes]), data[cls.num_bytes :] 502 | 503 | @classmethod 504 | async def read( 505 | cls: Type[P], 506 | reader: asyncio.StreamReader, 507 | byte_order: ByteOrder = ByteOrder.NETWORK, 508 | ) -> P: 509 | data = await reader.read(cls.num_bytes) 510 | return cls.unpack(data, byte_order) 511 | 512 | 513 | class SizedIntegerMeta(ABCMeta): 514 | FORMAT: str 515 | BITS: int 516 | BYTES: int 517 | SIGNED: bool 518 | MAX_VALUE: int 519 | MIN_VALUE: int 520 | 521 | def __init__(cls, name, bases, clsdict): 522 | if ( 523 | name != "SizedInteger" 524 | and "FORMAT" not in clsdict 525 | and (not isinstance(cls.FORMAT, str) or not cls.FORMAT) 526 | ): 527 | raise ValueError( 528 | f"{name} subclasses `SizedInteger` but does not define a `FORMAT` class member" 529 | ) 530 | super().__init__(name, bases, clsdict) 531 | if name != "SizedInteger": 532 | setattr( 533 | cls, "BYTES", struct.calcsize(f"{ByteOrder.NETWORK.value}{cls.FORMAT}") 534 | ) 535 | setattr(cls, "BITS", cls.BYTES * 8) 536 | setattr(cls, "SIGNED", cls.FORMAT.islower()) 537 | setattr(cls, "MAX_VALUE", 2 ** (cls.BITS - [0, 1][cls.SIGNED]) - 1) 538 | setattr(cls, "MIN_VALUE", [0, -(2 ** (cls.BITS - 1))][cls.SIGNED]) 539 | 540 | @property 541 | def num_bytes(cls) -> int: 542 | return cls.BYTES 543 | 544 | @property 545 | def c_type(cls) -> str: 546 | return f"{['u',''][cls.SIGNED]}int{cls.BITS}_t" 547 | 548 | 549 | class SizedInteger(int, metaclass=SizedIntegerMeta): 550 | def __new__(cls: SizedIntegerMeta, value: int): 551 | retval: SizedInteger = int.__new__(cls, value) 552 | if not (cls.MIN_VALUE <= retval <= cls.MAX_VALUE): 553 | raise ValueError( 554 | f"{retval} is not in the range [{cls.MIN_VALUE}, {cls.MAX_VALUE}]" 555 | ) 556 | return retval 557 | 558 | def pack(self, byte_order: ByteOrder = ByteOrder.NETWORK) -> bytes: 559 | return struct.pack(f"{byte_order.value}{self.__class__.FORMAT}", self) 560 | 561 | @classmethod 562 | def unpack( 563 | cls, data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 564 | ) -> "SizedInteger": 565 | return cls(struct.unpack(f"{byte_order.value}{cls.FORMAT}", data)[0]) 566 | 567 | @classmethod 568 | def unpack_partial( 569 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 570 | ) -> Tuple[P, bytes]: 571 | try: 572 | return ( 573 | cls( 574 | struct.unpack(f"{byte_order.value}{cls.FORMAT}", data[: cls.BYTES])[ 575 | 0 576 | ] 577 | ), 578 | data[cls.BYTES :], 579 | ) 580 | except struct.error: 581 | pass 582 | raise UnpackError(f"Error unpacking {cls.__name__} from the front of {data!r}") 583 | 584 | @classmethod 585 | async def read( 586 | cls: Type[P], 587 | reader: asyncio.StreamReader, 588 | byte_order: ByteOrder = ByteOrder.NETWORK, 589 | ) -> P: 590 | data = await reader.read(cls.num_bytes) 591 | return cls.unpack(data, byte_order) 592 | 593 | def __str__(self): 594 | return f"{self.__class__.c_type}({super().__str__()})" 595 | 596 | 597 | class Char(SizedInteger): 598 | 
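# Each concrete integer type below just names a Python `struct` format
# character; SizedIntegerMeta derives BYTES, BITS, SIGNED, MIN_VALUE, and
# MAX_VALUE from it ("b" is a signed 8-bit char, "B" its unsigned twin):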
FORMAT = "b" 599 | 600 | 601 | class UnsignedChar(SizedInteger): 602 | FORMAT = "B" 603 | 604 | 605 | class Short(SizedInteger): 606 | FORMAT = "h" 607 | 608 | 609 | class UnsignedShort(SizedInteger): 610 | FORMAT = "H" 611 | 612 | 613 | class Int(SizedInteger): 614 | FORMAT = "i" 615 | 616 | 617 | class UnsignedInt(SizedInteger): 618 | FORMAT = "I" 619 | 620 | 621 | class Long(SizedInteger): 622 | FORMAT = "l" 623 | 624 | 625 | class UnsignedLong(SizedInteger): 626 | FORMAT = "L" 627 | 628 | 629 | class LongLong(SizedInteger): 630 | FORMAT = "q" 631 | 632 | 633 | class UnsignedLongLong(SizedInteger): 634 | FORMAT = "Q" 635 | 636 | 637 | Int8 = Char 638 | UInt8 = UnsignedChar 639 | Bool = UInt8 640 | Int16 = Short 641 | UInt16 = UnsignedShort 642 | Int32 = Long 643 | UInt32 = UnsignedLong 644 | Int64 = LongLong 645 | UInt64 = UnsignedLongLong 646 | 647 | 648 | class DateTime(UInt64): 649 | def __new__(cls, *args): 650 | if not args: 651 | return UInt64.__new__(cls, int(time.time())) 652 | else: 653 | return UInt64.__new__(cls, *args) 654 | 655 | @staticmethod 656 | def fromisoformat(timestamp: str) -> "DateTime": 657 | return DateTime( 658 | int(datetime.datetime.fromisoformat(timestamp).timestamp() + 0.5) 659 | ) 660 | 661 | @property 662 | def date(self) -> datetime.datetime: 663 | return datetime.datetime.fromtimestamp(float(self)) 664 | 665 | def __repr__(self): 666 | return f"{self.__class__.__name__}({int(self)})" 667 | 668 | def __str__(self): 669 | return self.date.isoformat() 670 | -------------------------------------------------------------------------------- /fluxture/shodan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from abc import ABC 3 | from argparse import ArgumentParser, Namespace 4 | from getpass import getpass 5 | from time import sleep 6 | from typing import (Any, AsyncIterator, Callable, Dict, Iterable, Iterator, 7 | Optional, Tuple) 8 | 9 | import keyring 10 | from shodan import APIError, Shodan 11 | 12 | from .async_utils import iterator_to_async, sync_to_async 13 | from .bitcoin import Node 14 | from .crawl_schema import HostInfo 15 | from .crawler import Crawler, CrawlListener 16 | from .fluxture import Command 17 | from .serialization import DateTime, IPv6Address 18 | 19 | KEYRING_NAME: str = "fluxture" 20 | 21 | 22 | def prompt( 23 | message: str, 24 | yes_options: Tuple[str, ...] = ("y", "yes"), 25 | no_options: Tuple[str, ...] 
= ("n", "no"), 26 | default: bool = True, 27 | ) -> bool: 28 | if default: 29 | yes_options = yes_options + ("",) 30 | else: 31 | no_options = no_options + ("",) 32 | while True: 33 | ret = input(message).strip().lower() 34 | if ret in yes_options: 35 | return True 36 | elif ret in no_options: 37 | return False 38 | 39 | 40 | class ShodanResult(HostInfo): 41 | def __init__(self, **kwargs): 42 | if "timestamp" in kwargs: 43 | timestamp: DateTime = DateTime.fromisoformat(kwargs["timestamp"]) 44 | else: 45 | timestamp = DateTime() 46 | if "ip" in kwargs: 47 | ip: IPv6Address = IPv6Address(kwargs["ip"]) 48 | else: 49 | raise ValueError( 50 | f"Shodan Result does not contain an IP address: {kwargs!r}" 51 | ) 52 | if "isp" in kwargs: 53 | isp: Optional[str] = kwargs["isp"] 54 | else: 55 | isp = None 56 | if "ip_str" in kwargs: 57 | self.ip_str: Optional[str] = kwargs["ip_str"] 58 | else: 59 | self.ip_str = None 60 | if "os" in kwargs: 61 | os: Optional[str] = kwargs["os"] 62 | else: 63 | os = None 64 | self.result: Dict[str, Any] = kwargs 65 | super().__init__(ip=ip, isp=isp, os=os, timestamp=timestamp) 66 | 67 | def __getattr__(self, item): 68 | if item in self.result: 69 | return self.result[item] 70 | 71 | def __str__(self): 72 | if self.ip_str is not None: 73 | ip = self.ip_str 74 | else: 75 | ip = str(self.ip) 76 | if self.isp is not None: 77 | isp = f" on {self.isp}" 78 | else: 79 | isp = "" 80 | if self.os is not None: 81 | os = f" running {self.os}" 82 | else: 83 | os = "" 84 | return f"{ip}{isp}{os}" 85 | 86 | def __repr__(self): 87 | kwargs = "".join( 88 | [ 89 | f", {argname!s}={argvalue!r}" 90 | for argname, argvalue in self.result.items() 91 | if argname != "ip" and argname != "isp" 92 | ] 93 | ) 94 | return f"{self.__class__.__name__}(ip={self.ip!r}, isp={self.isp!r}{kwargs})" 95 | 96 | 97 | class Query(ABC): 98 | def __init__(self, name: str, callback: Optional[Callable[["Query"], Any]] = None): 99 | self.name: str = name 100 | self.callback: Optional[Callable[[Query], Any]] = callback 101 | 102 | 103 | SEARCH_QUERIES: Dict[str, "SearchQuery"] = {} 104 | 105 | 106 | class SearchQuery(Query): 107 | def __init__( 108 | self, name: str, query: str, callback: Optional[Callable[["Query"], Any]] = None 109 | ): 110 | super().__init__(name=name, callback=callback) 111 | self.query: str = query 112 | 113 | @staticmethod 114 | def register( 115 | name: str, query: str, callback: Optional[Callable[["Query"], Any]] = None 116 | ) -> "SearchQuery": 117 | sq = SearchQuery(name=name, query=query, callback=callback) 118 | if name in SEARCH_QUERIES: 119 | raise KeyError( 120 | f'A search query of name "{name}" is already registered: {SEARCH_QUERIES[name]!r}' 121 | ) 122 | SEARCH_QUERIES[name] = sq 123 | return sq 124 | 125 | def run(self, api: Shodan) -> Iterator[ShodanResult]: 126 | for result in api.search_cursor(self.query): 127 | yield ShodanResult(**result) 128 | 129 | @iterator_to_async(poll_interval=0.5) 130 | def run_async(self, api: Shodan) -> AsyncIterator[ShodanResult]: 131 | return self.run(api) # type: ignore 132 | 133 | def __repr__(self): 134 | return f"{self.__class__.__name__}(name={self.name!r}, query={self.query!r}, callback={self.callback!r})" 135 | 136 | 137 | def get_keychain_api_key() -> Optional[str]: 138 | return keyring.get_password(KEYRING_NAME, "shodan_api_key") 139 | 140 | 141 | def save_keychain_api_key(api_key: str): 142 | keyring.set_password(KEYRING_NAME, "shodan_api_key", api_key) 143 | 144 | 145 | def get_api(api_key: Optional[str] = None) -> Shodan: 146 | 
keychain_key = get_keychain_api_key() 147 | if api_key is None: 148 | api_key = keychain_key 149 | if api_key is None: 150 | api_key = getpass("Shodan API key: ") 151 | if prompt( 152 | "Would you like to save this API key to the system keychain for future use? [Yn] " 153 | ): 154 | save_keychain_api_key(api_key) 155 | elif keychain_key is None: 156 | if prompt( 157 | "Would you like to save this API key to the system keychain for future use? [Yn] " 158 | ): 159 | save_keychain_api_key(api_key) 160 | elif api_key != keychain_key: 161 | print("This is a different API key than what is stored in the system keychain.") 162 | if prompt("Would you like to update the API key in the system keychain? [Yn] "): 163 | save_keychain_api_key(api_key) 164 | return Shodan(api_key) 165 | 166 | 167 | class ShodanCommand: 168 | def __init_arguments__(self, parser: ArgumentParser): 169 | parser.add_argument( 170 | "--api-key", 171 | "-k", 172 | type=str, 173 | default=None, 174 | help="Shodan API key. If omitted, a saved " 175 | "API key in the system keychain will be used, if one exists. Otherwise the user will be " 176 | "prompted to enter an API key.", 177 | ) 178 | 179 | 180 | class ActiveNodes(ShodanCommand, Command): 181 | name = "active" 182 | help = "enumerate active nodes from Shodan" 183 | 184 | def __init_arguments__(self, parser: ArgumentParser): 185 | super().__init_arguments__(parser) 186 | parser.add_argument("QUERY", choices=SEARCH_QUERIES.keys()) 187 | 188 | def run(self, args: Namespace): 189 | api = get_api(args.api_key) 190 | for result in SEARCH_QUERIES[args.QUERY].run(api): 191 | print(str(result)) 192 | 193 | 194 | class HostInfoCommand(ShodanCommand, Command): 195 | name = "hostinfo" 196 | help = "get information about IP addresses from Shodan" 197 | 198 | def __init_arguments__(self, parser: ArgumentParser): 199 | super().__init_arguments__(parser) 200 | parser.add_argument("IP", nargs="+", type=str) 201 | 202 | def run(self, args: Namespace): 203 | api = get_api(args.api_key) 204 | for ip in args.IP: 205 | try: 206 | info = api.host(ip) 207 | for key, value in info.items(): 208 | print(f"{key!s}:\t{value!r}") 209 | except APIError as e: 210 | sys.stdout.flush() 211 | sys.stderr.write(str(e)) 212 | sys.stderr.write("\n") 213 | sys.stderr.flush() 214 | 215 | 216 | class HostInfoFetcher(CrawlListener): 217 | def __init__(self): 218 | self.batch_size: int = 100 219 | self.node_queue: list[Node] = [] 220 | 221 | @staticmethod 222 | @sync_to_async(poll_interval=0.5) 223 | def get_host_info(ips: Iterable[IPv6Address]) -> Iterable[HostInfo]: 224 | ips = list(ips) # materialize the iterable so that retries see every IP 225 | max_delay = 10.0 226 | next_delay = 0.5 227 | while True: 228 | try: 229 | # build the results inside the try block so that rate-limit errors surface here, not in the caller 230 | return [ShodanResult(**get_api().host(str(ip))) for ip in ips] 231 | except APIError as e: 232 | if "rate limit reached" not in str(e): 233 | raise 234 | # rate limited: back off exponentially before retrying 235 | sleep(next_delay) 236 | next_delay = min(max_delay, next_delay * 1.5) 237 | 238 | async def process_nodes(self, crawler: Crawler, finalize: bool = False): 239 | if finalize: 240 | batch_size = len(self.node_queue) 241 | else: 242 | batch_size = self.batch_size 243 | while len(self.node_queue) >= batch_size > 0: 244 | to_process = self.node_queue[:batch_size] 245 | self.node_queue = self.node_queue[batch_size:] 246 | for info in await HostInfoFetcher.get_host_info( 247 | (p.address for p in to_process) 248 | ): 249 | crawler.crawl.set_host_info(info) 250 | 251 | # async def on_crawl_node(self, crawler: Crawler, node: 
Node): 252 | # self.node_queue.append(node) 253 | # await self.process_nodes(crawler) 254 | # 255 | # info = await HostInfoFetcher.get_host_info(node.address) 256 | # print(info) 257 | # 258 | # async def on_miner(self, crawler: Crawler, node: Node, miner): 259 | # self.node_queue.append(node) 260 | # await self.process_nodes(crawler) 261 | # 262 | # async def on_complete(self, crawler: Crawler): 263 | # await self.process_nodes(crawler, finalize=True) 264 | -------------------------------------------------------------------------------- /fluxture/statistics.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from math import sqrt 3 | from typing import Iterable, Iterator, List, Optional, Tuple, Union 4 | 5 | Numeric = Union[int, float] 6 | 7 | 8 | def memoize(func): 9 | member_name = f"_{func.__name__}_memoized" 10 | 11 | @wraps(func) 12 | def wrapper(self, *args, **kwargs): 13 | if hasattr(self, member_name): 14 | return getattr(self, member_name) 15 | result = func(self, *args, **kwargs) 16 | setattr(self, member_name, result) 17 | return result 18 | 19 | return wrapper 20 | 21 | 22 | class Statistics: 23 | def __init__(self, iterable: Iterable[Numeric]): 24 | self._iter: Optional[Iterator[Numeric]] = iter(iterable) 25 | self._data: List[Numeric] = [] 26 | 27 | def __getitem__(self, index: int) -> Numeric: 28 | while self._iter is not None and index >= len(self._data): 29 | try: 30 | self._data.append(next(self._iter)) 31 | except StopIteration: 32 | self._iter = None 33 | return self._data[index] 34 | 35 | def __iter__(self) -> Iterator[Numeric]: 36 | if self._iter is None: 37 | yield from self._data 38 | return 39 | i = 0 40 | while True: 41 | try: 42 | yield self[i] 43 | i += 1 44 | except IndexError: 45 | break 46 | 47 | def __len__(self): 48 | while self._iter is not None: 49 | try: 50 | _ = self[len(self._data)] 51 | except IndexError: 52 | break 53 | return len(self._data) 54 | 55 | def __bool__(self): 56 | if self._data: 57 | return True 58 | try: 59 | _ = next(iter(self)) 60 | return True 61 | except StopIteration: 62 | return False 63 | 64 | @property 65 | @memoize 66 | def average(self) -> float: 67 | if not self: 68 | return 0.0 69 | return sum(self) / len(self) 70 | 71 | @property 72 | @memoize 73 | def std_dev(self) -> float: 74 | if not self: 75 | return 0.0 76 | avg = self.average 77 | return sqrt(sum((x - avg) ** 2.0 for x in self) / len(self)) 78 | 79 | @property 80 | @memoize 81 | def ordered(self) -> Tuple[Numeric, ...]: 82 | return tuple(sorted(self)) 83 | 84 | @property 85 | @memoize 86 | def median(self) -> Numeric: 87 | n = len(self) 88 | ordered = self.ordered 89 | if n % 2 == 0: 90 | return (ordered[(n - 1) // 2] + ordered[(n + 1) // 2]) / 2.0 91 | else: 92 | return ordered[n // 2] 93 | 94 | def __str__(self): 95 | return f"μ {self.average} σ {self.std_dev} Med {self.median}" 96 | -------------------------------------------------------------------------------- /fluxture/structures.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import itertools 3 | from abc import ABCMeta 4 | from collections import OrderedDict 5 | from typing import Generic, Iterator, KeysView 6 | from typing import OrderedDict as OrderedDictType 7 | from typing import Tuple, Type, TypeVar 8 | from typing import ValuesView 9 | from typing import ValuesView as ValuesViewType 10 | 11 | from fluxture.serialization import (AbstractIntEnum, ByteOrder, FixedSize, P, 12 | 
Packable, UnpackError) 13 | 14 | F = TypeVar("F") 15 | 16 | 17 | class StructMeta(ABCMeta, Generic[F]): 18 | FIELDS: OrderedDictType[str, Type[F]] 19 | 20 | def __init__(cls, name, bases, clsdict): 21 | fields = OrderedDict() 22 | field_sources = {} 23 | for base in bases: 24 | if isinstance(base, StructMeta): 25 | # this will happen if a Struct is extending another Struct 26 | # so inherit all of the superclass's fields 27 | for field_name, field_type in base.FIELDS.items(): 28 | if field_name in fields: 29 | raise TypeError( 30 | f"{name} inherits field {field_name} from both {base.__name__} and " 31 | f"{field_sources[field_name]}" 32 | ) 33 | elif hasattr(base, "non_serialized") and field_name not in getattr( 34 | base, "non_serialized" 35 | ): 36 | field_sources[field_name] = base 37 | fields[field_name] = field_type 38 | if "non_serialized" in clsdict: 39 | non_serialized = set(clsdict["non_serialized"]) 40 | else: 41 | non_serialized = set() 42 | non_serialized |= {"FIELDS", "non_serialized"} 43 | if "__annotations__" in clsdict: 44 | for field_name, field_type in clsdict["__annotations__"].items(): 45 | if field_name in field_sources: 46 | raise TypeError( 47 | f"{name} cannot redefine field {field_name} from {field_sources[field_name]}" 48 | ) 49 | elif field_name not in non_serialized: 50 | fields[field_name] = field_type 51 | super().__init__(name, bases, clsdict) 52 | cls.validate_fields(fields) 53 | setattr(cls, "FIELDS", fields) 54 | # are all fields fixed size? if so, we are fixed size, too! 55 | if all(hasattr(field, "num_bytes") for field in fields.values()): 56 | cls.num_bytes = sum(field.num_bytes for field in fields.values()) # type: ignore 57 | assert isinstance(cls, FixedSize) 58 | 59 | def validate_fields(cls, fields: OrderedDictType[str, Type[F]]): 60 | pass 61 | 62 | 63 | class Struct(Generic[F], metaclass=StructMeta[F]): 64 | def __init__(self, *args, **kwargs): 65 | unsatisfied_fields = [ 66 | name for name in self.__class__.FIELDS.keys() if name not in kwargs 67 | ] 68 | if len(args) > len(unsatisfied_fields): 69 | raise ValueError( 70 | f"Unexpected positional argument: {args[len(unsatisfied_fields)]}" 71 | ) 72 | elif len(args) < len(unsatisfied_fields): 73 | # see if any of the unsatisfied fields have defaults: 74 | for name in unsatisfied_fields[len(args) :]: 75 | field_type = self.__class__.FIELDS[name] 76 | if ( 77 | hasattr(field_type, "column_options") 78 | and field_type.column_options.default is not None 79 | ): 80 | kwargs[name] = field_type.column_options.default 81 | elif issubclass(field_type, AbstractIntEnum): 82 | kwargs[name] = field_type.DEFAULT 83 | else: 84 | raise ValueError(f"Missing argument for {name} in {self.__class__}") 85 | unsatisfied_fields = unsatisfied_fields[: len(args)] 86 | for name, value in itertools.chain( 87 | kwargs.items(), zip(unsatisfied_fields, args) 88 | ): 89 | if name not in self.__class__.FIELDS: 90 | raise TypeError( 91 | f"{self.__class__.__name__}.__init__() got an unexpected keyword argument '{name}'. 
" 92 | f"Valid arguments are: {', '.join(self.__class__.FIELDS.keys())}" 93 | ) 94 | elif isinstance(value, self.__class__.FIELDS[name]): 95 | # the value was already passed as the correct type 96 | setattr(self, name, value) 97 | else: 98 | # we need to construct the correct type 99 | setattr(self, name, self.__class__.FIELDS[name](value)) 100 | super().__init__() 101 | 102 | def __contains__(self, field_name: str): 103 | return field_name in self.__class__.FIELDS 104 | 105 | def __getitem__(self, field_name: str) -> Type[F]: 106 | if field_name not in self: 107 | raise KeyError(field_name) 108 | return getattr(self, field_name) 109 | 110 | def __len__(self) -> int: 111 | return len(self.__class__.FIELDS) 112 | 113 | def __iter__(self) -> Iterator[str]: 114 | return iter(self.__class__.FIELDS.keys()) 115 | 116 | def items(self) -> Iterator[Tuple[str, Type[F]]]: 117 | for field_name in self: 118 | yield field_name, getattr(self, field_name) 119 | 120 | def keys(self) -> KeysView[str]: 121 | return self.__class__.FIELDS.keys() 122 | 123 | def values(self) -> ValuesViewType[Type[F]]: 124 | return ValuesView(self) 125 | 126 | def __eq__(self, other): 127 | return ( 128 | isinstance(other, Struct) 129 | and len(self) == len(other) 130 | and all(a == b for (_, a), (_, b) in zip(self.items(), other.items())) 131 | ) 132 | 133 | def __ne__(self, other): 134 | return not (self == other) 135 | 136 | def __str__(self): 137 | types = "".join( 138 | f" {field_name} = {field_value!s};\n" 139 | for field_name, field_value in self.items() 140 | ) 141 | newline = "\n" 142 | return f"typedef struct {{{['', newline][len(types) > 0]}{types}}} {self.__class__.__name__}" 143 | 144 | def __repr__(self): 145 | args = [ 146 | f"{name}={getattr(self, name)!r}" for name in self.__class__.FIELDS.keys() 147 | ] 148 | return f"{self.__class__.__name__}({', '.join(args)})" 149 | 150 | 151 | class PackableStruct(Generic[P], Struct[P]): 152 | def pack(self, byte_order: ByteOrder = ByteOrder.NETWORK) -> bytes: 153 | # TODO: Combine the formats and use a single struct.pack instead 154 | return b"".join( 155 | getattr(self, field_name).pack(byte_order) 156 | for field_name in self.__class__.FIELDS.keys() 157 | ) 158 | 159 | @classmethod 160 | def validate_fields(cls, fields: OrderedDictType[str, Type[F]]): 161 | for field_name, field_type in fields.items(): 162 | if not isinstance(field_type, Packable): 163 | raise TypeError( 164 | f"Field {field_name} of {cls.__name__} must be Packable, not {field_type}" 165 | ) 166 | 167 | @classmethod 168 | def unpack( 169 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 170 | ) -> P: 171 | ret, remaining = cls.unpack_partial(data, byte_order) 172 | if remaining: 173 | raise ValueError(f"Unexpected trailing bytes: {remaining!r}") 174 | return ret 175 | 176 | @classmethod 177 | def unpack_partial( 178 | cls: Type[P], data: bytes, byte_order: ByteOrder = ByteOrder.NETWORK 179 | ) -> Tuple[P, bytes]: 180 | remaining_data = data 181 | args = [] 182 | for field_name, field_type in cls.FIELDS.items(): 183 | try: 184 | field, remaining_data = field_type.unpack_partial( 185 | remaining_data, byte_order 186 | ) 187 | errored = False 188 | except UnpackError: 189 | errored = True 190 | if errored: 191 | parsed_fields = [ 192 | f"{field_name} = {arg!r}" 193 | for field_name, arg in zip(cls.FIELDS.keys(), args) 194 | ] 195 | parsed_fields = ", ".join(parsed_fields) 196 | raise UnpackError( 197 | f"Error parsing field {cls.__name__}.{field_name} (field {len(args)+1}) of type " 
198 | f"{field_type.__name__} from bytes {remaining_data!r}. Prior parsed field values: " 199 | f"{parsed_fields}" 200 | ) 201 | args.append(field) 202 | return cls(*args), remaining_data 203 | 204 | @classmethod 205 | async def read( 206 | cls: Type[P], 207 | reader: asyncio.StreamReader, 208 | byte_order: ByteOrder = ByteOrder.NETWORK, 209 | ) -> P: 210 | if hasattr(cls, "num_bytes"): 211 | data = await reader.read(cls.num_bytes) 212 | return cls.unpack(data, byte_order) 213 | # we need to read it one field at a time 214 | args = [] 215 | for field_name, field_type in cls.FIELDS.items(): 216 | try: 217 | field = field_type.read(reader, byte_order) 218 | errored = False 219 | except UnpackError: 220 | errored = True 221 | if errored: 222 | parsed_fields = [ 223 | f"{field_name} = {arg!r}" 224 | for field_name, arg in zip(cls.FIELDS.keys(), args) 225 | ] 226 | parsed_fields = ", ".join(parsed_fields) 227 | raise UnpackError( 228 | f"Error parsing field {cls.__name__}.{field_name} (field {len(args) + 1}) of type " 229 | f"{field_type.__name__}. Prior parsed field values: {parsed_fields}" 230 | ) 231 | args.append(field) 232 | return cls(*args) 233 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name="fluxture", 6 | description="A crawling framework for blockchains and peer-to-peer systems", 7 | url="https://github.com/trailofbits/fluxture", 8 | author="Trail of Bits", 9 | version="0.0.1", 10 | packages=find_packages(exclude=["test"]), 11 | python_requires=">=3.7", 12 | install_requires=[ 13 | "fastkml~=0.11", 14 | "geoip2~=4.1.0", 15 | "graphviz~=0.14.1", 16 | "great-circle-calculator~=1.1.0", 17 | "keyring~=21.8.0", 18 | "lxml~=4.9.1", 19 | "networkx~=2.4", 20 | "numpy>=1.19.4", 21 | "shapely~=1.8.0", 22 | "shodan~=1.24.0", 23 | "six>=1.5", 24 | "tqdm>=4.48.0", 25 | "typing_extensions~=4.2.0 ; python_version < '3.8'", 26 | ], 27 | extras_require={"dev": ["flake8", "pytest", "twine"]}, 28 | entry_points={"console_scripts": ["fluxture = fluxture.__main__:main"]}, 29 | classifiers=[ 30 | "Development Status :: 4 - Beta", 31 | "Environment :: Console", 32 | "Intended Audience :: Science/Research", 33 | "License :: OSI Approved :: Apache Software License", 34 | "Programming Language :: Python :: 3 :: Only", 35 | "Topic :: Utilities", 36 | ], 37 | ) 38 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crytic/fluxture/882c44627a1ea677c5d44f5d6463fee03ebad3b6/test/__init__.py -------------------------------------------------------------------------------- /test/test_async_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from time import sleep 3 | from typing import List, Tuple 4 | from unittest import TestCase 5 | 6 | from fluxture.async_utils import iterator_to_async, sync_to_async 7 | 8 | 9 | @iterator_to_async(poll_interval=0.24) 10 | def slow_iterator(n: int): 11 | for i in range(n): 12 | sleep(0.5) 13 | yield i 14 | 15 | 16 | async def slow_iterator_async(n: int) -> List[Tuple[int, float]]: 17 | loop = asyncio.get_running_loop() 18 | results = [] 19 | async for i in slow_iterator(n): 20 | results.append((i, loop.time())) 21 | return results 22 | 23 | 24 | async def 
sleep_and_return_time(duration: float) -> float: 25 | loop = asyncio.get_running_loop() 26 | await asyncio.sleep(duration) 27 | return loop.time() 28 | 29 | 30 | async def slow_iterator_test(test: TestCase, n: int): 31 | slow_iterator_results, sleep_time = await asyncio.gather(slow_iterator_async(n), sleep_and_return_time(n / 2.0)) 32 | expected = 0 33 | has_time_before = False 34 | has_time_after = False 35 | for i, end_time in slow_iterator_results: 36 | test.assertEqual(i, expected) 37 | expected += 1 38 | has_time_before = has_time_before or end_time < sleep_time 39 | has_time_after = has_time_after or end_time > sleep_time 40 | # ensure that asyncio actually scheduled `sleep_and_return` interleaved between `slow_iterator_async` iterations: 41 | test.assertTrue(has_time_before) 42 | test.assertTrue(has_time_after) 43 | 44 | 45 | @sync_to_async(poll_interval=0.25) 46 | def slow_function(): 47 | sleep(2.0) 48 | 49 | 50 | async def time_slow_function() -> float: 51 | loop = asyncio.get_running_loop() 52 | await slow_function() 53 | return loop.time() 54 | 55 | 56 | async def slow_function_test(test: TestCase): 57 | slow_func_end_time, sleep_time = await asyncio.gather(time_slow_function(), sleep_and_return_time(1.0)) 58 | # ensure that asyncio actually scheduled `sleep_and_return` before `time_slow_function`: 59 | test.assertLessEqual(sleep_time, slow_func_end_time) 60 | 61 | 62 | class TestAsyncUtils(TestCase): 63 | def test_iterator_to_async(self): 64 | asyncio.run(slow_iterator_test(self, 10)) 65 | 66 | def test_sync_to_async(self): 67 | asyncio.run(slow_function_test(self)) 68 | -------------------------------------------------------------------------------- /test/test_bitcoin.py: -------------------------------------------------------------------------------- 1 | import time 2 | from ipaddress import ip_address 3 | from unittest import TestCase 4 | 5 | from fluxture.bitcoin import BitcoinMessage, NetAddr, VersionMessage 6 | from fluxture.serialization import ByteOrder 7 | 8 | EXAMPLE_VERSION_MESSAGE = b"".join([ 9 | b"\x72\x11\x01\x00", # Protocol version: 70002 10 | b"\x01\x00\x00\x00\x00\x00\x00\x00", # Services: NODE_NETWORK 11 | b"\xbc\x8f\x5e\x54\x00\x00\x00\x00", # Unix epoch time: 1415483324 12 | b"\x01\x00\x00\x00\x00\x00\x00\x00", # Receiving node's services 13 | b"\x00\x00\x00\x00\x00\x00\x00\x00", 14 | b"\x00\x00\xff\xff\xc6\x1b\x64\x09", # Receiving node's IPv6 address 15 | b"\x20\x8d", # Receiving node's port number 16 | b"\x01\x00\x00\x00\x00\x00\x00\x00", # Transmitting node's services 17 | b"\x00\x00\x00\x00\x00\x00\x00\x00", 18 | b"\x00\x00\xff\xff\xcb\x00\x71\xc0", # Transmitting node's IPv6 address 19 | b"\x20\x8d", # Transmitting node's port number 20 | b"\x12\x80\x35\xcb\xc9\x79\x53\xf8", # Nonce 21 | b"\x0F" # Bytes in user agent string: 15 22 | b"\x2f\x53\x61\x74\x6f\x73\x68\x69", 23 | b"\x3a\x30\x2e\x39\x2e\x33\x2f", # User agent: /Satoshi:0.9.3/ 24 | b"\xcf\x05\x05\x00", # Start height: 329167 25 | b"\x01", # Relay flag: true 26 | ]) 27 | 28 | 29 | class TestBitcoin(TestCase): 30 | def test_version_message(self): 31 | msg = VersionMessage( 32 | version=70015, 33 | services=0, 34 | timestamp=int(time.time()), 35 | addr_recv=NetAddr(), 36 | addr_from=NetAddr(), 37 | nonce=0, 38 | user_agent=b"BlockScraper", 39 | start_height=123, 40 | relay=True 41 | ) 42 | self.assertEqual(msg, BitcoinMessage.deserialize(msg.serialize())) 43 | msg = VersionMessage.unpack(EXAMPLE_VERSION_MESSAGE, byte_order=ByteOrder.LITTLE) 44 | self.assertIsInstance(msg, 
VersionMessage) 45 | self.assertEqual(msg.version, 70002) 46 | self.assertEqual(msg.timestamp, 1415483324) 47 | self.assertEqual(msg.addr_recv.port, 8333) 48 | self.assertEqual(msg.addr_from.port, 8333) 49 | self.assertEqual(msg.addr_recv.ip, ip_address("::ffff:c61b:6409")) 50 | self.assertEqual(msg.addr_from.ip, ip_address("::ffff:cb00:71c0")) 51 | -------------------------------------------------------------------------------- /test/test_db.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | 4 | from fluxture.db import ( 5 | AutoIncrement, column_options, ColumnOptions, Database, default, ForeignKey, Model, primary_key, Table 6 | ) 7 | 8 | 9 | class Person(Model): 10 | name: primary_key(str) 11 | age: int 12 | 13 | 14 | class TestDatabase(TestCase): 15 | def test_create_table(self): 16 | db = Database() 17 | table = db.create_table("people", Table[Person]) 18 | self.assertEqual(len(table), 0) 19 | person = Person(name="Foo", age=1337) 20 | table.append(person) 21 | self.assertEqual(len(table), 1) 22 | retrieved_person = next(iter(table)) 23 | self.assertIsInstance(retrieved_person, Person) 24 | self.assertEqual(retrieved_person, person) 25 | self.assertEqual(next(iter(table.select(age=1337))), person) 26 | self.assertCountEqual(table.select(age=0), ()) 27 | 28 | def test_define_db(self): 29 | class TestDB(Database): 30 | people: Table[Person] 31 | 32 | db = TestDB() 33 | self.assertEqual(len(db.people), 0) 34 | 35 | def test_primary_key(self): 36 | self.assertEqual(Person.primary_key_name, "name") 37 | 38 | class NoPrimaryKey(Model): 39 | not_primary_key: int 40 | not_primary_key_either: float 41 | 42 | self.assertEqual(NoPrimaryKey.primary_key_name, "rowid") 43 | 44 | def test_default(self): 45 | class Number(Model): 46 | n: default(primary_key(int), 1) 47 | 48 | class TestDB(Database): 49 | numbers: Table[Number] 50 | 51 | db = TestDB() 52 | db.numbers.append(Number()) 53 | self.assertEqual(next(iter(db.numbers)), Number(1)) 54 | 55 | def test_foreign_key(self): 56 | class Height(Model): 57 | person: primary_key(ForeignKey["people", Person]) # noqa: F821 58 | height: int 59 | 60 | class TestDB(Database): 61 | people: Table[Person] 62 | heights: Table[Height] 63 | 64 | db = TestDB() 65 | person = Person(name="Foo", age=1337) 66 | db.people.append(person) 67 | db.heights.append(Height(person="Foo", height=80)) 68 | h = next(iter(db.heights)) 69 | self.assertEqual(h.person, person) 70 | 71 | def test_auto_increment(self): 72 | class Counter(Model): 73 | id: column_options(AutoIncrement, ColumnOptions(primary_key=True, auto_increment=True)) 74 | 75 | class TestDB(Database): 76 | counters: Table[Counter] 77 | 78 | db = TestDB() 79 | counter = Counter() 80 | self.assertIsInstance(counter.id, AutoIncrement) 81 | self.assertEqual(counter.id.initialized, False) 82 | self.assertTrue(any(key == "id" for key, _ in counter.uninitialized_auto_increments())) 83 | db.counters.append(counter) 84 | self.assertEqual(counter.id.initialized, True) 85 | -------------------------------------------------------------------------------- /test/test_statistics.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from fluxture.statistics import Statistics 4 | 5 | 6 | class StatisticsTest(TestCase): 7 | def test_memoization(self): 8 | stats = Statistics((1, 2, 3, 4, 5)) 9 | self.assertEqual(stats.average, sum((1, 2, 3, 4, 5)) / 5.0) 10 | self.assertEqual(stats.std_dev, 
stats.std_dev) 11 | 12 | def test_median(self): 13 | stats = Statistics((1, 2, 3, 4, 5)) 14 | self.assertEqual(stats.median, 3) 15 | stats = Statistics((1, 2, 3, 4, 5, 6)) 16 | self.assertEqual(stats.median, (3 + 4) / 2.0) 17 | -------------------------------------------------------------------------------- /test/test_types.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import random 3 | 4 | from fluxture.structures import PackableStruct 5 | from tqdm import tqdm, trange 6 | from typing import List 7 | from unittest import TestCase 8 | 9 | from fluxture.serialization import * 10 | 11 | 12 | class TestTypes(TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | cls.packable_types: List[Type[Packable]] = [ 16 | t for t in globals().values() 17 | if inspect.isclass(t) and issubclass(t, Packable) and ( 18 | not hasattr(t, "__abstractmethods__") or not t.__abstractmethods__ 19 | ) 20 | ] 21 | cls.sized_integer_types: List[Type[SizedInteger]] = [ 22 | ty for ty in cls.packable_types if issubclass(ty, SizedInteger) and ty is not SizedInteger 23 | ] 24 | 25 | def test_int_enum(self): 26 | class EnumTest(IntEnum): 27 | FOO = 0 28 | BAR = 10 29 | 30 | self.assertEqual(EnumTest.FOO, 0) 31 | self.assertEqual(EnumTest.BAR, 10) 32 | self.assertIsInstance(EnumTest.FOO, EnumTest) 33 | self.assertIsInstance(EnumTest.BAR, EnumTest) 34 | self.assertEqual(EnumTest.get("FOO"), EnumTest.FOO) 35 | self.assertEqual(EnumTest.get("BAR"), EnumTest.BAR) 36 | self.assertIs(EnumTest.get_type(), UInt8) 37 | self.assertEqual(EnumTest.DEFAULT, EnumTest.FOO) 38 | 39 | class SignedEnum(IntEnum): 40 | FOO = -1 41 | BAR = 100 42 | 43 | self.assertIs(SignedEnum.get_type(), Int8) 44 | 45 | class LargeEnum(IntEnum): 46 | FOO = -500 47 | BAR = 35000 48 | 49 | self.assertIs(LargeEnum.get_type(), Int32) 50 | 51 | def test_int_flags(self): 52 | class Flags(IntFlag): 53 | A = 0 54 | B = 1 55 | C = 2 56 | D = 4 57 | 58 | self.assertEqual(Flags.A, 0) 59 | self.assertEqual(Flags.B, 1) 60 | self.assertEqual(Flags.C, 2) 61 | self.assertEqual(Flags.D, 4) 62 | self.assertEqual(Flags.B | Flags.C, 3) 63 | 64 | def test_bad_int_enum(self): 65 | def make_bad_enum(): 66 | class BadEnum(IntEnum): 67 | NOT_INT = "foo" 68 | 69 | self.assertRaises(TypeError, make_bad_enum) 70 | 71 | def make_oversized_enum(): 72 | class OversizedEnum(IntEnum): 73 | FOO = 99999999999999999999999 74 | 75 | self.assertRaises(TypeError, make_oversized_enum) 76 | 77 | def test_sized_integers(self): 78 | for int_type in tqdm(self.sized_integer_types, desc="testing sized integers", unit=" types", leave=False): 79 | for _ in trange(1000, desc=f"testing {int_type.__name__}", unit=" tests", leave=False): 80 | value = random.randint(int_type.MIN_VALUE, int_type.MAX_VALUE) 81 | packed = int_type(value).pack() 82 | self.assertEqual(int_type.unpack(packed), value) 83 | 84 | def test_empty_struct(self): 85 | class EmptyStruct(PackableStruct): 86 | pass 87 | s = EmptyStruct() 88 | self.assertEqual(s, EmptyStruct.unpack(s.pack())) 89 | 90 | def test_struct_comparison(self): 91 | class S1(PackableStruct): 92 | a: Int32 93 | b: UnsignedChar 94 | c: UInt64 95 | 96 | class S2(PackableStruct): 97 | a: Int32 98 | b: UnsignedChar 99 | c: UInt64 100 | 101 | self.assertRaises(ValueError, S1, (0, 1)) 102 | self.assertRaises(ValueError, S1, (0, 1, 2, 3)) 103 | self.assertEqual(S1(0, 1, 2), S2(0, 1, 2)) 104 | self.assertNotEqual(S1(0, 1, 2), S2(0, 1, 3)) 105 | 106 | def test_struct_packing(self): 107 | class S3(PackableStruct): 
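# every field is fixed-size, so S3 is too: 4 (Int32) + 8 (UInt64) + 2 (Int16) = 14 packed bytes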
108 | a: Int32 109 | b: UInt64 110 | c: Int16 111 | 112 | s3 = S3(0, 1, 2) 113 | self.assertEqual(S3.unpack(s3.pack()), s3) 114 | 115 | def test_byte_arrays(self): 116 | class HasArrays(PackableStruct): 117 | a: SizedByteArray[1024] 118 | b: SizedByteArray[0] 119 | c: SizedByteArray[10] 120 | 121 | self.assertRaises(ValueError, HasArrays, b"abcd", b"defg", b"hijk") # b"defg" cannot fit in the zero-length field b 122 | has_arrays = HasArrays(b"abcd", b"", b"hijk") 123 | self.assertEqual(HasArrays.unpack(has_arrays.pack()), has_arrays) 124 | --------------------------------------------------------------------------------
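
A minimal end-to-end sketch of the serialization layer above, in the same spirit as the tests: the Peer struct is hypothetical, invented purely for illustration, but every imported name is defined in fluxture/serialization.py and fluxture/structures.py:

from fluxture.serialization import ByteOrder, IPv6Address, UInt16, UInt64
from fluxture.structures import PackableStruct


class Peer(PackableStruct):
    # fields pack in declaration order: 8 + 16 + 2 = 26 bytes
    services: UInt64
    ip: IPv6Address
    port: UInt16


peer = Peer(services=1, ip=IPv6Address("127.0.0.1"), port=8333)
data = peer.pack(ByteOrder.NETWORK)
assert len(data) == 26
assert Peer.unpack(data, ByteOrder.NETWORK) == peer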