├── .gitignore ├── LICENSE ├── README.md ├── assets ├── access_1.png ├── access_2.png ├── creating_1.png ├── creating_2.png ├── creating_3.png ├── creating_4.png ├── macrocosmos-black.png ├── macrocosmos-white.png ├── retrieval_1.png ├── retrieval_2.png └── retrieval_3.png ├── common ├── __init__.py ├── constants.py ├── data.py ├── data_v2.py ├── date_range.py ├── metagraph_syncer.py ├── old_protocol.py ├── organic_protocol.py ├── protocol.py └── utils.py ├── docs ├── apify.md ├── dd_validator_instructions.md ├── dynamic_desirability.md ├── hugging_face_validation.md ├── huggingface_setup.md ├── miner.md ├── miner_policy.md ├── on_demand.md ├── reddit.md ├── scoring.md ├── validator.md └── youtube.md ├── dynamic_desirability ├── chain_utils.py ├── constants.py ├── data.py ├── default.json ├── desirability_retrieval.py └── desirability_uploader.py ├── huggingface_utils ├── dataset_card.py ├── encoding_system.py ├── huggingface_uploader.py ├── s3_utils.py └── utils.py ├── neurons ├── __init__.py ├── config.py ├── miner.py └── validator.py ├── requirements.txt ├── rewards ├── __init__.py ├── data.py ├── data_desirability_lookup.py ├── data_value_calculator.py └── miner_scorer.py ├── scraping ├── __init__.py ├── apify.py ├── config │ ├── __init__.py │ ├── config_reader.py │ ├── model.py │ └── scraping_config.json ├── coordinator.py ├── provider.py ├── reddit │ ├── __init__.py │ ├── model.py │ ├── reddit_custom_scraper.py │ ├── reddit_lite_scraper.py │ └── utils.py ├── scraper.py ├── utils.py ├── x │ ├── __init__.py │ ├── apidojo_scraper.py │ ├── enhanced_apidojo_scraper.py │ ├── microworlds_scraper.py │ ├── model.py │ ├── on_demand_model.py │ ├── quacker_url_scraper.py │ └── utils.py └── youtube │ ├── model.py │ ├── utils.py │ └── youtube_custom_scraper.py ├── scripts └── start_validator.py ├── setup.py ├── storage ├── miner │ ├── miner_storage.py │ └── sqlite_miner_storage.py └── validator │ ├── hf_validator_storage.py │ ├── sqlite_memory_validator_storage.py │ └── validator_storage.py ├── tests ├── __init__.py ├── common │ ├── __init__.py │ ├── test_data.py │ ├── test_data_v2.py │ ├── test_metagraph_syncer.py │ ├── test_protocol.py │ └── test_utils.py ├── dynamic_desirability │ └── test_lookup_conversion.py ├── hf_validation │ ├── test_decode_url_protocol.py │ ├── test_encoding_key.json │ ├── test_reddit_dataset_validation.py │ └── test_x_dataset_validation.py ├── integration │ ├── __init__.py │ ├── test_on_demand.py │ └── test_protocol.py ├── neurons │ ├── __init__.py │ ├── test_miner_config.py │ └── test_validator_config.py ├── rewards │ ├── __init__.py │ ├── test_data_value_calculator.py │ └── test_miner_scorer.py ├── scraping │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── invalid_config.json │ │ ├── test_config_reader.py │ │ ├── test_model.py │ │ └── valid_config.json │ ├── reddit │ │ ├── __init__.py │ │ ├── test_model.py │ │ └── test_utils.py │ ├── test_coordinator.py │ ├── test_utils.py │ ├── x │ │ ├── __init__.py │ │ ├── test_model.py │ │ └── test_utils.py │ └── youtube │ │ └── test_compression.py ├── storage │ ├── __init__.py │ ├── miner │ │ ├── __init__.py │ │ └── test_sqlite_miner_storage.py │ └── validator │ │ ├── __init__.py │ │ └── test_sqlite_memory_validator_storage.py ├── test_all.py ├── utils.py └── vali_utils │ ├── __init__.py │ ├── test_miner_iterator.py │ ├── test_vali_utils.py │ └── test_validator_s3_access.py └── vali_utils ├── __init__.py ├── api ├── auth │ ├── auth.py │ └── key_routes.py ├── models.py ├── routes.py ├── server.py └── utils.py ├── 
hf_utils.py ├── load_balancer └── validator_registry.py ├── miner_evaluator.py ├── miner_iterator.py ├── utils.py └── validator_s3_access.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # VS code 7 | .vscode/ 8 | .env 9 | 10 | # Playground notebooks 11 | *Playground.ipynb 12 | 13 | # Test created DB 14 | myDb 15 | mydb 16 | *.sqlite 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # poetry 110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 114 | #poetry.lock 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | #pdm.lock 119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 120 | # in version control. 121 | # https://pdm.fming.dev/#use-with-ide 122 | .pdm.toml 123 | 124 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | /site 152 | 153 | # mypy 154 | .mypy_cache/ 155 | .dmypy.json 156 | dmypy.json 157 | 158 | # Pyre type checker 159 | .pyre/ 160 | 161 | # pytype static type analyzer 162 | .pytype/ 163 | 164 | # Cython debug symbols 165 | cython_debug/ 166 | 167 | # PyCharm 168 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 169 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 170 | # and can be added to the global gitignore or merged into this file. For a more nuclear 171 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 172 | #.idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 RusticLuftig 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /assets/access_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/access_1.png -------------------------------------------------------------------------------- /assets/access_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/access_2.png -------------------------------------------------------------------------------- /assets/creating_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_1.png -------------------------------------------------------------------------------- /assets/creating_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_2.png -------------------------------------------------------------------------------- /assets/creating_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_3.png -------------------------------------------------------------------------------- /assets/creating_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_4.png -------------------------------------------------------------------------------- /assets/macrocosmos-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/macrocosmos-black.png -------------------------------------------------------------------------------- /assets/macrocosmos-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/macrocosmos-white.png -------------------------------------------------------------------------------- /assets/retrieval_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/retrieval_1.png -------------------------------------------------------------------------------- /assets/retrieval_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/retrieval_2.png -------------------------------------------------------------------------------- /assets/retrieval_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/retrieval_3.png -------------------------------------------------------------------------------- /common/__init__.py: 
-------------------------------------------------------------------------------- 1 | # A package for common code shared between miners and validators. -------------------------------------------------------------------------------- /common/constants.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from . import utils 3 | 4 | # Collection of constants for use throughout the codebase. 5 | 6 | # How big any one data entity bucket can be to limit size over the wire. 7 | DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES = utils.mb_to_bytes(128) 8 | 9 | # How many data entity buckets any one miner index can have to limit necessary storage on the validators. 10 | DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX = 200_000 11 | DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_3 = 250_000 12 | DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_4 = 350_000 13 | 14 | # How big the collection of contents can be to limit size over the wire. 15 | BULK_CONTENTS_SIZE_LIMIT_BYTES = utils.mb_to_bytes(128) 16 | BULK_CONTENTS_COUNT_LIMIT = 200_000 17 | 18 | # How many different buckets can be requests at once. 19 | BULK_BUCKETS_COUNT_LIMIT = 100 20 | 21 | # How old a data entity bucket can be before the validators do not assign any value for them. 22 | DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS = 30 23 | 24 | # The maximum number of characters a label can have. 25 | MAX_LABEL_LENGTH = 140 26 | 27 | # The current protocol version (int) 28 | PROTOCOL_VERSION = 4 29 | 30 | # Min evaluation period that must pass before a validator re-evaluates a miner. 31 | MIN_EVALUATION_PERIOD = dt.timedelta(minutes=60) 32 | 33 | # Miner compressed index cache freshness. 34 | MINER_CACHE_FRESHNESS = dt.timedelta(minutes=20) 35 | 36 | # Date after which only x.com URLs are accepted 37 | NO_TWITTER_URLS_DATE = dt.datetime(2024, 12, 28, tzinfo=dt.timezone.utc) # December 28, 2024 UTC 38 | 39 | # Date after which media content is required for tweets that contain media 40 | MEDIA_REQUIRED_DATE = dt.datetime(2025, 5, 23, tzinfo=dt.timezone.utc) # May 23, 2025 UTC 41 | BYTE_ALLOWANCE_DATE = dt.datetime(2025, 6, 7, tzinfo=dt.timezone.utc) # June 7, 2025 UTC 42 | EVALUATION_ON_STARTUP = 15 43 | -------------------------------------------------------------------------------- /common/data_v2.py: -------------------------------------------------------------------------------- 1 | """data.py contains the original data structures used for the project. 2 | 3 | data_v2.py contains the newer data structures used, which are more performant. 4 | 5 | From the original data structures we learned: 6 | 1. Pydantic adds a huge overhead for performance, particularly when creating > 1M objects 7 | 2. Object nesting has notable performance overhead 8 | 9 | Hence, with the V2 models, we make trade-off the nicer coding symantics in exchange for better performance. 10 | 11 | If a class needs to be included as a Field in a pydantic BaseModel, it should be a dataclass (which adds a small overhead), 12 | because pydantic know how to serialize dataclasses, as long as all fields are themselves JSON serializable. 13 | 14 | As a rule of thumb: 15 | 1. If the class needs to perform validation on fields, use a class with a custom __init__, __eq__, and __hash__. 16 | 2. Always use __slots__. 
17 | """ 18 | 19 | import datetime as dt 20 | from pydantic import BaseModel, Field, ConfigDict 21 | from typing import List, Optional 22 | 23 | from common import constants 24 | from common.data import ( 25 | DataEntityBucket, 26 | DataEntityBucketId, 27 | DataLabel, 28 | DataSource, 29 | TimeBucket, 30 | ) 31 | 32 | 33 | class ScorableDataEntityBucket: 34 | """Composes both a DataEntityBucket and additional information required for scoring. 35 | 36 | Attributes: 37 | scorable_bytes: Scorable bytes are the bytes that can be credited to this miner for scoring. 38 | This is always less than or equal to the total size of the chunk. 39 | This scorable bytes are computed as: 40 | 1 byte for every byte in size_bytes that no other miner has in their index. 41 | 1 byte / # of miners that have this chunk in their index for every byte in size_bytes 42 | that at least one other miner has in their index. 43 | """ 44 | 45 | __slots__ = "time_bucket_id", "source", "label", "size_bytes", "scorable_bytes" 46 | 47 | def __init__( 48 | self, 49 | time_bucket_id: int, 50 | source: DataSource, 51 | label: Optional[str], 52 | size_bytes: int, 53 | scorable_bytes: int, 54 | ): 55 | if label and len(label) > constants.MAX_LABEL_LENGTH: 56 | raise ValueError("Label value cannot be longer than 140 characters.") 57 | if not 0 <= size_bytes <= constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES: 58 | raise ValueError( 59 | f"Size must be between 0 and {constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES}." 60 | ) 61 | if not 0 <= scorable_bytes <= constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES: 62 | raise ValueError( 63 | f"Scorable bytes must be between 0 and {constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES}." 64 | ) 65 | if scorable_bytes > size_bytes: 66 | raise ValueError( 67 | f"Scorable bytes cannot be greater than size bytes. Scorable bytes: {scorable_bytes}, size bytes: {size_bytes}." 68 | ) 69 | 70 | self.time_bucket_id = time_bucket_id 71 | self.source = source 72 | self.label = label.casefold() if label else None 73 | self.size_bytes = size_bytes 74 | self.scorable_bytes = scorable_bytes 75 | 76 | def __repr__(self): 77 | return f"ScorableDataEntityBucket(time_bucket_id={self.time_bucket_id}, source={self.source}, label={self.label}, size_bytes={self.size_bytes}, scorable_bytes={self.scorable_bytes})" 78 | 79 | def __eq__(self, other): 80 | return ( 81 | self.time_bucket_id == other.time_bucket_id 82 | and self.source == other.source 83 | and self.label == other.label 84 | and self.size_bytes == other.size_bytes 85 | and self.scorable_bytes == other.scorable_bytes 86 | ) 87 | 88 | def __hash__(self): 89 | return hash( 90 | ( 91 | self.time_bucket_id, 92 | self.source, 93 | self.label, 94 | self.size_bytes, 95 | self.scorable_bytes, 96 | ) 97 | ) 98 | 99 | def to_data_entity_bucket(self) -> DataEntityBucket: 100 | return DataEntityBucket( 101 | id=DataEntityBucketId( 102 | time_bucket=TimeBucket(id=self.time_bucket_id), 103 | source=self.source, 104 | label=DataLabel(value=self.label) if self.label else None, 105 | ), 106 | size_bytes=self.size_bytes, 107 | ) 108 | 109 | 110 | class ScorableMinerIndex(BaseModel): 111 | """The Miner index, with additional information required for scoring. 112 | 113 | Use a pydantic model for this class, because we only create 1 per miner, 114 | so the additional overhead is acceptable. 
115 | """ 116 | 117 | model_config = ConfigDict( 118 | arbitrary_types_allowed=True, 119 | frozen=True 120 | ) 121 | 122 | scorable_data_entity_buckets: List[ScorableDataEntityBucket] = Field( 123 | description="DataEntityBuckets the miner is serving, scored on uniqueness.", 124 | max_length=constants.DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_4, 125 | ) 126 | last_updated: dt.datetime = Field(description="Time last updated in UTC.") -------------------------------------------------------------------------------- /common/date_range.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime as dt 3 | 4 | 5 | @dataclasses.dataclass(frozen=True) 6 | class DateRange: 7 | """Represents a specific time range from start time inclusive to end time exclusive.""" 8 | 9 | # The start time inclusive of the time range. 10 | start: dt.datetime 11 | 12 | # The end time exclusive of the time range. 13 | end: dt.datetime 14 | 15 | def contains(self, datetime: dt.datetime) -> bool: 16 | """Returns True if the provided datetime is within this DateRange.""" 17 | return self.start <= datetime < self.end 18 | -------------------------------------------------------------------------------- /common/metagraph_syncer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import dataclasses 3 | from dataclasses import field 4 | from datetime import datetime 5 | import functools 6 | import bittensor as bt 7 | from typing import Dict, List, Callable, Optional 8 | import threading 9 | import traceback 10 | 11 | from common import utils 12 | 13 | 14 | class MetagraphSyncer: 15 | @dataclasses.dataclass 16 | class _State: 17 | metagraph: Optional[bt.metagraph] = None 18 | last_synced_time: Optional[datetime] = None 19 | listeners: List = field(default_factory=list) 20 | 21 | def __init__(self, subtensor: bt.subtensor, config: Dict[int, int]): 22 | """Constructs a new MetagraphSyncer, that periodically refreshes metagraph defined in the config. 23 | 24 | Args: 25 | subtensor (bt.subtensor): The subtensor used to fetch the metagraphs. 26 | config (Dict[int, int]): A mapping of netuid to the cadence (in seconds) to sync the metagraph. 27 | """ 28 | self.subtensor = subtensor 29 | self.config = config 30 | self.metagraph_map: Dict[int, MetagraphSyncer._State] = { 31 | netuid: MetagraphSyncer._State() for netuid in config.keys() 32 | } 33 | self.is_running = False 34 | self.done_initial_sync = False 35 | self.lock = threading.RLock() 36 | 37 | bt.logging.info(f"MetagraphSyncer created with config: {config}") 38 | 39 | def do_initial_sync(self): 40 | """Performs an initial sync of all metagraphs. 41 | 42 | Unlike regular syncs, this will not notify listeners of the updated metagraph. 
43 | """ 44 | bt.logging.debug("Metagraph syncer do_initial_sync called") 45 | 46 | for netuid in self.config.keys(): 47 | fn = functools.partial(self.subtensor.metagraph, netuid) 48 | metagraph = utils.run_in_thread(fn, ttl=120, name=f"InitalSync-{netuid}") 49 | with self.lock: 50 | state = self.metagraph_map[netuid] 51 | state.metagraph = metagraph 52 | state.last_synced_time = datetime.now() 53 | 54 | bt.logging.debug(f"Successfully loaded metagraph for {netuid}") 55 | 56 | self.done_initial_sync = True 57 | 58 | def start(self): 59 | bt.logging.debug("Metagraph syncer start called") 60 | 61 | assert self.done_initial_sync, "Must call do_initial_sync before starting" 62 | 63 | self.is_running = True 64 | thread = threading.Thread(target=self._run, daemon=True) 65 | thread.start() 66 | 67 | async def _sync_metagraph_loop(self, netuid: int, cadence: int): 68 | while self.is_running: 69 | # On start, wait cadence before the first sync. 70 | bt.logging.trace(f"Syncing metagraph for {netuid} in {cadence} seconds.") 71 | await asyncio.sleep(cadence) 72 | 73 | try: 74 | # Intentionally block the shared thread so that we only 75 | # sync 1 metagraph at a time. 76 | bt.logging.trace(f"Syncing metagraph for {netuid}.") 77 | metagraph = utils.run_in_thread( 78 | functools.partial(self.subtensor.metagraph, netuid), 79 | ttl=120, 80 | name=f"Sync-{netuid}", 81 | ) 82 | bt.logging.trace(f"Successfully synced metagraph for {netuid}.") 83 | state = None 84 | with self.lock: 85 | # Store metagraph and sync time 86 | state = self.metagraph_map[netuid] 87 | state.metagraph = metagraph 88 | state.last_synced_time = datetime.now() 89 | 90 | self._notify_listeners(state, netuid) 91 | except (BaseException, Exception) as e: 92 | bt.logging.error( 93 | f"Error when syncing metagraph for {netuid}: {e}. Retrying in 60 seconds." 94 | ) 95 | await asyncio.sleep(60) 96 | 97 | async def _run_async(self): 98 | # For each netuid we should sync metagraphs for, spawn a Task to sync it. 99 | await asyncio.wait( 100 | [ 101 | asyncio.create_task(self._sync_metagraph_loop(netuid, cadence)) 102 | for netuid, cadence in self.config.items() 103 | ], 104 | return_when=asyncio.ALL_COMPLETED, 105 | ) 106 | 107 | def _run(self): 108 | try: 109 | asyncio.run(self._run_async()) 110 | finally: 111 | bt.logging.info("MetagraphSyncer _run complete.") 112 | 113 | def register_listener( 114 | self, listener: Callable[[bt.metagraph, int], None], netuids: List[int] 115 | ): 116 | """Registers a listener to be notified when a metagraph for any netuid in netuids is updated. 117 | 118 | The listener will be called from a different thread, so it must be thread-safe. 119 | """ 120 | if not netuids: 121 | raise ValueError("Must provide at least 1 netuid") 122 | 123 | with self.lock: 124 | for netuid in netuids: 125 | if netuid not in self.metagraph_map: 126 | raise ValueError( 127 | f"Metagraph for {netuid} not being tracked in MetagraphSyncer." 128 | ) 129 | self.metagraph_map[netuid].listeners.append(listener) 130 | 131 | def get_metagraph(self, netuid: int) -> bt.metagraph: 132 | """Returns the last synced version of the metagraph for netuid.""" 133 | with self.lock: 134 | if netuid not in self.metagraph_map: 135 | raise ValueError( 136 | f"Metagraph for {netuid} not known to MetagraphSyncer." 
137 | ) 138 | metagraph = self.metagraph_map[netuid].metagraph 139 | if not metagraph: 140 | raise ValueError(f"Metagraph for {netuid} has not been synced yet.") 141 | return metagraph 142 | 143 | def _notify_listeners(self, state: _State, netuid: int): 144 | """Notifies listeners of a new metagraph for netuid.""" 145 | bt.logging.debug(f"Notifying listeners of update to metagraph for {netuid}.") 146 | 147 | for listener in state.listeners: 148 | try: 149 | listener(state.metagraph, netuid) 150 | except Exception: 151 | bt.logging.error( 152 | f"Exception caught notifying {netuid} listener of metagraph update.\n{traceback.format_exc()}" 153 | ) 154 | -------------------------------------------------------------------------------- /common/old_protocol.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2023 data-universe 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import bittensor as bt 19 | import pydantic 20 | from common import constants 21 | from common.data import DataEntityBucket, DataEntity, DataEntityBucketId 22 | from typing import List, Optional 23 | 24 | 25 | class GetMinerIndex(bt.Synapse): 26 | """ 27 | Protocol by which Validators can retrieve the Index from a Miner. 28 | 29 | Attributes: 30 | - data_entity_buckets: A list of DataEntityBucket objects that the Miner can serve. 31 | """ 32 | 33 | # Required request output, filled by receiving axon. 34 | data_entity_buckets: List[DataEntityBucket] = pydantic.Field( 35 | title="data_entity_buckets", 36 | description="All of the data entity buckets that a Miner can serve.", 37 | frozen=False, 38 | repr=False, 39 | max_items=constants.DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX, 40 | default_factory=list, 41 | ) 42 | 43 | 44 | class GetDataEntityBucket(bt.Synapse): 45 | """ 46 | Protocol by which Validators can retrieve the DataEntities of a Bucket from a Miner. 47 | 48 | Attributes: 49 | - bucket_id: The id of the bucket that the requester is asking for. 50 | - data_entities: A list of DataEntity objects that make up the requested DataEntityBucket. 51 | """ 52 | 53 | # Required request input, filled by sending dendrite caller. 
54 | data_entity_bucket_id: Optional[DataEntityBucketId] = pydantic.Field( 55 | title="data_entity_bucket_id", 56 | description="The identifier for the requested DataEntityBucket.", 57 | frozen=True, 58 | repr=False, 59 | default=None, 60 | ) 61 | 62 | # Required request output, filled by recieving axon. 63 | data_entities: List[DataEntity] = pydantic.Field( 64 | title="data_entities", 65 | description="All of the data that makes up the requested DataEntityBucket.", 66 | frozen=False, 67 | repr=False, 68 | default_factory=list, 69 | ) 70 | 71 | 72 | # TODO Protocol for Users to Query Data which will accept query parameters such as a startDatetime, endDatetime. 73 | -------------------------------------------------------------------------------- /common/organic_protocol.py: -------------------------------------------------------------------------------- 1 | import bittensor as bt 2 | from typing import List, Dict, Any, Optional 3 | from common.data import DataSource 4 | 5 | 6 | class OrganicRequest(bt.Synapse): 7 | """Direct query synapse for organic data requests""" 8 | 9 | # Input fields 10 | source: str 11 | usernames: List[str] = [] 12 | keywords: List[str] = [] 13 | start_date: Optional[str] = None 14 | end_date: Optional[str] = None 15 | limit: int = 100 16 | 17 | # Output fields 18 | data: List[Dict[str, Any]] = [] 19 | meta: Dict[str, Any] = {} 20 | status: str = "pending" 21 | 22 | def deserialize(self) -> Dict[str, Any]: 23 | """Convert synapse to dictionary for response""" 24 | return { 25 | "status": self.status, 26 | "data": self.data, 27 | "meta": self.meta 28 | } -------------------------------------------------------------------------------- /common/protocol.py: -------------------------------------------------------------------------------- 1 | import bittensor as bt 2 | from pydantic import Field, ConfigDict, field_validator 3 | from common.data import ( 4 | DataSource, 5 | DataEntityBucket, 6 | DataEntity, 7 | DataEntityBucketId, 8 | HuggingFaceMetadata 9 | ) 10 | from typing import Dict, List, Optional, Tuple 11 | 12 | 13 | class BaseProtocol(bt.Synapse): 14 | model_config = ConfigDict( 15 | arbitrary_types_allowed=True, 16 | validate_assignment=True 17 | ) 18 | 19 | version: Optional[int] = Field( 20 | description="Protocol version", 21 | default=None 22 | ) 23 | 24 | 25 | class GetMinerIndex(BaseProtocol): 26 | """ 27 | Protocol by which Validators can retrieve the Index from a Miner. 28 | 29 | Attributes: 30 | - data_entity_buckets: A list of DataEntityBucket objects that the Miner can serve. 31 | """ 32 | 33 | # We opt to send the compressed index in pre-serialized form to have full control 34 | # over serialization and deserialization, rather than relying on fastapi and bittensors 35 | # interactions with pydantic serialization, which can be problematic for certain types. 36 | compressed_index_serialized: Optional[str] = Field( 37 | description="The compressed index of the Miner of type CompressedMinerIndex.", 38 | frozen=False, 39 | repr=False, 40 | default=None, 41 | ) 42 | 43 | 44 | class GetDataEntityBucket(BaseProtocol): 45 | """ 46 | Protocol by which Validators can retrieve the DataEntities of a Bucket from a Miner. 47 | 48 | Attributes: 49 | - bucket_id: The id of the bucket that the requester is asking for. 50 | - data_entities: A list of DataEntity objects that make up the requested DataEntityBucket. 
51 | """ 52 | 53 | data_entity_bucket_id: Optional[DataEntityBucketId] = Field( 54 | title="data_entity_bucket_id", 55 | description="The identifier for the requested DataEntityBucket.", 56 | frozen=True, 57 | repr=False, 58 | default=None, 59 | ) 60 | 61 | data_entities: List[DataEntity] = Field( 62 | title="data_entities", 63 | description="All of the data that makes up the requested DataEntityBucket.", 64 | frozen=False, 65 | repr=False, 66 | default_factory=list, 67 | ) 68 | 69 | 70 | class GetContentsByBuckets(BaseProtocol): 71 | """ 72 | Protocol by which Validators can retrieve contents from one or more Miner Buckets. 73 | After March 1st all contents have their creation timestamp obfuscated to the minute. 74 | 75 | Attributes: 76 | - bucket_ids: The ids of the buckets that the requester is asking for. 77 | - bucket_ids_to_contents: A dict of DataEntityBucketId objects to a list of contained contents. 78 | """ 79 | 80 | data_entity_bucket_ids: Optional[List[DataEntityBucketId]] = Field( 81 | title="data_entity_bucket_ids", 82 | description="The identifiers for the requested DataEntityBuckets.", 83 | frozen=True, 84 | repr=False, 85 | default=None, 86 | ) 87 | 88 | bucket_ids_to_contents: List[Tuple[DataEntityBucketId, List[bytes]]] = Field( 89 | title="bucket_ids_to_contents", 90 | description="A list of bucket ids to the contents contained by that bucket. Each DataEntityBucketId appears at most once. This is just a flattened dictionary.", 91 | frozen=False, 92 | repr=False, 93 | default_factory=list, 94 | ) 95 | 96 | 97 | class GetHuggingFaceMetadata(BaseProtocol): 98 | """ 99 | Protocol by which Validators can retrieve HuggingFace metadata from a Miner. 100 | """ 101 | 102 | metadata: List[HuggingFaceMetadata] = Field( 103 | title="metadata", 104 | description="List of HuggingFace metadata entries.", 105 | default_factory=list 106 | ) 107 | 108 | 109 | class DecodeURLRequest(BaseProtocol): 110 | """ 111 | Protocol by which Validators can request URL decoding from a Miner. 
112 | 113 | Attributes: 114 | - encoded_urls: A list of encoded URL strings to be decoded 115 | - decoded_urls: A list of decoded URL strings returned by the miner 116 | """ 117 | 118 | encoded_urls: List[str] = Field( 119 | title="encoded_urls", 120 | description="List of encoded URLs that need to be decoded", 121 | frozen=True, 122 | repr=False, 123 | default_factory=list, 124 | max_length=10 # Changed from validator to direct Field constraint 125 | ) 126 | 127 | decoded_urls: List[str] = Field( 128 | title="decoded_urls", 129 | description="List of decoded URLs corresponding to the encoded URLs", 130 | frozen=False, 131 | repr=False, 132 | default_factory=list, 133 | ) 134 | 135 | 136 | class OnDemandRequest(BaseProtocol): 137 | """Protocol for on-demand data retrieval requests""" 138 | 139 | # Request parameters 140 | source: Optional[DataSource] = Field( 141 | default=None, 142 | description="Source to query (X or Reddit)" 143 | ) 144 | 145 | usernames: List[str] = Field( 146 | default_factory=list, 147 | description="Usernames to fetch data from", 148 | max_length=10 149 | ) 150 | 151 | keywords: List[str] = Field( 152 | default_factory=list, 153 | description="Keywords/hashtags to search for", 154 | max_length=5 155 | ) 156 | 157 | start_date: Optional[str] = Field( 158 | default=None, 159 | description="Start date (ISO format)" 160 | ) 161 | 162 | end_date: Optional[str] = Field( 163 | default=None, 164 | description="End date (ISO format)" 165 | ) 166 | 167 | limit: int = Field( 168 | default=100, 169 | ge=1, 170 | le=1000, 171 | description="Maximum items to return" 172 | ) 173 | 174 | # Response fields 175 | data: List[DataEntity] = Field( 176 | default_factory=list, 177 | description="Retrieved data" 178 | ) 179 | 180 | version: Optional[int] = Field( 181 | default=None, 182 | description="Protocol version" 183 | ) 184 | 185 | # How many times validators can send requests per validation period. 186 | REQUEST_LIMIT_BY_TYPE_PER_PERIOD = { 187 | GetMinerIndex: 1, 188 | GetDataEntityBucket: 1, 189 | GetContentsByBuckets: 5, 190 | DecodeURLRequest: 2, 191 | GetHuggingFaceMetadata: 1, 192 | OnDemandRequest: 5, 193 | } -------------------------------------------------------------------------------- /docs/apify.md: -------------------------------------------------------------------------------- 1 | # Apify 2 | 3 | [Apify](http://apify.com) is a popular platform and market place for web scraping tools. 4 | 5 | Data Universe uses Apify to scrape certain DataSources. At this time, all Validators and Miners are required to use Apify. In future, Apify will become optional for Miners, depending on the DataSources they scrape from. 6 | 7 | ## Setting your API Token 8 | 9 | 1. Create an Apify account 10 | 2. Got to your Console -> Settings -> Integrations and copy your Personal API token 11 | 3. Create a file named `.env` in the `data-universe` directory if it doesn't already exist and add the following to it: 12 | ```py 13 | APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" 14 | ``` -------------------------------------------------------------------------------- /docs/hugging_face_validation.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Dataset Validation on Bittensor Subnet 13 2 | 3 | This document outlines the validation process for Hugging Face datasets in the context of Bittensor Subnet 13. 
Validators on Subnet 13 are responsible for ensuring that miners are providing accurate and up-to-date datasets, specifically for X (formerly Twitter) and Reddit. The validation process is crucial for maintaining the integrity and utility of the network. 4 | 5 | **Note:** The functionality described here will be integrated into the `MinerEvaluator` in the future. 6 | 7 | ## Overview 8 | 9 | The validation process involves the following key steps: 10 | 11 | 1. **Querying Hugging Face Metadata from Miners** 12 | 2. **Selecting Random Files from the Latest Data Commit** 13 | 3. **Checking Dataset Updates and Size Changes** 14 | 4. **Validating Data Samples** 15 | 5. **Adjusting Miner Credibility Based on Validation Results** 16 | 17 | ## Validation Process Details 18 | 19 | ### 1. Querying Hugging Face Metadata from Miners 20 | 21 | Every **55,000 blocks**, validators query the `HuggingFaceMetadata` table from each miner. This metadata includes information about the datasets that miners have uploaded to Hugging Face, specifically focusing on the newest datasets for X and Reddit. 22 | 23 | ```python 24 | # Example of querying Hugging Face metadata 25 | async def _query_huggingface_metadata(self, hotkey: str, uid: int, miner_axon: bt.AxonInfo) -> Optional[List[HuggingFaceMetadata]]: 26 | # ... code to query metadata ... 27 | ``` 28 | 29 | ### 2. Selecting Random Files from the Latest Data Commit 30 | 31 | Validators select **10 random rows** from the latest data commit of the miner's Hugging Face dataset. This selection ensures that the validation covers recent data and that miners are continually updating their datasets. 32 | 33 | ```python 34 | # Function to select random rows from a dataset 35 | def select_random_rows_from_parquet(repo_id: str, num_rows: int = 10) -> pd.DataFrame: 36 | # ... code to select random rows ... 37 | ``` 38 | 39 | ### 3. Checking Dataset Updates and Size Changes 40 | 41 | Validators assess how frequently the dataset is updated and monitor changes in its size over time. This step ensures that miners are actively accumulating data in their repositories and contributing to the network's growth. 42 | 43 | - **Update Frequency:** Validators check the timestamps of data commits to verify regular updates. 44 | - **Size Changes:** Validators compare the sizes of successive data commits to confirm that new data is being added. 45 | 46 | ### 4. Validating Data Samples 47 | 48 | For each selected file, validators perform the following: 49 | 50 | - **Data Retrieval:** Fetch the data samples from the selected files. 51 | - **Data Verification:** Use appropriate scrapers to validate the correctness of the data. 52 | 53 | The validation criteria are: 54 | 55 | - **Reddit Dataset:** A validation ratio of **0.5**. If at least 5 out of 10 samples are valid, the validation is considered successful. 56 | - **X Dataset:** A validation ratio of **0.6**. If at least 6 out of 10 samples are valid, the validation is considered successful. 57 | 58 | ```python 59 | # Example of validating data samples 60 | async def main(): 61 | # ... code to validate data samples ... 62 | valid = await scraper.validate_hf(entities=selected_rows) 63 | # ... process validation results ... 64 | ``` 65 | 66 | ### 5. Adjusting Miner Credibility Based on Validation Results 67 | 68 | Based on the validation outcome, validators adjust the miner's credibility: 69 | 70 | - **Successful Validation:** Increase the miner's credibility score by **10%**. 
71 | - **Failed Validation:** Decrease the miner's credibility score by **10%**. 72 | 73 | This adjustment incentivizes miners to provide high-quality, up-to-date datasets. 74 | 75 | ```python 76 | # Adjusting miner credibility 77 | if validation_successful: 78 | self.scorer.increase_credibility(uid, percentage=10) 79 | else: 80 | self.scorer.decrease_credibility(uid, percentage=10) 81 | ``` 82 | 83 | ## Future Integration into MinerEvaluator 84 | 85 | The `MinerEvaluator` will be updated to include this validation process. The planned changes involve: 86 | 87 | - Implementing the Hugging Face dataset validation as a separate component within the `MinerEvaluator`. 88 | - Scheduling the validation process to occur every **55,000 blocks**. 89 | - Incorporating the credibility adjustments based on validation outcomes. 90 | 91 | **Note:** The existing validation steps for data entity buckets will remain, but the Hugging Face dataset validation will be handled separately to ensure a focused and efficient validation process. 92 | 93 | ## Code Structure 94 | 95 | - **MinerEvaluator Class:** Responsible for evaluating miners and updating their scores. 96 | - **Hugging Face Validation Module:** Contains functions to select random rows from datasets and validate them. 97 | - **ScraperProvider:** Supplies the appropriate scraper for data validation (e.g., X or Reddit scrapers). 98 | 99 | ## Conclusion 100 | 101 | This validation process is designed to ensure that miners on Bittensor Subnet 13 contribute valuable and accurate datasets to the network. By regularly validating datasets and adjusting miner credibility accordingly, the network maintains high data quality standards. 102 | 103 | ## References 104 | 105 | - **Bittensor Documentation:** [https://bittensor.com/](https://bittensor.com/) 106 | - **Hugging Face Datasets:** [https://huggingface.co/datasets](https://huggingface.co/datasets) 107 | 108 | -------------------------------------------------------------------------------- /docs/huggingface_setup.md: -------------------------------------------------------------------------------- 1 | 2 | ### README: Configuring Access for Uploading Data to Hugging Face Datasets 3 | 4 | #### Creating and Configuring Your Hugging Face Access Token 5 | 6 | To upload datasets to Hugging Face, you'll need an access token with the appropriate permissions. Follow these steps to create and configure your token: 7 | 8 | #### Step 1: Create a Hugging Face Account 9 | If you haven't already, create an account at [Hugging Face's website](https://huggingface.co/join). 10 | 11 | #### Step 2: Generate an Access Token 12 | 1. Log into your Hugging Face account. 13 | 2. Navigate to your account settings by clicking on your profile picture in the upper right corner, then select 'Settings'. 14 | 3. Go to the 'Access Tokens' section. 15 | 4. Click on 'New Token'. 16 | 5. Name your token and select the appropriate role. To upload datasets, choose the "Write" role which allows you to upload and modify datasets. 17 | 6. Click 'Create a token'. 18 | 19 | #### Step 3: Configure the Token in Your Environment 20 | 1. Copy the generated token. 21 | 2. Open or create a `.env` file in the root directory of your project. 22 | 3. Add the following line to your `.env` file: 23 | 24 | ``` 25 | HUGGINGFACE_TOKEN=<your_token_here> 26 | ``` 27 | 28 | Replace `<your_token_here>` with the token you copied in the previous step.
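Before moving on, you can optionally sanity-check that the token loads and authenticates correctly. This is a minimal illustrative sketch (not part of the official setup), assuming `huggingface_hub` and `python-dotenv` are installed:

```python
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi

load_dotenv()  # reads HUGGINGFACE_TOKEN from the .env file created above

# whoami() raises an error if the token is missing or invalid.
api = HfApi(token=os.getenv("HUGGINGFACE_TOKEN"))
info = api.whoami()
print(f"Authenticated to Hugging Face as: {info.get('name')}")
```

If this prints your Hugging Face username, the token is configured correctly and the upload code in Step 4 will be able to use it.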
29 | 30 | #### Step 4: Utilize the Token in Your Software 31 | Ensure that your software is configured to read the `HF_TOKEN` from the environment variables. This is typically handled in your Python script as follows: 32 | 33 | ```python 34 | import os 35 | from huggingface_hub import HfApi, HfFolder 36 | from dotenv import load_dotenv 37 | 38 | load_dotenv() 39 | # Ensure the token is loaded from the .env file 40 | api = HfApi(token=os.getenv('HUGGINGFACE_TOKEN')) 41 | ``` 42 | 43 | #### Finalizing Setup 44 | After configuring the token in your `.env` file, your miner should be able to authenticate with Hugging Face and upload datasets without requiring further login steps. 45 | 46 | ### Additional Information 47 | - Keep your token secure and do not share it publicly. 48 | - If you need to regenerate your token, repeat the steps above to generate a new one and update your `.env` file accordingly. 49 | -------------------------------------------------------------------------------- /docs/on_demand.md: -------------------------------------------------------------------------------- 1 | # On-Demand Data Request Implementation 2 | 3 | ## Overview 4 | On-demand data retrieval is ALREADY IMPLEMENTED in both validator and miner templates. This enhanced version now provides richer metadata for X/Twitter content while maintaining the original Reddit implementation. 5 | 6 | ## For Miners 7 | 8 | ### X/Twitter Scraping (Enhanced) 9 | The enhanced implementation uses `EnhancedApiDojoTwitterScraper` for X/Twitter which provides: 10 | 11 | - **Rich User Metadata** 12 | - User ID, display name, verification status 13 | - Follower/following counts 14 | 15 | - **Complete Tweet Information** 16 | - Engagement metrics (likes, retweets, replies, quotes, views) 17 | - Tweet type classification (reply, quote, retweet) 18 | - Conversation context and threading information 19 | 20 | - **Media Content** 21 | - Media URLs and content types 22 | - Support for photos and videos 23 | 24 | - **Advanced Formatting** 25 | - Properly ordered hashtags and cashtags 26 | - Full conversation context 27 | 28 | ### Reddit Scraping (Unchanged) 29 | The Reddit implementation remains the same, using the Reddit API. 30 | 31 | ## Implementation Options 32 | 33 | You can: 34 | - Use the enhanced implementation as-is (recommended) 35 | - Modify `handle_on_demand` in miner.py to use your own scrapers 36 | - Build custom scraping logic while maintaining the same request/response format 37 | 38 | ### Integration Steps: 39 | 40 | 1. **Simple Integration**: Import the enhanced scraper and provider: 41 | ```python 42 | from scraping.x.enhanced_apidojo_scraper import EnhancedApiDojoTwitterScraper 43 | from scraping.x.on_demand_model import EnhancedXContent 44 | ``` 45 | 46 | 2. **Update your scraper provider**: 47 | ```python 48 | # Create enhanced scraper provider 49 | scraper_provider = EnhancedScraperProvider() 50 | ``` 51 | 52 | 3. 
**Enjoy richer data**: The enhanced content is automatically used for X/Twitter requests 53 | 54 | ## Rewards 55 | - Top 50% of miners by stake participate in validation 56 | - 50% chance of validation per request 57 | - Successful validation: +1% credibility 58 | - Failed validation: Proportional credibility decrease 59 | 60 | ## Response Format Example 61 | 62 | ```json 63 | { 64 | "uri": "https://x.com/username/status/123456789", 65 | "datetime": "2025-03-17T12:34:56+00:00", 66 | "source": "X", 67 | "label": "#bitcoin", 68 | "content": "Tweet text content...", 69 | "user": { 70 | "username": "@username", 71 | "display_name": "User Display Name", 72 | "id": "12345678", 73 | "verified": true, 74 | "followers_count": 10000, 75 | "following_count": 1000 76 | }, 77 | "tweet": { 78 | "id": "123456789", 79 | "like_count": 500, 80 | "retweet_count": 100, 81 | "reply_count": 50, 82 | "quote_count": 25, 83 | "hashtags": ["#bitcoin", "#crypto"], 84 | "is_retweet": false, 85 | "is_reply": false, 86 | "is_quote": true, 87 | "conversation_id": "123456789" 88 | }, 89 | "media": [ 90 | {"url": "https://pbs.twimg.com/media/image1.jpg", "type": "photo"}, 91 | {"url": "https://video.twimg.com/video1.mp4", "type": "video"} 92 | ] 93 | } 94 | ``` 95 | 96 | That's it! The enhanced system is ready to use, providing significantly richer data while maintaining compatibility with existing implementations. 🚀 -------------------------------------------------------------------------------- /docs/reddit.md: -------------------------------------------------------------------------------- 1 | # Reddit 2 | 3 | [Reddit](https://reddit.com) is one source that Data Universe can pull from. 4 | 5 | In addition to the [Apify actor based scraping](apify.md) we also support using a personal reddit account. 6 | 7 | ## Getting a reddit account. 8 | 9 | If you already have a reddit account you can use that one. Otherwise [sign up](https://www.reddit.com/register/) for one (must support password based auth). 10 | 11 | ## Setting up your account for use with a script type app. 12 | 13 | Follow the [OAuth2 First Steps guide](https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps) to add a script type app to your account and find the associated app client id and app client secret. Do not share your client secret with anyone. 14 | 15 | ## Providing your information to your miner or validator. 16 | 17 | Create a file named`.env` in the `data-universe` directory if it doesn't already exist and add the following to it: 18 | ```py 19 | REDDIT_CLIENT_ID="YOUR_REDDIT_CLIENT_ID" 20 | REDDIT_CLIENT_SECRET="YOUR_REDDIT_CLIENT_SECRET" 21 | REDDIT_USERNAME="YOUR_REDDIT_USERNAME" 22 | REDDIT_PASSWORD="YOUR_REDDIT_PASSWORD" 23 | ``` -------------------------------------------------------------------------------- /docs/scoring.md: -------------------------------------------------------------------------------- 1 | # Bittensor Miner Evaluation System 2 | 3 | ## Overview 4 | 5 | This document outlines the key components of the Bittensor miner evaluation system: Credibility, Score, and Incentive. These components work together to create a fair and effective reward mechanism for miners in the network. 6 | 7 | ## Key Components 8 | 9 | ### 1. 
Credibility 10 | 11 | | Aspect | Description | 12 | |--------|-------------| 13 | | Range | 0 to 1 | 14 | | Purpose | Measures the miner's long-term reliability and consistency | 15 | | Calculation | `new_credibility = α * current_validation_result + (1 - α) * old_credibility` | 16 | | Characteristics | Slowly changes over time, reflecting consistent performance | 17 | 18 | ### 2. Score 19 | 20 | | Aspect | Description | 21 | |--------|-------------| 22 | | Range | Any non-negative number | 23 | | Purpose | Represents the value of data provided in a single evaluation | 24 | | Calculation | `raw_score = data_type_scale_factor * time_scalar * scorable_bytes`
`final_score = raw_score * (credibility ^ 2.5)` | 25 | | Characteristics | Can vary significantly between evaluations | 26 | 27 | ### 3. Incentive (Reward) 28 | 29 | | Aspect | Description | 30 | |--------|-------------| 31 | | Range | Proportional to the miner's share of the total network score | 32 | | Purpose | Determines the actual reward (e.g., tokens) given to the miner | 33 | | Calculation | `miner_reward = (miner_score / total_network_score) * total_reward_pool` | 34 | | Characteristics | Directly affects the miner's earnings | 35 | 36 | ## Relationships 37 | 38 | ### Credibility → Score 39 | - Credibility acts as a multiplier for the raw score 40 | - Higher credibility significantly boosts the final score due to the exponential factor (2.5) 41 | 42 | ### Score → Incentive 43 | - The miner's score determines their share of the total reward pool 44 | - Higher scores lead to higher rewards, but it's relative to other miners' scores 45 | 46 | ### Credibility → Incentive 47 | - Credibility indirectly affects incentives by boosting scores 48 | - Miners with higher credibility can earn more rewards even with the same raw data value 49 | 50 | ### HuggingFace validation 51 | - Credibility change is not enabled at this moment, but you can take a look how it is going to be implemented here: [hugging_face_validation.md](/docs/hugging_face_validation.md) file. 52 | ## System Flow 53 | 54 | 1. **Credibility Evaluation**: 55 | - Miner's current credibility is assessed based on past performance. 56 | 57 | 2. **Raw Score Calculation**: 58 | - Data Value is determined based on content, source, and timeliness. 59 | - Raw Score is computed using the Data Value and other factors. 60 | 61 | 3. **Final Score Computation**: 62 | - Credibility is applied as a multiplier to the Raw Score. 63 | - Final Score = Raw Score * (Credibility ^ 2.5) 64 | 65 | 4. **Incentive/Reward Allocation**: 66 | - Miner's Final Score is compared to the Total Network Score. 67 | - Reward is proportionally allocated based on this comparison. 68 | 69 | 5. **Feedback Loop**: 70 | - The allocated reward indirectly motivates the miner to maintain and improve their Credibility for future evaluations. 71 | 72 | Note: Credibility has a direct influence on the Final Score, while the Incentive/Reward indirectly influences future Credibility through miner behavior. 73 | 74 | ## Key Parameters 75 | 76 | - Starting Credibility: 0 77 | - Credibility Exponent: 2.5 78 | - Credibility Alpha (α): 0.15 79 | - Max Data Entity Bucket Size: 128 MB 80 | - Max Data Entity Bucket Count per Miner Index: 350,000 81 | - Data Age Limit: 30 days 82 | - Min Evaluation Period: 60 minutes 83 | 84 | ## Data Source Weights 85 | - Reddit: 55% (weight: 0.55) 86 | - X (Twitter): 35% (weight: 0.35) 87 | - Youtube: 10% (weight: 0.1) 88 | 89 | ## Desirable Data 90 | 91 | For the current list of desirable data sources and their jobs, run with the `--gravity` tag. 92 | 93 | ## Important Notes 94 | 95 | - Scores are relative to other miners in the network 96 | - Credibility builds over time, rewarding consistent good performance 97 | - Recent data is valued more highly than older data 98 | - The system adapts to changes in data desirability through configurable lookup tables 99 | - Negative scale factors can penalize undesirable data 100 | 101 | This scoring system is designed to be fair while also being resistant to gaming attempts, encouraging miners to consistently provide high-quality, relevant, and timely data to the Bittensor network. 
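To make the relationships above concrete, here is a small illustrative sketch of the scoring arithmetic described in this document. The constants and formulas are taken from the tables above; this is a simplified model for intuition, not the subnet's actual implementation:

```python
CREDIBILITY_ALPHA = 0.15     # α from the Key Parameters table
CREDIBILITY_EXPONENT = 2.5   # exponent applied to credibility


def update_credibility(old_credibility: float, validation_result: float) -> float:
    """EMA update: new_credibility = α * result + (1 - α) * old_credibility."""
    return CREDIBILITY_ALPHA * validation_result + (1 - CREDIBILITY_ALPHA) * old_credibility


def final_score(data_type_scale_factor: float, time_scalar: float,
                scorable_bytes: int, credibility: float) -> float:
    """Raw score scaled by credibility ** 2.5."""
    raw_score = data_type_scale_factor * time_scalar * scorable_bytes
    return raw_score * (credibility ** CREDIBILITY_EXPONENT)


def miner_reward(miner_score: float, total_network_score: float, total_reward_pool: float) -> float:
    """The miner's proportional share of the total reward pool."""
    return (miner_score / total_network_score) * total_reward_pool
```

For example, a miner with credibility 0.8 keeps `0.8 ** 2.5 ≈ 0.57` of its raw score, while a miner with credibility 0.4 keeps only about `0.4 ** 2.5 ≈ 0.10`, which is why consistent validation results matter far more than any single high-value submission.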
102 | -------------------------------------------------------------------------------- /docs/validator.md: -------------------------------------------------------------------------------- 1 | # Validator 2 | 3 | The Validator is responsible for validating the Miners and scoring them according to the [incentive mechanism](../README.md#incentive-mechanism). It runs a loop to enumerate all Miners in the network, and for each, it performs the following sequence: 4 | 1. It requests the latest [MinerIndex](../README.md#terminology) from the miner, which it stores in an in-memory database. 5 | 2. It chooses a random (sampled by size) DataEntityBucket from the MinerIndex to sample. 6 | 3. It gets that DataEntityBucket from the Miner. 7 | 4. It chooses N DataEntities from the DataEntityBucket to validate. It then scrapes the content from the appropriate DataSource to get those DataEntities. 8 | 5. It then compares those retrieved DataEntities against the ones provided by the Miner and updates the Miner Credibility, based on the result. 9 | 6. Finally, it updates the Miner's score. This is based on the total MinerIndex scaled by Freshness/Desirability/Duplication/Credibility. 10 | 11 | Once this sequence has been performed for all Miners, the Validator waits a period of time before starting the next loop to ensure it does not evaluate a Miner more often than once per N minutes. This helps ensure the cost of running a Validator is not too high, and also protects the network against high amounts of traffic. 12 | 13 | As of Jan 13th 2024, the expected number of DataItems queried via Apify is roughly: `225 Miners * 1 evals per hour * 2 sample per period * 24 hours = 10800`. Assuming this is ~50% Reddit (Free with Custom Scraper) and ~50% X ($1 per 1000), the total cost is roughly $5.40 per day. 14 | 15 | # System Requirements 16 | 17 | Validators require at least 32 GB of RAM but do not require a GPU. We recommend a decent CPU (4+ cores) and sufficient network bandwidth to handle protocol traffic. Python >= 3.10 is required. 18 | 19 | # Getting Started 20 | 21 | ## Prerequisites 22 | 1. As of Jan 13th 2024, we support Twitter and Reddit scraping via Apify, so you'll need to [set up your Apify API token](apify.md). 23 | We also support Reddit scraping via a [personal reddit account](reddit.md), which is completely free. 24 | Validators will default to using the personal reddit account for reliability, but this can be changed by editing the PREFERRED_SCRAPERS map in validator.py locally. 25 | We also support YouTube scraping via the [official YouTube API](youtube.md), which is completely free. 26 | 27 | 2. Clone the repo 28 | 29 | ```shell 30 | git clone https://github.com/RusticLuftig/data-universe.git 31 | ``` 32 | 33 | 3. Set up your Python [virtual environment](https://docs.python.org/3/library/venv.html) or [Conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands). 34 | 35 | 4. Install the requirements. From your virtual environment, run 36 | ```shell 37 | cd data-universe 38 | python -m pip install -e . 39 | ``` 40 | 41 | 5. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 42 | 43 | 6. (Optional) Set up a wandb account and log in so your validator can store logs beyond 7 days.
From your virtual environment, run 44 | ```shell 45 | wandb login 46 | ``` 47 | 48 | This will prompt you to navigate to https://wandb.ai/authorize and copy your API key back into the terminal. 49 | 50 | ## Running the Validator 51 | 52 | ### With auto-updates 53 | 54 | We highly recommend running the validator with auto-updates. This will help ensure your validator is always running the latest release, helping to maintain a high vtrust. 55 | 56 | Prerequisites: 57 | 1. To run with auto-update, you will need to have [pm2](https://pm2.keymetrics.io/) installed. 58 | 2. Make sure your virtual environment is activated. This is important because the auto-updater will automatically update the package dependencies with pip. 59 | 3. Make sure you're using the main branch: `git checkout main`. 60 | 61 | From the data-universe folder: 62 | ```shell 63 | pm2 start --name net13-vali-updater --interpreter python scripts/start_validator.py -- --pm2_name net13-vali --wallet.name cold_wallet --wallet.hotkey hotkey_wallet [other vali flags] 64 | ``` 65 | 66 | This will start a process called `net13-vali-updater`. This process periodically checks for a new git commit on the current branch. When one is found, it performs a `pip install` for the latest packages and restarts the validator process (whose name is given by the `--pm2_name` flag). 67 | 68 | 69 | ### Without auto-updates 70 | 71 | If you'd prefer to manage your own validator updates... 72 | 73 | From the data-universe folder: 74 | ```shell 75 | pm2 start python -- ./neurons/validator.py --wallet.name your-wallet --wallet.hotkey your-hotkey 76 | ``` 77 | 78 | # Configuring the Validator 79 | 80 | ## Flags 81 | 82 | The Validator offers several flags to customize its behavior. 83 | 84 | You can view the full set of flags by running 85 | ```shell 86 | python ./neurons/validator.py -h 87 | ``` 88 | 89 | # Coming Soon 90 | 91 | We are working hard to add more features to the Subnet. For the Validators, we have plans to: 92 | 93 | 1. Have the Validator serve an Axon on the network, so neurons on other Subnets can retrieve data. 94 | 2. Add scrapers for other DataSources. 95 | 3. Add other (and cheaper) scrapers for the Validators to use. -------------------------------------------------------------------------------- /docs/youtube.md: -------------------------------------------------------------------------------- 1 | # Step-by-Step Instructions 2 | 1. Create a Google Cloud Project 3 | - Visit [Google Cloud Console](https://console.cloud.google.com/). 4 | - Click on the project drop-down and select “New Project”. 5 | - Enter a name for your project (e.g., YouTubeScraper) and click “Create”. 6 | 7 | 2. Enable the YouTube Data API v3 8 | - Search for YouTube Data API v3 9 | - Click on it and then click “Enable”. 10 | 11 | 3. Generate an API Key 12 | - Go to the Credentials page. 13 | - Click “Create Credentials” > “API Key”. 14 | - A new API key will be generated. Copy and save it. 15 | 16 | 4. 
Set the API Key in Environment Variables 17 | - Add the following to your .env file in the root directory of your project: 18 | `YOUTUBE_API_KEY=your_actual_api_key_here` 19 | 20 | 21 | ### Working with proxies (Webshare) 22 | If you are running your validator with a popular cloud provider (AWS, Google Cloud Platform, Azure, DigitalOcean, etc.), 23 | 24 | you will need to add a Webshare proxy, since requests from cloud provider IP ranges are frequently blocked by YouTube. 25 | Once you have created a Webshare account and purchased a "Residential" proxy package that suits your workload (5 GB is enough for a validator; make sure NOT to purchase "Proxy Server" or "Static Residential"!), 26 | open the Webshare Proxy Settings to retrieve your "Proxy Username" and "Proxy Password". Using this information, you can initialize the validator as follows: 27 | `WEB_SHARE_PROXY_USERNAME=your_proxy_username` 28 | `WEB_SHARE_PROXY_PASSWORD=your_proxy_password` 29 | 30 | ### Working with any other proxies 31 | 32 | If you prefer not to use Webshare, any other HTTP proxy also works. Define the following variables instead: 33 | 34 | YTT_PROXY_HOST=127.0.0.1 35 | YTT_PROXY_PORT=7777 36 | YTT_PROXY_USERNAME=myusername 37 | YTT_PROXY_PASSWORD=mypassword 38 | -------------------------------------------------------------------------------- /dynamic_desirability/chain_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | from typing import Dict, Optional, Any 4 | import bittensor as bt 5 | import multiprocessing 6 | import argparse 7 | 8 | def add_args(parser: argparse.ArgumentParser, is_upload: bool): 9 | """Add arguments to the parser""" 10 | parser.add_argument('--wallet', type=str, required=True, help='Name of the wallet') 11 | parser.add_argument('--hotkey', type=str, required=True, help='Name of the hotkey') 12 | parser.add_argument('--network', type=str, required=True, help='Name of the subtensor network', default='finney') 13 | parser.add_argument('--netuid', type=int, required=True, help='UID of the subnet', default=13) 14 | 15 | if is_upload: 16 | parser.add_argument('--file_path', type=str, required=True, help='Path to the JSON file containing preferences') 17 | 18 | def _sync_retrieve_metadata(netuid: int, hotkey: str, network: str = "finney"): 19 | """Standalone function that can be pickled""" 20 | try: 21 | # Create a fresh subtensor instance for each call 22 | fresh_subtensor = bt.subtensor(network=network) 23 | 24 | metadata = bt.core.extrinsics.serving.get_metadata( 25 | fresh_subtensor, 26 | netuid, 27 | hotkey 28 | ) 29 | 30 | if not metadata: 31 | return None 32 | 33 | commitment = metadata["info"]["fields"][0] 34 | hex_data = commitment[list(commitment.keys())[0]][2:] 35 | return bytes.fromhex(hex_data).decode() 36 | except Exception as e: 37 | bt.logging.error(f"Error retrieving metadata for {hotkey}: {str(e)}") 38 | return None 39 | 40 | 41 | def _wrapped_func(func: functools.partial, queue: multiprocessing.Queue): 42 | try: 43 | result = func() 44 | queue.put(result) 45 | except Exception as e: 46 | queue.put(None) # Return None instead of raising on error 47 | 48 | 49 | def run_in_subprocess(func: functools.partial, ttl: int = 10) -> Any: 50 | """Runs with shorter timeout and better error handling""" 51 | ctx = multiprocessing.get_context('fork') 52 | queue = ctx.Queue() 53 | process = ctx.Process(target=_wrapped_func, args=[func, queue]) 54 | 55 | process.start() 56 | process.join(timeout=ttl) 57 | 58 | if process.is_alive(): 59 | process.terminate() 60 | process.join() 61 | return None # Return None on timeout instead of 
raising 62 | 63 | try: 64 | result = queue.get(block=False) 65 | return result 66 | except Exception: 67 | return None 68 | 69 | 70 | class ChainPreferenceStore: 71 | def __init__( 72 | self, 73 | subtensor: bt.subtensor, 74 | netuid: int, 75 | wallet: Optional[bt.wallet] = None, 76 | ): 77 | self.subtensor = subtensor 78 | self.wallet = wallet 79 | self.netuid = netuid 80 | 81 | async def store_preferences( 82 | self, 83 | data: str, 84 | wait_for_inclusion: bool = True, 85 | wait_for_finalization: bool = True, 86 | ): 87 | """Stores preferences on this subnet for a specific wallet.""" 88 | if self.wallet is None: 89 | raise ValueError("No wallet available to write to the chain.") 90 | if not data: 91 | raise ValueError("No data provided to store on the chain.") 92 | 93 | def sync_store(): 94 | return bt.core.extrinsics.serving.publish_metadata( 95 | self.subtensor, 96 | self.wallet, 97 | self.netuid, 98 | f"Raw{len(data)}", 99 | data.encode(), 100 | wait_for_inclusion, 101 | wait_for_finalization, 102 | ) 103 | 104 | partial = functools.partial(sync_store) 105 | bt.logging.info("Writing to chain...") 106 | return run_in_subprocess(partial, 60) 107 | 108 | async def retrieve_preferences(self, hotkey: str) -> Optional[str]: 109 | """Single retrieval with shorter timeout""" 110 | partial = functools.partial(_sync_retrieve_metadata, self.netuid, hotkey) 111 | return run_in_subprocess(partial, ttl=10) # Shorter timeout per validator 112 | 113 | async def batch_retrieve_preferences(self, hotkeys: list[str], chunk_size: int = 5) -> Dict[str, Optional[str]]: 114 | """Retrieve preferences for multiple validators in chunks""" 115 | results = {} 116 | 117 | # Process in chunks to avoid overwhelming the system 118 | for i in range(0, len(hotkeys), chunk_size): 119 | chunk = hotkeys[i:i + chunk_size] 120 | chunk_tasks = [] 121 | 122 | # Create tasks for each hotkey in the chunk 123 | for hotkey in chunk: 124 | task = asyncio.create_task(self.retrieve_preferences(hotkey)) 125 | chunk_tasks.append((hotkey, task)) 126 | 127 | # Wait for all tasks in chunk to complete 128 | for hotkey, task in chunk_tasks: 129 | try: 130 | result = await task 131 | results[hotkey] = result 132 | except Exception as e: 133 | bt.logging.error(f"Error processing {hotkey}: {str(e)}") 134 | results[hotkey] = None 135 | 136 | # Small delay between chunks 137 | await asyncio.sleep(0.1) 138 | 139 | return results 140 | 141 | 142 | if __name__ == "__main__": 143 | # Example usage 144 | parser = argparse.ArgumentParser() 145 | add_args(parser, is_upload=False) 146 | args = parser.parse_args() 147 | 148 | subtensor = bt.subtensor(network=args.network) 149 | wallet = bt.wallet(name=args.wallet, hotkey=args.hotkey) 150 | 151 | store = ChainPreferenceStore(subtensor, args.netuid, wallet) 152 | 153 | async def test(): 154 | result = await store.retrieve_preferences(args.hotkey) 155 | print(f"Retrieved preferences: {result}") 156 | 157 | asyncio.run(test()) -------------------------------------------------------------------------------- /dynamic_desirability/constants.py: -------------------------------------------------------------------------------- 1 | # The link to the github repo where preferences JSONs are uploaded. 2 | REPO_URL: str = 'https://github.com/macrocosm-os/gravity.git' 3 | BRANCH_NAME: str = 'main' 4 | PREFERENCES_FOLDER: str = 'validator_preferences' 5 | 6 | # Total weight of all validators. Subnet (default) voting weight = 1-TOTAL_VALI_WEIGHT. 
7 | TOTAL_VALI_WEIGHT: float = 0.7 8 | DEFAULT_SCALE_FACTOR: float = 0.3 # number is subject to change 9 | AMPLICATION_FACTOR: float = 250 / TOTAL_VALI_WEIGHT * (1 - TOTAL_VALI_WEIGHT) 10 | 11 | # Paths of subnet preferences (default) and overall subnet + validator preferences. 12 | DEFAULT_JSON_PATH: str = 'default.json' 13 | AGGREGATE_JSON_PATH: str = 'total.json' 14 | 15 | VALID_SOURCES: dict[str, str] = { 16 | "reddit": "r/", 17 | "x": "#", 18 | "youtube": "", 19 | } -------------------------------------------------------------------------------- /huggingface_utils/encoding_system.py: -------------------------------------------------------------------------------- 1 | """Module for URL encoding and decoding using Fernet encryption.""" 2 | 3 | import base64 4 | import json 5 | import os 6 | import time 7 | from typing import Tuple, Optional 8 | 9 | import pandas as pd 10 | from cryptography.fernet import Fernet 11 | 12 | class EncodingKeyManager: 13 | """Manages the encryption key for URL encoding and decoding.""" 14 | 15 | def __init__(self, key_path: str = 'encoding_key.json'): 16 | """Initialize the EncodingKeyManager with a key file path.""" 17 | self.key_path = key_path 18 | self.sym_key = self._load_or_generate_key() 19 | self.fernet = Fernet(self.sym_key) 20 | 21 | def _load_or_generate_key(self) -> bytes: 22 | """Load an existing key or generate a new one if it doesn't exist.""" 23 | if os.path.exists(self.key_path): 24 | with open(self.key_path, 'r', encoding='utf-8') as f: 25 | key_data = json.load(f) 26 | return key_data['sym_key'].encode() 27 | else: 28 | sym_key = Fernet.generate_key() 29 | self._save_key(sym_key) 30 | return sym_key 31 | 32 | def _save_key(self, sym_key: bytes) -> None: 33 | """Save the symmetric key to a JSON file.""" 34 | key_data = { 35 | 'sym_key': sym_key.decode() 36 | } 37 | with open(self.key_path, 'w', encoding='utf-8') as f: 38 | json.dump(key_data, f) 39 | 40 | def get_fernet(self) -> Fernet: 41 | """Get the Fernet instance for encryption/decryption.""" 42 | return self.fernet 43 | 44 | 45 | class SymKeyEncodingKeyManager(EncodingKeyManager): 46 | """A subclass of EncodingKeyManager that uses a symmetric key directly.""" 47 | 48 | def __init__(self, sym_key: str): 49 | """ 50 | Initialize the SymKeyEncodingKeyManager with a symmetric key. 51 | 52 | Args: 53 | sym_key (str): A base64-encoded symmetric key string. 
54 | """ 55 | self.sym_key = self._validate_and_encode_key(sym_key) 56 | self.fernet = Fernet(self.sym_key) 57 | 58 | def _validate_and_encode_key(self, sym_key: str) -> bytes: 59 | """Validate the provided key and return it as bytes.""" 60 | try: 61 | # Attempt to create a Fernet instance to validate the key 62 | Fernet(sym_key.encode()) 63 | return sym_key.encode() 64 | except Exception as e: 65 | raise ValueError(f"Invalid symmetric key provided: {str(e)}") 66 | 67 | def _load_or_generate_key(self) -> bytes: 68 | """Override to return the provided symmetric key.""" 69 | return self.sym_key 70 | 71 | def _save_key(self, sym_key: bytes) -> None: 72 | """Override to do nothing, as we don't want to save the key to a file.""" 73 | pass 74 | 75 | 76 | def encode_url(url: str, fernet: Fernet) -> Optional[str]: 77 | """Encode a URL using Fernet encryption.""" 78 | try: 79 | encoded = fernet.encrypt(url.encode()) 80 | return base64.urlsafe_b64encode(encoded).decode() 81 | except Exception as e: 82 | print(f"Encryption failed for URL: {url}") 83 | print(f"Error: {str(e)}") 84 | return None 85 | 86 | 87 | def decode_url(encoded_url: str, fernet: Fernet) -> Optional[str]: 88 | """Decode an encoded URL using Fernet decryption.""" 89 | try: 90 | decoded = fernet.decrypt(base64.urlsafe_b64decode(encoded_url.encode())) 91 | return decoded.decode() 92 | except Exception as e: 93 | print(f"Decryption failed for encoded URL: {encoded_url}") 94 | print(f"Error: {str(e)}") 95 | return None 96 | 97 | 98 | def encode_dataframe_column(df: pd.DataFrame, column_name: str, key_manager: EncodingKeyManager) -> pd.DataFrame: 99 | """Encode a column of URLs in a DataFrame.""" 100 | fernet = key_manager.get_fernet() 101 | df[f'{column_name}_encoded'] = df[column_name].apply(lambda url: encode_url(url, fernet)) 102 | return df 103 | 104 | 105 | def decode_dataframe_column(df: pd.DataFrame, column_name: str, key_manager: EncodingKeyManager) -> pd.DataFrame: 106 | """Decode a column of encoded URLs in a DataFrame.""" 107 | fernet = key_manager.get_fernet() 108 | original_column_name = column_name.replace('_encoded', '') 109 | df[original_column_name] = df[column_name].apply(lambda url: decode_url(url, fernet)) 110 | return df 111 | 112 | 113 | def main(): 114 | """Main function to demonstrate URL encoding and decoding.""" 115 | # Initialize EncodingKeyManager 116 | key_manager = EncodingKeyManager() 117 | 118 | # Create a larger sample DataFrame (1 million rows) 119 | n_rows = 1_000_000 120 | urls = [ 121 | 'https://example.com/short_url', 122 | 'https://example.com/medium_length_url_with_some_parameters?param1=value1¶m2=value2', 123 | 'https://example.com/very_long_url_with_many_parameters_and_some_special_characters?param1=value1¶m2=value2¶m3=value3¶m4=value4&special=!@#$%^&*()' 124 | ] 125 | df = pd.DataFrame({ 126 | 'url': urls * (n_rows // len(urls) + 1) 127 | }).head(n_rows) 128 | 129 | # Measure encoding time 130 | start_time = time.time() 131 | df_encoded = encode_dataframe_column(df, 'url', key_manager) 132 | encode_time = time.time() - start_time 133 | print(f"Encoding time for {n_rows} rows: {encode_time:.2f} seconds") 134 | 135 | # Measure decoding time 136 | start_time = time.time() 137 | df_decoded = decode_dataframe_column(df_encoded, 'url_encoded', key_manager) 138 | decode_time = time.time() - start_time 139 | print(f"Decoding time for {n_rows} rows: {decode_time:.2f} seconds") 140 | 141 | # Verify that the decoded URLs match the original 142 | print("\nVerification:") 143 | 
print(df['url'].equals(df_decoded['url'])) 144 | 145 | # Calculate and print rows processed per second 146 | print(f"\nEncoding speed: {n_rows / encode_time:.2f} rows/second") 147 | print(f"Decoding speed: {n_rows / decode_time:.2f} rows/second") 148 | 149 | 150 | if __name__ == "__main__": 151 | main() -------------------------------------------------------------------------------- /huggingface_utils/s3_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import requests 4 | import bittensor as bt 5 | from typing import Dict, Any, Optional 6 | 7 | 8 | class S3Auth: 9 | """Handles S3 authentication with blockchain commitments and Keypair signatures""" 10 | 11 | def __init__(self, s3_auth_url: str): 12 | self.s3_auth_url = s3_auth_url 13 | 14 | def get_credentials(self, 15 | wallet: bt.wallet, 16 | source_name: str, 17 | subtensor: bt.subtensor) -> Optional[Dict[str, Any]]: 18 | """Get S3 credentials using blockchain commitments and hotkey signature""" 19 | try: 20 | coldkey = wallet.get_coldkeypub().ss58_address 21 | hotkey = wallet.hotkey.ss58_address 22 | timestamp = int(time.time()) 23 | 24 | commitment = f"s3:access:{coldkey}:{source_name}:{timestamp}" 25 | # bt.logging.info(f"\ud83d\ude80 Committing to blockchain: {commitment}") todo add if it's going to be necessary 26 | # success = subtensor.commit(wallet=wallet, netuid=netuid, data=commitment) 27 | 28 | # Sign the commitment 29 | signature = wallet.hotkey.sign(commitment.encode()) 30 | signature_hex = signature.hex() 31 | 32 | payload = { 33 | "coldkey": coldkey, 34 | "hotkey": hotkey, 35 | "source": source_name, 36 | "timestamp": timestamp, 37 | "signature": signature_hex 38 | } 39 | 40 | response = requests.post( 41 | f"{self.s3_auth_url.rstrip('/')}/get-folder-access", 42 | json=payload, 43 | timeout=30 44 | ) 45 | 46 | if response.status_code != 200: 47 | try: 48 | error_detail = response.json().get("detail", "Unknown error") 49 | except Exception: 50 | error_detail = response.text or "Unknown error" 51 | bt.logging.error(f"\u274c Failed to get S3 credentials: {error_detail}") 52 | return None 53 | 54 | return response.json() 55 | 56 | except Exception as e: 57 | bt.logging.error(f"\u274c Error getting S3 credentials: {str(e)}") 58 | return None 59 | 60 | def upload_file(self, file_path: str, creds: Dict[str, Any]) -> bool: 61 | try: 62 | key = f"{creds['folder']}{os.path.basename(file_path)}" 63 | post_data = dict(creds['fields']) # clone all fields (V4-compatible) 64 | post_data['key'] = key # overwrite key with actual file key 65 | 66 | with open(file_path, 'rb') as f: 67 | files = {'file': f} 68 | response = requests.post(creds['url'], data=post_data, files=files) 69 | 70 | if response.status_code == 204: 71 | bt.logging.info(f"✅ Upload success: {key}") 72 | return True 73 | else: 74 | bt.logging.error(f"❌ Upload failed: {response.status_code} — {response.text}") 75 | return False 76 | 77 | except Exception as e: 78 | bt.logging.error(f"❌ S3 Upload Exception for {file_path}: {e}") 79 | return False 80 | -------------------------------------------------------------------------------- /neurons/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.3.8" 2 | version_split = __version__.split(".") 3 | __spec_version__ = ( 4 | (1000 * int(version_split[0])) 5 | + (10 * int(version_split[1])) 6 | + (1 * int(version_split[2])) 7 | ) 8 | 
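For reference, here is a minimal, hypothetical sketch of how a miner-side script might call the S3Auth helper defined in huggingface_utils/s3_utils.py above; the auth URL, wallet names, and file path are placeholders, not values shipped with the repository:

```python
# Hypothetical usage of S3Auth (placeholders only; not part of the repository).
import bittensor as bt
from huggingface_utils.s3_utils import S3Auth

wallet = bt.wallet(name="my_coldkey", hotkey="my_hotkey")    # your own wallet names
subtensor = bt.subtensor(network="finney")
s3_auth = S3Auth(s3_auth_url="https://s3-auth.example.com")  # placeholder endpoint

# Request temporary, folder-scoped upload credentials signed with the hotkey.
creds = s3_auth.get_credentials(wallet=wallet, source_name="reddit", subtensor=subtensor)
if creds is not None:
    # Upload a local file into the folder returned by the auth service.
    s3_auth.upload_file("data/reddit_batch.parquet", creds)
```

The helper signs a commitment string with the hotkey, so the auth service can verify who is uploading without any key material other than the signature ever leaving the machine.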
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apify-client==1.6.1 2 | asyncpraw==7.8.0 3 | bittensor==9.7.0 4 | jupyter==1.0.0 5 | numpy==2.0.1 6 | pydantic==2.10.1 7 | python-dotenv==1.0.0 8 | pytz==2023.3.post1 9 | rich==13.7.0 10 | torch==2.5.1 11 | wandb==0.18.7 12 | pandas~=2.2.2 13 | cryptography==43.0.3 14 | requests==2.32.3 15 | huggingface-hub==0.27.1 16 | datasets~=2.20.0 17 | pyarrow==17.0.0 18 | fsspec==2024.5.0 19 | psutil==5.9.8 20 | loguru==0.7.3 21 | google-api-python-client==2.167.0 22 | youtube-transcript-api==1.0.3 23 | isodate==0.7.2 24 | -------------------------------------------------------------------------------- /rewards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/rewards/__init__.py -------------------------------------------------------------------------------- /rewards/data_desirability_lookup.py: -------------------------------------------------------------------------------- 1 | from common import constants 2 | from common.data import DataLabel, DataSource 3 | from rewards.data import DataSourceDesirability, DataDesirabilityLookup 4 | 5 | ################################################################# 6 | 7 | # This list is outdated and is only used as a backup to Dynamic Desirability. 8 | # Please see the folder dynamic_desirability for more information on how reward 9 | # scale factors are constructed. 10 | 11 | ################################################################# 12 | 13 | LOOKUP = DataDesirabilityLookup( 14 | distribution={ 15 | DataSource.REDDIT: DataSourceDesirability( 16 | weight=0.6, 17 | default_scale_factor=0.5, 18 | label_scale_factors={ 19 | DataLabel(value="r/Bitcoin"): 1.0, 20 | DataLabel(value="r/BitcoinCash"): 1.0, 21 | DataLabel(value="r/Bittensor_"): 1.0, 22 | DataLabel(value="r/Btc"): 1.0, 23 | DataLabel(value="r/Cryptocurrency"): 1.0, 24 | DataLabel(value="r/Cryptomarkets"): 1.0, 25 | DataLabel(value="r/EthereumClassic"): 1.0, 26 | DataLabel(value="r/Ethtrader"): 1.0, 27 | DataLabel(value="r/Filecoin"): 1.0, 28 | DataLabel(value="r/Monero"): 1.0, 29 | DataLabel(value="r/Polkadot"): 1.0, 30 | DataLabel(value="r/Solana"): 1.0, 31 | DataLabel(value="r/WallstreetBets"): 1.0, 32 | }, 33 | ), 34 | DataSource.X: DataSourceDesirability( 35 | weight=0.4, 36 | default_scale_factor=0.5, 37 | label_scale_factors={ 38 | DataLabel(value="#bitcoin"): 1.0, 39 | DataLabel(value="#bitcoincharts"): 1.0, 40 | DataLabel(value="#bitcoiner"): 1.0, 41 | DataLabel(value="#bitcoinexchange"): 1.0, 42 | DataLabel(value="#bitcoinmining"): 1.0, 43 | DataLabel(value="#bitcoinnews"): 1.0, 44 | DataLabel(value="#bitcoinprice"): 1.0, 45 | DataLabel(value="#bitcointechnology"): 1.0, 46 | DataLabel(value="#bitcointrading"): 1.0, 47 | DataLabel(value="#bittensor"): 1.0, 48 | DataLabel(value="#btc"): 1.0, 49 | DataLabel(value="#cryptocurrency"): 1.0, 50 | DataLabel(value="#crypto"): 1.0, 51 | DataLabel(value="#defi"): 1.0, 52 | DataLabel(value="#decentralizedfinance"): 1.0, 53 | DataLabel(value="#tao"): 1.0, 54 | }, 55 | ), 56 | }, 57 | max_age_in_hours=constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS * 24, 58 | ) 59 | -------------------------------------------------------------------------------- /rewards/data_value_calculator.py: 
-------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from typing import Optional, List, Dict, Tuple 3 | from common.data import DataSource, TimeBucket, DateRange 4 | from common.data_v2 import ScorableDataEntityBucket 5 | from rewards.data import DataDesirabilityLookup 6 | from scraping.scraper import HFValidationResult 7 | from rewards import data_desirability_lookup 8 | from common import utils 9 | 10 | class DataValueCalculator: 11 | """Calculates how rewards are distributed across DataSources and DataLabels.""" 12 | 13 | def __init__(self, model: DataDesirabilityLookup = data_desirability_lookup.LOOKUP): 14 | # Convert to primitive version for performance optimization 15 | self.model = model.to_primitive_data_desirability_lookup() 16 | 17 | 18 | def get_score_for_data_entity_bucket( 19 | self, 20 | scorable_data_entity_bucket: ScorableDataEntityBucket, 21 | current_time_bucket: TimeBucket 22 | ) -> float: 23 | """Returns the score for the given data entity bucket.""" 24 | # Extract frequently used values 25 | time_bucket_id = scorable_data_entity_bucket.time_bucket_id 26 | label = scorable_data_entity_bucket.label 27 | source = scorable_data_entity_bucket.source 28 | 29 | # Calculate time scalar 30 | time_scalar = self._scale_factor_for_age(time_bucket_id, current_time_bucket.id) 31 | if time_scalar == 0.0: 32 | return 0.0 # No need to do further processing 33 | 34 | # Find matching jobs directly using time bucket ID 35 | # Currently only finds matching jobs where keyword is None. 36 | matching_jobs = self.model.find_matching_jobs(source, None, label, time_bucket_id) 37 | 38 | # Rest of method remains the same... 39 | 40 | data_source_weight = self.model.get_data_source_weight(scorable_data_entity_bucket.source) 41 | 42 | if matching_jobs: 43 | # Calculate score based on matching jobs 44 | total_score = 0.0 45 | for job in matching_jobs: 46 | # Get job weight 47 | job_weight = job["job_weight"] 48 | 49 | # Calculate time scalar 50 | if job["start_timebucket"] or job["end_timebucket"]: 51 | # For jobs with date constraints, if we've reached here, the time bucket 52 | # overlaps with the job's date range, so use full time scalar of 1.0 53 | time_scalar = 1.0 54 | else: 55 | # For jobs without date constraints, use linear depreciation 56 | time_scalar = self._scale_factor_for_age( 57 | scorable_data_entity_bucket.time_bucket_id, 58 | current_time_bucket.id 59 | ) 60 | 61 | # Add this job's contribution to total score 62 | contribution = data_source_weight * job_weight * time_scalar * scorable_data_entity_bucket.scorable_bytes 63 | total_score += contribution 64 | 65 | return total_score 66 | else: 67 | # No matching jobs - use default scale factor 68 | default_scale_factor = self.model.get_default_scale_factor(scorable_data_entity_bucket.source) 69 | time_scalar = self._scale_factor_for_age( 70 | scorable_data_entity_bucket.time_bucket_id, 71 | current_time_bucket.id 72 | ) 73 | 74 | return ( 75 | data_source_weight 76 | * default_scale_factor 77 | * time_scalar 78 | * scorable_data_entity_bucket.scorable_bytes 79 | ) 80 | 81 | 82 | def _scale_factor_for_age( 83 | self, time_bucket_id: int, current_time_bucket_id: int 84 | ) -> float: 85 | """Returns the score scalar for data age. 
86 | 87 | Uses a linear depreciation function: 88 | - Current data is scored 1.0 89 | - Data at max_age_in_hours is scored 0.5 90 | - Older data is scored 0.0 91 | """ 92 | # Data age is scored using a linear depreciation function, where data from now is scored 1 and data 93 | # that is max_age_in_hours old is scored 0.5. 94 | # All data older than max_age_in_hours is scored 0. 95 | 96 | # Note: This makes the assumption that TimeBuckets are 1 hour buckets, which isn't ideal, 97 | # but we make the trade-off because it has a notable impact on perf vs. constructing TimeBuckets 98 | # to compute the age in hours. 99 | data_age_in_hours = current_time_bucket_id - time_bucket_id 100 | 101 | # Safe guard against future data. 102 | data_age_in_hours = max(0, data_age_in_hours) 103 | 104 | if data_age_in_hours > self.model.max_age_in_hours: 105 | return 0.0 106 | return 1.0 - (data_age_in_hours / (2 * self.model.max_age_in_hours)) 107 | 108 | -------------------------------------------------------------------------------- /scraping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/__init__.py -------------------------------------------------------------------------------- /scraping/apify.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional 3 | from apify_client import ApifyClientAsync 4 | from pydantic import BaseModel, Field, PositiveInt 5 | import bittensor as bt 6 | 7 | from dotenv import load_dotenv 8 | 9 | from common.data import StrictBaseModel 10 | 11 | load_dotenv() 12 | 13 | 14 | class RunConfig(StrictBaseModel): 15 | """Configuration parameters for a single Apify Actor run.""" 16 | 17 | api_key: str = Field( 18 | description="The Apify API token.", 19 | default=os.getenv("APIFY_API_TOKEN"), 20 | min_length=1, # Can't be empty. 21 | ) 22 | 23 | actor_id: str = Field( 24 | description="The ID of the actor to run.", 25 | min_length=1, # Can't be empty. 26 | ) 27 | 28 | timeout_secs: PositiveInt = Field( 29 | description="The timeout for the actor run.", 30 | default=180, 31 | ) 32 | 33 | max_data_entities: PositiveInt = Field( 34 | description="The maximum number of items to be returned by the actor. The client will not be charged for more items than this value.", 35 | default=100, 36 | ) 37 | 38 | debug_info: str = Field( 39 | description="Optional debug info to include in logs relating to this run." 40 | ) 41 | 42 | memory_mb: Optional[int] = Field( 43 | description="The amount of memory in mb to use for this run.", default=None 44 | ) 45 | 46 | 47 | class ActorRunError(Exception): 48 | """Exception raised when an actor run fails.""" 49 | 50 | def __init__(self, message: str): 51 | self.message = message 52 | super().__init__(self.message) 53 | 54 | 55 | class ActorRunner: 56 | def __init__(self): 57 | pass 58 | 59 | async def run(self, config: RunConfig, run_input: dict) -> List[dict]: 60 | """ 61 | Run an Apify actor and return the json results. 62 | 63 | Args: 64 | config (ActorConfig): The configuration to use for running the actor. 65 | run_input (dict): The input parameters for the actor run. 66 | 67 | Raises: 68 | ActorRunError: If the actor run fails, raises an exception, with the run details in the exception message. 69 | 70 | Returns: 71 | list[dict]: List of items fetched from the dataset. 
72 | """ 73 | 74 | client = ApifyClientAsync(config.api_key) 75 | 76 | run = await client.actor(config.actor_id).call( 77 | run_input=run_input, 78 | max_items=config.max_data_entities, 79 | timeout_secs=config.timeout_secs, 80 | # If not set, the client will wait indefinitely for the run to finish. Ensure we don't wait forever. 81 | wait_secs=config.timeout_secs + 5, 82 | memory_mbytes=config.memory_mb, 83 | ) 84 | 85 | # We want a success status. Timeout is also okay because it will return partial results. 86 | if "status" not in run or not ( 87 | run["status"].casefold() == "SUCCEEDED".casefold() 88 | or run["status"].casefold() == "TIMED-OUT".casefold() 89 | ): 90 | raise ActorRunError( 91 | f"Actor ({config.actor_id}) [{config.debug_info}] failed: {run}" 92 | ) 93 | iterator = client.dataset(run["defaultDatasetId"]).iterate_items() 94 | items = [i async for i in iterator] 95 | 96 | return items 97 | -------------------------------------------------------------------------------- /scraping/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/config/__init__.py -------------------------------------------------------------------------------- /scraping/config/config_reader.py: -------------------------------------------------------------------------------- 1 | from scraping.config import model 2 | from scraping import coordinator 3 | 4 | class ConfigReader: 5 | """A class to read the scraping config from a json file.""" 6 | 7 | @classmethod 8 | def load_config(cls, filepath: str) -> coordinator.CoordinatorConfig: 9 | """Loads the scraping config from json and returns it as a CoordinatorConfig. 10 | 11 | Raises: 12 | ValidationError: if the file content is not valid. 13 | """ 14 | 15 | print(f"Loading file: {filepath}") 16 | parsed_file = model.ScrapingConfig.parse_file(path=filepath) 17 | print(f"Got parsed file: {parsed_file}") 18 | return parsed_file.to_coordinator_config() -------------------------------------------------------------------------------- /scraping/config/model.py: -------------------------------------------------------------------------------- 1 | """This file contains the pydantic classes for the scraping config JSON file. 2 | 3 | We use JSON for the configuring the scraping distribution config to make it easier 4 | for miner's to customize their miners, while still being able to take advantage of 5 | auto-updates, in future. 6 | 7 | The classes here are ~identical to their sibling classes in scraping/scraper.py, except 8 | they contain natively serializable/deseriazable fields. All code should use the classes 9 | in scraping/scraper.py. These classes are only intended to be used for deserializing 10 | the scraping_config JSON file. 11 | """ 12 | 13 | from typing import List, Optional 14 | from pydantic import BaseModel, Field, PositiveInt, ConfigDict 15 | from common import constants 16 | from common.data import DataLabel, StrictBaseModel 17 | from scraping import coordinator 18 | from scraping.scraper import ScraperId 19 | 20 | 21 | class LabelScrapingConfig(StrictBaseModel): 22 | """Describes what labels to scrape.""" 23 | 24 | model_config = ConfigDict() 25 | 26 | label_choices: Optional[List[str]] = Field( 27 | description="""The collection of labels to choose from when performing a scrape. 28 | On a given scrape, 1 label will be chosen at random from this list. 
29 | 30 | An empty list is treated as a non-existant label. In that case, no filter is applied when scraping data from this source. 31 | """ 32 | ) 33 | 34 | max_age_hint_minutes: int = Field( 35 | description="""The maximum age of data that this scrape should fetch. A random TimeBucket (currently hour block), 36 | will be chosen within the time frame (now - max_age_hint_minutes, now), using a probality distribution aligned 37 | with how validators score data freshness. 38 | 39 | Note: not all data sources provide date filters, so this property should be thought of as a hint to the scraper, not a rule. 40 | """, 41 | default=60 * 24 * constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS, 42 | ) 43 | 44 | max_data_entities: Optional[PositiveInt] = Field( 45 | default=None, 46 | description="The maximum number of items to fetch in a single scrape for this label. If None, the scraper will fetch as many items possible.", 47 | ) 48 | 49 | def to_coordinator_label_scrape_config(self) -> coordinator.LabelScrapingConfig: 50 | """Returns the internal LabelScrapingConfig representation""" 51 | labels = ( 52 | [DataLabel(value=val) for val in self.label_choices] 53 | if self.label_choices 54 | else None 55 | ) 56 | return coordinator.LabelScrapingConfig( 57 | label_choices=labels, 58 | max_age_hint_minutes=self.max_age_hint_minutes, 59 | max_data_entities=self.max_data_entities, 60 | ) 61 | 62 | 63 | class ScraperConfig(StrictBaseModel): 64 | """Configures a specific scraper.""" 65 | 66 | model_config = ConfigDict() 67 | 68 | scraper_id: ScraperId = Field(description="The scraper being configured.") 69 | 70 | cadence_seconds: PositiveInt = Field( 71 | description="""Configures how often to scrape from this data source, measured in seconds.""" 72 | ) 73 | 74 | labels_to_scrape: List[LabelScrapingConfig] = Field( 75 | description="""Describes the type of data to scrape from this source. 76 | 77 | The scraper will perform one scrape per entry in this list every 'cadence_seconds'. 78 | """ 79 | ) 80 | 81 | def to_coordinator_scraper_config(self) -> coordinator.ScraperConfig: 82 | """Returns the internal ScraperConfig representation""" 83 | return coordinator.ScraperConfig( 84 | cadence_seconds=self.cadence_seconds, 85 | labels_to_scrape=[ 86 | label.to_coordinator_label_scrape_config() 87 | for label in self.labels_to_scrape 88 | ], 89 | ) 90 | 91 | 92 | class ScrapingConfig(StrictBaseModel): 93 | """Configuration for all scrapers.""" 94 | 95 | model_config = ConfigDict() 96 | 97 | scraper_configs: List[ScraperConfig] = Field( 98 | description="The list of scrapers (and their scraping config) this miner should scrape from. Only scrapers in this list will be used." 
99 | ) 100 | 101 | def to_coordinator_config(self) -> coordinator.CoordinatorConfig: 102 | """Returns the CoordinatorConfig.""" 103 | ids_and_configs = [ 104 | [config.scraper_id, config.to_coordinator_scraper_config()] 105 | for config in self.scraper_configs 106 | ] 107 | return coordinator.CoordinatorConfig( 108 | scraper_configs={id: config for id, config in ids_and_configs} 109 | ) -------------------------------------------------------------------------------- /scraping/config/scraping_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "scraper_configs": [ 3 | { 4 | "scraper_id": "X.apidojo", 5 | "cadence_seconds": 300, 6 | "labels_to_scrape": [ 7 | { 8 | "label_choices": [ 9 | "#bitcoin", 10 | "#bitcoincharts", 11 | "#bitcoiner", 12 | "#bitcoinexchange", 13 | "#bitcoinmining", 14 | "#bitcoinnews", 15 | "#bitcoinprice", 16 | "#bitcointechnology", 17 | "#bitcointrading", 18 | "#bittensor", 19 | "#btc", 20 | "#cryptocurrency", 21 | "#crypto", 22 | "#defi", 23 | "#decentralizedfinance", 24 | "#tao" 25 | ], 26 | "max_data_entities": 75 27 | } 28 | ] 29 | }, 30 | { 31 | "scraper_id": "Reddit.custom", 32 | "cadence_seconds": 60, 33 | "labels_to_scrape": [ 34 | { 35 | "label_choices": [ 36 | "r/bittensor_", 37 | "r/bitcoin", 38 | "r/BitcoinCash", 39 | "r/Bittensor_", 40 | "r/Btc", 41 | "r/Cryptocurrency", 42 | "r/Cryptomarkets", 43 | "r/EthereumClassic", 44 | "r/Ethtrader", 45 | "r/Filecoin", 46 | "r/Monero", 47 | "r/Polkadot", 48 | "r/Solana", 49 | "r/WallstreetBets" 50 | ], 51 | "max_data_entities": 100 52 | } 53 | ] 54 | }, 55 | { 56 | "scraper_id": "YouTube.transcript", 57 | "cadence_seconds": 100, 58 | "labels_to_scrape": [ 59 | { 60 | "label_choices": [ 61 | "#ytc_c_UCAuUUnT6oDeKwE6v1NGQxug", 62 | "#ytc_c_UCYO_jab_esuFRV4b17AJtAw", 63 | "#ytc_c_UCsXVk37bltHxD1rDPwtNM8Q", 64 | "#ytc_c_UCSHZKyawb77ixDdsGog4iWA", 65 | "#ytc_c_UCR93yACeNzxMSk6Y1cHM2pA", 66 | "#ytc_c_UCbLhGKVY-bJPcawebgtNfbw" 67 | ], 68 | "max_data_entities": 50, 69 | "max_age_hint_minutes": 43200 70 | } 71 | ] 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /scraping/provider.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from typing import Callable, Dict 3 | from common.data import DataSource 4 | from scraping.reddit.reddit_lite_scraper import RedditLiteScraper 5 | from scraping.reddit.reddit_custom_scraper import RedditCustomScraper 6 | from scraping.scraper import Scraper, ScraperId 7 | from scraping.x.microworlds_scraper import MicroworldsTwitterScraper 8 | from scraping.x.apidojo_scraper import ApiDojoTwitterScraper 9 | from scraping.x.quacker_url_scraper import QuackerUrlScraper 10 | from scraping.youtube.youtube_custom_scraper import YouTubeTranscriptScraper 11 | 12 | 13 | DEFAULT_FACTORIES = { 14 | ScraperId.REDDIT_LITE: RedditLiteScraper, 15 | # For backwards compatibility with old configs, remap x.flash to x.apidojo. 
16 | ScraperId.X_FLASH: MicroworldsTwitterScraper, 17 | ScraperId.REDDIT_CUSTOM: RedditCustomScraper, 18 | ScraperId.X_MICROWORLDS: MicroworldsTwitterScraper, 19 | ScraperId.X_APIDOJO: ApiDojoTwitterScraper, 20 | ScraperId.X_QUACKER: QuackerUrlScraper, 21 | ScraperId.YOUTUBE_TRANSCRIPT: YouTubeTranscriptScraper 22 | } 23 | 24 | 25 | class ScraperProvider: 26 | """A scraper provider will provide the correct scraper based on the source to be scraped.""" 27 | 28 | def __init__( 29 | self, factories: Dict[DataSource, Callable[[], Scraper]] = DEFAULT_FACTORIES 30 | ): 31 | self.factories = factories 32 | 33 | def get(self, scraper_id: ScraperId) -> Scraper: 34 | """Returns a scraper for the given scraper id.""" 35 | 36 | assert scraper_id in self.factories, f"Scraper id {scraper_id} not supported." 37 | 38 | return self.factories[scraper_id]() 39 | -------------------------------------------------------------------------------- /scraping/reddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/reddit/__init__.py -------------------------------------------------------------------------------- /scraping/reddit/model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from enum import Enum 3 | from typing import Optional 4 | 5 | # Use v1 for these models to keep serialization consistent. 6 | # Pydantic v2 doesn't include spaces in its serialization. 7 | from pydantic.v1 import BaseModel, Field 8 | 9 | 10 | from common import constants 11 | from common.data import DataEntity, DataLabel, DataSource 12 | from scraping import utils 13 | 14 | # The username used for deleted users. 15 | # This is the value returned by the Apify lite scraper. 16 | # Other scrapers may need to adapt their code to use this value. 17 | DELETED_USER = "[deleted]" 18 | 19 | 20 | class RedditDataType(str, Enum): 21 | POST = "post" 22 | COMMENT = "comment" 23 | 24 | 25 | class RedditContent(BaseModel): 26 | """The content model for Reddit data. 27 | 28 | Useful to standardize the representation of Reddit data, that could be scraped from different sources. 29 | """ 30 | 31 | class Config: 32 | extra = "forbid" 33 | 34 | id: str = Field(description="The unique ID of the post/comment") 35 | url: str = Field( 36 | description="URL of the post/comment", 37 | ) 38 | username: str 39 | community: str = Field( 40 | alias="communityName", description="The subreddit. Includes the 'r/' prefix" 41 | ) 42 | body: str = Field() 43 | created_at: dt.datetime = Field(alias="createdAt") 44 | data_type: RedditDataType = Field(alias="dataType") 45 | 46 | # Post-only fields. 47 | title: Optional[str] = Field( 48 | description="Title of the post. Empty for comments", default=None 49 | ) 50 | 51 | # Comment-only fields. 52 | parent_id: Optional[str] = Field( 53 | description="The ID of the parent comment. 
Only applicable to comments.", 54 | alias="parentId", 55 | default=None, 56 | ) 57 | 58 | @classmethod 59 | def to_data_entity(cls, content: "RedditContent") -> DataEntity: 60 | """Converts the RedditContent to a DataEntity.""" 61 | entity_created_at = content.created_at 62 | content.created_at = utils.obfuscate_datetime_to_minute(entity_created_at) 63 | content_bytes = content.json(by_alias=True).encode("utf-8") 64 | 65 | return DataEntity( 66 | uri=content.url, 67 | datetime=entity_created_at, 68 | source=DataSource.REDDIT, 69 | label=DataLabel( 70 | value=content.community.lower()[: constants.MAX_LABEL_LENGTH] 71 | ), 72 | content=content_bytes, 73 | content_size_bytes=len(content_bytes), 74 | ) 75 | 76 | @classmethod 77 | def from_data_entity(cls, data_entity: DataEntity) -> "RedditContent": 78 | """Converts a DataEntity to a RedditContent.""" 79 | 80 | return RedditContent.parse_raw(data_entity.content.decode("utf-8")) 81 | -------------------------------------------------------------------------------- /scraping/scraper.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from enum import Enum 3 | from typing import Dict, List, Optional 4 | from pydantic import BaseModel, Field, PositiveInt, ConfigDict 5 | 6 | from common.data import DataEntity, DataLabel, DataSource, StrictBaseModel 7 | from common.date_range import DateRange 8 | from storage.miner.miner_storage import MinerStorage 9 | 10 | 11 | class ScraperId(str, Enum): 12 | """The id for each of the scrapers.""" 13 | 14 | REDDIT_LITE = "Reddit.lite" 15 | X_FLASH = "X.flash" 16 | REDDIT_CUSTOM = "Reddit.custom" 17 | X_MICROWORLDS = "X.microworlds" 18 | X_APIDOJO = "X.apidojo" 19 | X_QUACKER = "X.quacker" 20 | YOUTUBE_TRANSCRIPT = "YouTube.transcript" 21 | 22 | 23 | class ValidationResult(StrictBaseModel): 24 | """Data class to contain the result of a scraping validation.""" 25 | 26 | model_config = ConfigDict(frozen=True) 27 | 28 | is_valid: bool 29 | content_size_bytes_validated: int = Field( 30 | description="The content size in bytes validated as part of this check", ge=0 31 | ) 32 | reason: str = Field( 33 | description="An optional reason for the validation result.", 34 | default="", 35 | ) 36 | 37 | 38 | class HFValidationResult(StrictBaseModel): 39 | """Data class to contain the result of a validation for a miner's Hugging Face dataset. """ 40 | 41 | class Config: 42 | frozen = True 43 | 44 | is_valid: bool 45 | 46 | validation_percentage: float = Field( 47 | description="The percentage of successfully validated HF rows. " 48 | ) 49 | 50 | reason: str = Field( 51 | description="An optional reason for the validation result. ", 52 | default="" 53 | ) 54 | 55 | 56 | class ScrapeConfig(StrictBaseModel): 57 | """Data class to contain the configuration to be used for scraping.""" 58 | 59 | model_config = ConfigDict(frozen=True) 60 | 61 | entity_limit: Optional[PositiveInt] 62 | date_range: DateRange 63 | labels: Optional[List[DataLabel]] = Field( 64 | default=None, 65 | description="Optional labels to filter the scrape by. 
If none are provided, the data source will issue a scrape for 'all' data, without any label filters applied", 66 | ) 67 | 68 | 69 | class LabelScrapingFrequency(StrictBaseModel): 70 | """Data class to contain the frequency distribution for a set of labels.""" 71 | 72 | model_config = ConfigDict(frozen=True) 73 | 74 | labels: List[DataLabel] 75 | frequency: float 76 | 77 | 78 | class SourceScrapingFrequency(StrictBaseModel): 79 | """Data class to contain the frequency distribution for a source across labels.""" 80 | 81 | model_config = ConfigDict(frozen=True) 82 | 83 | source: DataSource 84 | frequency: float 85 | label_frequencies: List[LabelScrapingFrequency] 86 | 87 | 88 | class ScrapingDistribution(StrictBaseModel): 89 | """A relative distribution across sources and labels.""" 90 | 91 | model_config = ConfigDict(frozen=True) 92 | 93 | distribution: List[SourceScrapingFrequency] 94 | 95 | 96 | class Scraper(abc.ABC): 97 | """An abstract base class for scrapers across all data sources.""" 98 | 99 | @abc.abstractmethod 100 | async def validate(self, entities: List[DataEntity]) -> List[ValidationResult]: 101 | """Validate the correctness of a list of DataEntities by URI.""" 102 | pass 103 | 104 | @abc.abstractmethod 105 | async def scrape(self, scrape_config: ScrapeConfig) -> List[DataEntity]: 106 | """Scrapes a batch of data based on the specified ScrapeConfig.""" 107 | pass 108 | 109 | @abc.abstractmethod 110 | async def validate_hf(self, entities) -> bool: 111 | """Validate the correctness of a list of HF retrieved data""" 112 | pass -------------------------------------------------------------------------------- /scraping/utils.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | 4 | def obfuscate_datetime_to_minute(datetime_to_obfuscate: dt.datetime) -> dt.datetime: 5 | """_summary_ 6 | 7 | Args: 8 | datetime_to_obfuscate (dt.datetime): Datetime to generate an obfuscated version of. 9 | 10 | Returns: 11 | dt.datetime: obfuscated datetime. 12 | """ 13 | return datetime_to_obfuscate.replace(second=0, microsecond=0) 14 | -------------------------------------------------------------------------------- /scraping/x/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/x/__init__.py -------------------------------------------------------------------------------- /scraping/x/model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import json 3 | from typing import Dict, List, Optional 4 | # Use v1 for these models to keep serialization consistent. 5 | # Pydantic v2 doesn't include spaces in its serialization. 6 | from pydantic.v1 import BaseModel, Field 7 | 8 | from common import constants 9 | from common.data import DataEntity, DataLabel, DataSource 10 | from scraping import utils 11 | 12 | 13 | class XContent(BaseModel): 14 | """The content model for tweets. 15 | 16 | The model helps standardize the data format for tweets, even if they're scraped using different methods. 17 | """ 18 | 19 | class Config: 20 | extra = "forbid" 21 | 22 | # model_config should NOT be set by Miners. 23 | # In the near future, Validators will penalized Miners who set this field. 
24 | model_config: Dict[str, str] = Field(default=None) 25 | 26 | username: str 27 | text: str 28 | url: str 29 | timestamp: dt.datetime 30 | tweet_hashtags: List[str] = Field( 31 | default_factory=list, 32 | description="A list of hashtags associated with the tweet, in order they appear in the tweet. Note: it's critical this ordering is respected as the first tag is used as the DataLabel for the index.", 33 | ) 34 | media: Optional[List[str]] = Field( 35 | default=None, 36 | description="A list of media URLs associated with the tweet. Can be None if no media is present.", 37 | ) 38 | 39 | # Enhanced fields 40 | user_id: Optional[str] = None 41 | user_display_name: Optional[str] = None 42 | user_verified: Optional[bool] = None 43 | 44 | # Non-dynamic tweet metadata 45 | tweet_id: Optional[str] = None 46 | is_reply: Optional[bool] = None 47 | is_quote: Optional[bool] = None 48 | 49 | # Additional metadata 50 | conversation_id: Optional[str] = None 51 | in_reply_to_user_id: Optional[str] = None 52 | 53 | @classmethod 54 | def to_data_entity(cls, content: "XContent") -> DataEntity: 55 | """Converts the XContent to a DataEntity.""" 56 | entity_timestamp = content.timestamp 57 | content.timestamp = utils.obfuscate_datetime_to_minute(entity_timestamp) 58 | content_bytes = content.json(exclude_none=True).encode("utf-8") 59 | 60 | return DataEntity( 61 | uri=content.url, 62 | datetime=entity_timestamp, 63 | source=DataSource.X, 64 | label=( 65 | DataLabel( 66 | value=content.tweet_hashtags[0].lower()[ 67 | : constants.MAX_LABEL_LENGTH 68 | ] 69 | ) 70 | if content.tweet_hashtags 71 | else None 72 | ), 73 | content=content_bytes, 74 | content_size_bytes=len(content_bytes), 75 | ) 76 | 77 | @classmethod 78 | def from_data_entity(cls, data_entity: DataEntity) -> "XContent": 79 | """Converts a DataEntity to an XContent.""" 80 | content_str = data_entity.content.decode("utf-8") 81 | return XContent.parse_raw(content_str) -------------------------------------------------------------------------------- /scraping/youtube/model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from typing import Dict, List, Optional 3 | from pydantic.v1 import BaseModel, Field 4 | from common.data import DataEntity, DataLabel, DataSource 5 | 6 | 7 | class YouTubeContent(BaseModel): 8 | """The content model for YouTube transcripts. 9 | 10 | This model standardizes how YouTube transcript data is stored, 11 | regardless of how it was scraped. 
12 | """ 13 | 14 | class Config: 15 | extra = "forbid" 16 | 17 | video_id: str = Field( 18 | description="The YouTube video ID (e.g., 'dQw4w9WgXcQ')" 19 | ) 20 | 21 | title: str = Field( 22 | description="The title of the YouTube video" 23 | ) 24 | 25 | channel_id: str = Field( 26 | description="The YouTube channel ID" 27 | ) 28 | 29 | channel_name: str = Field( 30 | description="The name of the YouTube channel" 31 | ) 32 | 33 | upload_date: dt.datetime = Field( 34 | description="The date the video was uploaded" 35 | ) 36 | 37 | transcript: List[Dict] = Field( 38 | description="The transcript of the video, as a list of dictionaries with 'text', 'start', and 'duration' keys", 39 | default_factory=list 40 | ) 41 | 42 | url: str = Field( 43 | description="The URL of the YouTube video" 44 | ) 45 | 46 | language: str = Field( 47 | description="The language of the transcript", 48 | default="en" 49 | ) 50 | 51 | duration_seconds: int = Field( 52 | description="The duration of the video in seconds", 53 | default=0 54 | ) 55 | 56 | @classmethod 57 | def to_data_entity(cls, content: "YouTubeContent", original_label: Optional[str] = None) -> DataEntity: 58 | """Converts the YouTubeContent to a DataEntity. 59 | 60 | Args: 61 | content: The YouTubeContent object to convert 62 | original_label: The original label type that was used for scraping (optional) 63 | 64 | Returns: 65 | A DataEntity with the appropriate label 66 | """ 67 | entity_timestamp = content.upload_date 68 | content_bytes = content.json(exclude_none=True).encode("utf-8") 69 | 70 | # Create a DataLabel - ALWAYS use NEW format for output, but check BOTH old and new for input 71 | if original_label and (original_label.startswith('#youtube_v_') or original_label.startswith('#ytc_v_')): 72 | # If scraped with a video label, use NEW video label format 73 | label = DataLabel(value=f"#ytc_v_{content.video_id}") 74 | else: 75 | # Default to NEW channel label format 76 | label = DataLabel(value=f"#ytc_c_{content.channel_id}") 77 | 78 | return DataEntity( 79 | uri=content.url, 80 | datetime=entity_timestamp, 81 | source=DataSource.YOUTUBE, 82 | label=label, 83 | content=content_bytes, 84 | content_size_bytes=len(content_bytes), 85 | ) 86 | 87 | @classmethod 88 | def from_data_entity(cls, data_entity: DataEntity) -> "YouTubeContent": 89 | """Converts a DataEntity to a YouTubeContent.""" 90 | content_str = data_entity.content.decode("utf-8") 91 | return YouTubeContent.parse_raw(content_str) -------------------------------------------------------------------------------- /scraping/youtube/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlparse, parse_qs 3 | 4 | 5 | def extract_video_id(url: str) -> str: 6 | """ 7 | Extracts the YouTube video ID from a YouTube URL. 8 | 9 | Args: 10 | url: The YouTube video URL. 11 | 12 | Returns: 13 | The YouTube video ID or an empty string if no ID could be extracted. 
14 | """ 15 | if not url: 16 | return "" 17 | 18 | # Standard YouTube URLs like https://www.youtube.com/watch?v=dQw4w9WgXcQ 19 | parsed_url = urlparse(url) 20 | if parsed_url.netloc in ('youtube.com', 'www.youtube.com'): 21 | query_params = parse_qs(parsed_url.query) 22 | if 'v' in query_params: 23 | return query_params['v'][0] 24 | 25 | # Short YouTube URLs like https://youtu.be/dQw4w9WgXcQ 26 | if parsed_url.netloc == 'youtu.be': 27 | return parsed_url.path.strip('/') 28 | 29 | # Embedded YouTube URLs like https://www.youtube.com/embed/dQw4w9WgXcQ 30 | if parsed_url.netloc in ('youtube.com', 'www.youtube.com') and '/embed/' in parsed_url.path: 31 | return parsed_url.path.split('/embed/')[1].split('/')[0].split('?')[0] 32 | 33 | # Try to find a video ID pattern in the URL 34 | video_id_pattern = r'(?:v=|v\/|embed\/|youtu\.be\/|\/v\/|\/e\/|watch\?v=|youtube.com\/v\/|youtube.com\/embed\/|youtu.be\/|v=|e=|u\/\w+\/|embed\?video_id=|\/videos\/|\/embed\/|\/v\/|watch\?.*v=|youtube.com\/embed\/)([\w-]{11})' 35 | match = re.search(video_id_pattern, url) 36 | if match: 37 | return match.group(1) 38 | 39 | return "" 40 | 41 | 42 | def normalize_youtube_url(url: str) -> str: 43 | """ 44 | Normalizes a YouTube URL to a standard form. 45 | 46 | Args: 47 | url: The YouTube URL to normalize. 48 | 49 | Returns: 50 | The normalized URL or the original if no normalization is possible. 51 | """ 52 | video_id = extract_video_id(url) 53 | if video_id: 54 | return f"https://www.youtube.com/watch?v={video_id}" 55 | return url 56 | 57 | 58 | def validate_youtube_content(actual_content, entity_to_validate, threshold=0.8): 59 | """ 60 | Validates a YouTube content entity against an actual content. 61 | 62 | Args: 63 | actual_content: The actual YouTube content from the API. 64 | entity_to_validate: The entity that needs validation. 65 | threshold: The similarity threshold for text comparison. 66 | 67 | Returns: 68 | A tuple (is_valid, reason) where is_valid is a boolean and reason is a string. 69 | """ 70 | # Check if the video IDs match 71 | if actual_content.video_id != entity_to_validate.video_id: 72 | return False, "Video IDs do not match" 73 | 74 | # Check if the upload dates are within a reasonable range 75 | # (YouTube may show slightly different timestamps depending on time zones) 76 | date_difference = abs((actual_content.upload_date - entity_to_validate.upload_date).total_seconds()) 77 | if date_difference > 86400: # More than 24 hours difference 78 | return False, "Upload dates do not match" 79 | 80 | # Check if the titles are similar enough 81 | if not texts_are_similar(actual_content.title, entity_to_validate.title, threshold): 82 | return False, "Titles do not match" 83 | 84 | # Check if the transcripts are similar enough 85 | if not transcripts_are_similar(actual_content.transcript, entity_to_validate.transcript, threshold): 86 | return False, "Transcripts do not match" 87 | 88 | return True, "Content is valid" 89 | 90 | 91 | def texts_are_similar(text1, text2, threshold=0.8): 92 | """ 93 | Check if two texts are similar enough. 94 | 95 | Args: 96 | text1: First text. 97 | text2: Second text. 98 | threshold: Similarity threshold (0-1). 99 | 100 | Returns: 101 | True if the texts are similar enough, False otherwise. 
102 | """ 103 | if not text1 or not text2: 104 | return text1 == text2 105 | 106 | # Simple approach: check if enough words from one text appear in the other 107 | words1 = set(text1.lower().split()) 108 | words2 = set(text2.lower().split()) 109 | 110 | # Calculate overlap ratio 111 | overlap = len(words1.intersection(words2)) 112 | similarity = overlap / max(len(words1), len(words2)) 113 | 114 | return similarity >= threshold 115 | 116 | 117 | def transcripts_are_similar(transcript1, transcript2, threshold=0.8): 118 | """ 119 | Check if two transcripts are similar enough. 120 | 121 | Args: 122 | transcript1: First transcript (list of dicts with 'text' keys). 123 | transcript2: Second transcript (list of dicts with 'text' keys). 124 | threshold: Similarity threshold (0-1). 125 | 126 | Returns: 127 | True if the transcripts are similar enough, False otherwise. 128 | """ 129 | if not transcript1 or not transcript2: 130 | return transcript1 == transcript2 131 | 132 | # Extract text from both transcripts 133 | text1 = " ".join([item.get('text', '') for item in transcript1]) 134 | text2 = " ".join([item.get('text', '') for item in transcript2]) 135 | 136 | return texts_are_similar(text1, text2, threshold) -------------------------------------------------------------------------------- /scripts/start_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script runs a validator process and automatically updates it when a new version is released. 3 | Command-line arguments will be forwarded to validator (`neurons/validator.py`), so you can pass 4 | them like this: 5 | python3 scripts/start_validator.py --wallet.name=my-wallet 6 | Auto-updates are enabled by default and will make sure that the latest version is always running 7 | by pulling the latest version from git and upgrading python packages. This is done periodically. 8 | Local changes may prevent the update, but they will be preserved. 9 | 10 | To disable auto-updates, pass --no_autoupdate. 11 | 12 | The script will use the same virtual environment as the one used to run it. If you want to run 13 | validator within virtual environment, run this auto-update script from the virtual environment. 14 | 15 | Pm2 is required for this script. This script will start a pm2 process using the name provided by 16 | the --pm2_name argument. 17 | """ 18 | import argparse 19 | import logging 20 | import subprocess 21 | import sys 22 | import time 23 | from datetime import timedelta 24 | from pathlib import Path 25 | from shlex import split 26 | from typing import List 27 | 28 | log = logging.getLogger(__name__) 29 | UPDATES_CHECK_TIME = timedelta(minutes=15) 30 | ROOT_DIR = Path(__file__).parent.parent 31 | 32 | 33 | def get_version() -> str: 34 | """Extract the version as current git commit hash""" 35 | result = subprocess.run( 36 | split("git rev-parse HEAD"), 37 | check=True, 38 | capture_output=True, 39 | cwd=ROOT_DIR, 40 | ) 41 | commit = result.stdout.decode().strip() 42 | assert len(commit) == 40, f"Invalid commit hash: {commit}" 43 | return commit[:8] 44 | 45 | 46 | def start_validator_process(pm2_name: str, args: List[str]) -> subprocess.Popen: 47 | """ 48 | Spawn a new python process running neurons.validator. 49 | `sys.executable` ensures thet the same python interpreter is used as the one 50 | used to run this auto-updater. 
51 | """ 52 | assert sys.executable, "Failed to get python executable" 53 | 54 | log.info("Starting validator process with pm2, name: %s", pm2_name) 55 | process = subprocess.Popen( 56 | ( 57 | "pm2", 58 | "start", 59 | sys.executable, 60 | "--name", 61 | pm2_name, 62 | "--", 63 | "-m", 64 | "neurons.validator", 65 | *args, 66 | ), 67 | cwd=ROOT_DIR, 68 | ) 69 | process.pm2_name = pm2_name 70 | 71 | return process 72 | 73 | 74 | def stop_validator_process(process: subprocess.Popen) -> None: 75 | """Stop the validator process""" 76 | subprocess.run(("pm2", "delete", process.pm2_name), cwd=ROOT_DIR, check=True) 77 | 78 | 79 | def pull_latest_version() -> None: 80 | """ 81 | Pull the latest version from git. 82 | This uses `git pull --rebase`, so if any changes were made to the local repository, 83 | this will try to apply them on top of origin's changes. This is intentional, as we 84 | don't want to overwrite any local changes. However, if there are any conflicts, 85 | this will abort the rebase and return to the original state. 86 | The conflicts are expected to happen rarely since validator is expected 87 | to be used as-is. 88 | """ 89 | try: 90 | subprocess.run(split("git pull --rebase --autostash"), check=True, cwd=ROOT_DIR) 91 | except subprocess.CalledProcessError as exc: 92 | log.error("Failed to pull, reverting: %s", exc) 93 | subprocess.run(split("git rebase --abort"), check=True, cwd=ROOT_DIR) 94 | 95 | 96 | def upgrade_packages() -> None: 97 | """ 98 | Upgrade python packages by running `pip install --upgrade -r requirements.txt`. 99 | Notice: this won't work if some package in `requirements.txt` is downgraded. 100 | Ignored as this is unlikely to happen. 101 | """ 102 | 103 | log.info("Upgrading packages") 104 | try: 105 | subprocess.run( 106 | split(f"{sys.executable} -m pip install -e ."), 107 | check=True, 108 | cwd=ROOT_DIR, 109 | ) 110 | except subprocess.CalledProcessError as exc: 111 | log.error("Failed to upgrade packages, proceeding anyway. %s", exc) 112 | 113 | 114 | def main(pm2_name: str, args: List[str]) -> None: 115 | """ 116 | Run the validator process and automatically update it when a new version is released. 117 | This will check for updates every `UPDATES_CHECK_TIME` and update the validator 118 | if a new version is available. Update is performed as simple `git pull --rebase`. 
119 | """ 120 | 121 | validator = start_validator_process(pm2_name, args) 122 | current_version = latest_version = get_version() 123 | log.info("Current version: %s", current_version) 124 | 125 | try: 126 | while True: 127 | pull_latest_version() 128 | latest_version = get_version() 129 | log.info("Latest version: %s", latest_version) 130 | 131 | if latest_version != current_version: 132 | log.info( 133 | "Upgraded to latest version: %s -> %s", 134 | current_version, 135 | latest_version, 136 | ) 137 | upgrade_packages() 138 | 139 | stop_validator_process(validator) 140 | validator = start_validator_process(pm2_name, args) 141 | current_version = latest_version 142 | 143 | time.sleep(UPDATES_CHECK_TIME.total_seconds()) 144 | 145 | finally: 146 | stop_validator_process(validator) 147 | 148 | 149 | if __name__ == "__main__": 150 | logging.basicConfig( 151 | level=logging.INFO, 152 | format="%(asctime)s %(levelname)s %(message)s", 153 | handlers=[logging.StreamHandler(sys.stdout)], 154 | ) 155 | 156 | parser = argparse.ArgumentParser( 157 | description="Automatically update and restart the validator process when a new version is released.", 158 | epilog="Example usage: python start_validator.py --pm2_name 'net13vali' --wallet_name 'wallet1' --wallet_hotkey 'key123' [--no-autoupdate]", 159 | ) 160 | 161 | parser.add_argument( 162 | "--pm2_name", default="net13vali", help="Name of the PM2 process." 163 | ) 164 | 165 | flags, extra_args = parser.parse_known_args() 166 | 167 | main(flags.pm2_name, extra_args) 168 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2023 Data Universe 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 
17 | 18 | import re 19 | import os 20 | import codecs 21 | import pathlib 22 | from os import path 23 | from io import open 24 | from setuptools import setup, find_packages 25 | from pkg_resources import parse_requirements 26 | 27 | 28 | def read_requirements(path): 29 | with open(path, "r") as f: 30 | requirements = f.read().splitlines() 31 | processed_requirements = [] 32 | 33 | for req in requirements: 34 | # For git or other VCS links 35 | if req.startswith("git+") or "@" in req: 36 | pkg_name = re.search(r"(#egg=)([\w\-_]+)", req) 37 | if pkg_name: 38 | processed_requirements.append(pkg_name.group(2)) 39 | else: 40 | # You may decide to raise an exception here, 41 | # if you want to ensure every VCS link has an #egg= at the end 42 | continue 43 | else: 44 | processed_requirements.append(req) 45 | return processed_requirements 46 | 47 | 48 | requirements = read_requirements("requirements.txt") 49 | here = path.abspath(path.dirname(__file__)) 50 | 51 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 52 | long_description = f.read() 53 | 54 | # loading version from setup.py 55 | with codecs.open( 56 | os.path.join(here, "neurons/__init__.py"), 57 | encoding="utf-8", 58 | ) as init_file: 59 | version_match = re.search( 60 | r"^__version__ = ['\"]([^'\"]*)['\"]", init_file.read(), re.M 61 | ) 62 | version_string = version_match.group(1) 63 | 64 | setup( 65 | name="bittensor_data_universe", 66 | version=version_string, 67 | description="Data Universe is a Bittensor subnet for collecting and storing large amounts of data from across a wide-range of sources, for use by other Subnets.", 68 | long_description=long_description, 69 | long_description_content_type="text/markdown", 70 | url="https://github.com/RusticLuftig/data-universe", 71 | author="Data Universe Team", 72 | packages=find_packages(), 73 | include_package_data=True, 74 | author_email="sid.data.universe@gmail.com", 75 | license="MIT", 76 | python_requires=">=3.10", 77 | install_requires=requirements, 78 | classifiers=[ 79 | "Development Status :: 3 - Alpha", 80 | "Intended Audience :: Developers", 81 | "Topic :: Software Development :: Build Tools", 82 | "License :: OSI Approved :: MIT License", 83 | "Programming Language :: Python :: 3 :: Only", 84 | "Programming Language :: Python :: 3.10", 85 | "Topic :: Scientific/Engineering", 86 | "Topic :: Scientific/Engineering :: Mathematics", 87 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 88 | "Topic :: Software Development", 89 | "Topic :: Software Development :: Libraries", 90 | "Topic :: Software Development :: Libraries :: Python Modules", 91 | ], 92 | ) 93 | -------------------------------------------------------------------------------- /storage/miner/miner_storage.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from common.data import ( 3 | CompressedMinerIndex, 4 | DataEntity, 5 | DataEntityBucketId, 6 | ) 7 | from typing import Dict, List 8 | import datetime as dt 9 | 10 | 11 | class MinerStorage(ABC): 12 | """An abstract class which defines the contract that all implementations of MinerStorage must fulfill.""" 13 | 14 | @abstractmethod 15 | def store_data_entities(self, data_entities: List[DataEntity]): 16 | """Stores any number of DataEntities, making space if necessary.""" 17 | raise NotImplemented 18 | 19 | @abstractmethod 20 | def list_data_entities_in_data_entity_bucket( 21 | self, data_entity_bucket_id: DataEntityBucketId 22 | ) -> List[DataEntity]: 23 | 
"""Lists from storage all DataEntities matching the provided DataEntityBucket.""" 24 | raise NotImplemented 25 | 26 | @abstractmethod 27 | def get_compressed_index(self) -> CompressedMinerIndex: 28 | """Gets the compressed MinedIndex, which is a summary of all of the DataEntities that this MinerStorage is currently serving.""" 29 | raise NotImplemented 30 | 31 | @abstractmethod 32 | def refresh_compressed_index(self, date_time: dt.timedelta): 33 | """Refreshes the compressed MinerIndex.""" 34 | raise NotImplemented 35 | 36 | @abstractmethod 37 | def list_contents_in_data_entity_buckets( 38 | self, data_entity_bucket_ids: List[DataEntityBucketId] 39 | ) -> Dict[DataEntityBucketId, List[bytes]]: 40 | """Lists contents for each requested DataEntityBucketId. 41 | Args: 42 | data_entity_bucket_ids (List[DataEntityBucketId]): Which buckets to get contents for. 43 | Returns: 44 | Dict[DataEntityBucketId, List[bytes]]: Map of each bucket id to contained contents. 45 | """ 46 | raise NotImplemented 47 | -------------------------------------------------------------------------------- /storage/validator/hf_validator_storage.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | 6 | 7 | class HFValidationStorage: 8 | def __init__(self, storage_path): 9 | self.file_path = storage_path 10 | self._ensure_file_exists() 11 | 12 | def _ensure_file_exists(self): 13 | if not os.path.exists(self.file_path): 14 | self._create_empty_dataframe() 15 | 16 | def _create_empty_dataframe(self): 17 | df = pd.DataFrame(columns=['hotkey', 'repo_name', 'block']) 18 | self._safe_write_parquet(df) 19 | 20 | def _safe_write_parquet(self, df): 21 | temp_file = f"{self.file_path}.temp" 22 | try: 23 | table = pa.Table.from_pandas(df) 24 | pq.write_table(table, temp_file) 25 | os.replace(temp_file, self.file_path) 26 | except Exception as e: 27 | if os.path.exists(temp_file): 28 | os.remove(temp_file) 29 | raise e 30 | 31 | def _safe_read_parquet(self): 32 | try: 33 | return pd.read_parquet(self.file_path) 34 | except Exception as e: 35 | print(f"Error reading Parquet file: {e}") 36 | print("Attempting to recover data...") 37 | return self._recover_data() 38 | 39 | def _recover_data(self): 40 | try: 41 | table = pq.read_table(self.file_path) 42 | return table.to_pandas() 43 | except Exception as e: 44 | print(f"Recovery failed: {e}") 45 | print("Creating a new empty dataframe.") 46 | return pd.DataFrame(columns=['hotkey', 'repo_name', 'block']) 47 | 48 | def get_validation_info(self, hotkey): 49 | df = self._safe_read_parquet() 50 | matching_rows = df[df['hotkey'] == hotkey] 51 | return matching_rows.to_dict('records')[0] if not matching_rows.empty else None 52 | 53 | def update_validation_info(self, hotkey, repo_name, block): 54 | df = self._safe_read_parquet() 55 | new_row = pd.DataFrame({'hotkey': [hotkey], 'repo_name': [repo_name], 'block': [block]}) 56 | df = pd.concat([df[df['hotkey'] != hotkey], new_row], ignore_index=True) 57 | self._safe_write_parquet(df) 58 | 59 | def get_all_validations(self): 60 | return self._safe_read_parquet() -------------------------------------------------------------------------------- /storage/validator/validator_storage.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from common.data import CompressedMinerIndex 3 | from typing import Optional 4 | import datetime as dt 5 | 6 | from 
common.data_v2 import ScorableMinerIndex 7 | 8 | 9 | class ValidatorStorage(ABC): 10 | """An abstract class which defines the contract that all implementations of ValidatorStorage must fulfill.""" 11 | 12 | @abstractmethod 13 | def upsert_compressed_miner_index( 14 | self, index: CompressedMinerIndex, hotkey: str, credibility: float = 0 15 | ): 16 | """Stores the index for all of the data that a specific miner promises to provide.""" 17 | raise NotImplemented 18 | 19 | @abstractmethod 20 | def read_miner_index(self, miner_hotkey: str) -> Optional[ScorableMinerIndex]: 21 | """Gets a scored index for all of the data that a specific miner promises to provide.""" 22 | raise NotImplemented 23 | 24 | @abstractmethod 25 | def delete_miner(self, miner_hotkey: str): 26 | """Removes the index and miner information for the specified miner.""" 27 | raise NotImplemented 28 | 29 | @abstractmethod 30 | def read_miner_last_updated(self, miner_hotkey: str) -> Optional[dt.datetime]: 31 | """Gets when a specific miner was last updated.""" 32 | raise NotImplemented 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/__init__.py -------------------------------------------------------------------------------- /tests/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/common/__init__.py -------------------------------------------------------------------------------- /tests/common/test_data.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import random 3 | import string 4 | import time 5 | 6 | from common import constants, utils 7 | 8 | from common.data import ( 9 | CompressedEntityBucket, 10 | CompressedMinerIndex, 11 | DataLabel, 12 | DataSource, 13 | TimeBucket, 14 | ) 15 | import unittest 16 | 17 | from common.protocol import GetMinerIndex 18 | from pydantic import ValidationError 19 | 20 | 21 | class TestData(unittest.TestCase): 22 | def test_time_bucket_to_date_range(self): 23 | """Tests a Timebucket's date range function""" 24 | 25 | # Create a datetime that should align with the start of a time bucket. 26 | datetime = dt.datetime.fromtimestamp(36000, tz=dt.timezone.utc) 27 | time_bucket = TimeBucket.from_datetime(datetime) 28 | 29 | date_range = TimeBucket.to_date_range(time_bucket) 30 | 31 | for i in range(0, 60): 32 | self.assertTrue(date_range.contains(datetime + dt.timedelta(minutes=i))) 33 | 34 | self.assertFalse(date_range.contains(datetime + dt.timedelta(minutes=60))) 35 | 36 | def test_data_source_init(self): 37 | """Tests that the data source enum can be initialized""" 38 | source = 1 39 | self.assertEqual(DataSource.REDDIT, DataSource(source)) 40 | 41 | def test_compressed_index_bucket_count(self): 42 | """Tests that the compressed version of Miner index can get bucket count.""" 43 | # Make 5 compressed buckets per source, each containing 5 unique time bucket ids of size 10. 
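# (2 sources x 5 labels x 5 time bucket ids = 50 buckets in total, matching the assertEqual below.)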
44 | sources = {} 45 | for source in [DataSource.REDDIT, DataSource.X]: 46 | compressed_buckets = [None] * 5 47 | for label_i in range(0, 5): 48 | label = "label" + str(label_i) 49 | compressed_buckets[label_i] = CompressedEntityBucket( 50 | label=label, 51 | time_bucket_ids=[i for i in range(1, 6)], 52 | sizes_bytes=[10 for i in range(1, 6)], 53 | ) 54 | sources[int(source)] = compressed_buckets 55 | 56 | index = CompressedMinerIndex(sources=sources) 57 | 58 | self.assertEqual(CompressedMinerIndex.bucket_count(index), 50) 59 | 60 | def test_compressed_index_size_bytes(self): 61 | """Tests that the compressed version of Miner index can get size in bytes.""" 62 | # Make 5 compressed buckets per source, each containing 5 unique time bucket ids of size 10. 63 | sources = {} 64 | for source in [DataSource.REDDIT, DataSource.X]: 65 | compressed_buckets = [None] * 5 66 | for label_i in range(0, 5): 67 | label = "label" + str(label_i) 68 | compressed_buckets[label_i] = CompressedEntityBucket( 69 | label=label, 70 | time_bucket_ids=[i for i in range(1, 6)], 71 | sizes_bytes=[10 for i in range(1, 6)], 72 | ) 73 | sources[int(source)] = compressed_buckets 74 | 75 | index = CompressedMinerIndex(sources=sources) 76 | 77 | self.assertEqual(CompressedMinerIndex.bucket_count(index), 50) 78 | 79 | def test_compressed_index_supports_max_index(self): 80 | """Tests that the compressed version of the maximal Miner index is under our response size limit.""" 81 | 82 | target_buckets = ( 83 | constants.DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_4 84 | ) 85 | 86 | # Figure out how many time buckets and labels we need to fill the index. 87 | buckets_per_source = target_buckets // 2 # Twitter/Reddit 88 | num_time_buckets = constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS * 7 * 24 89 | num_labels = buckets_per_source // num_time_buckets 90 | 91 | # Double check the math 92 | total_buckets = 2 * num_time_buckets * num_labels 93 | self.assertAlmostEqual( 94 | target_buckets, 95 | total_buckets, 96 | delta=target_buckets * 0.05, 97 | ) 98 | 99 | start = time.time() 100 | sources = {} 101 | 102 | def generate_random_string(length): 103 | # Combine letters and digits for the random string 104 | characters = string.ascii_letters + string.digits 105 | return "".join(random.choice(characters) for _ in range(length)) 106 | 107 | for source in [DataSource.REDDIT, DataSource.X]: 108 | compressed_buckets = [None] * num_labels 109 | for label_i in range(0, num_labels): 110 | label = generate_random_string(random.randint(4, 32)) 111 | compressed_buckets[label_i] = CompressedEntityBucket( 112 | label=label, 113 | time_bucket_ids=[i for i in range(1, num_time_buckets + 1)], 114 | sizes_bytes=[ 115 | random.randint(1, 112345678) 116 | for i in range(1, num_time_buckets + 1) 117 | ], 118 | ) 119 | sources[int(source)] = compressed_buckets 120 | 121 | print(f"Time to create index: {time.time() - start}") 122 | maximal_index = CompressedMinerIndex( 123 | sources=sources, 124 | ) 125 | 126 | start = time.time() 127 | serialized_compressed_index = maximal_index.json() 128 | print(f"Time to serialize index: {time.time() - start}") 129 | 130 | start = time.time() 131 | get_miner_index = GetMinerIndex( 132 | compressed_index_serialized=serialized_compressed_index 133 | ) 134 | print(f"Time to create synapse: {time.time() - start}") 135 | 136 | start = time.time() 137 | compressed_json = get_miner_index.json() 138 | print(f"Time to serialize synapse: {time.time() - start}") 139 | print(f"Compressed index size: 
{len(compressed_json)}") 140 | self.assertLess(len(compressed_json), utils.mb_to_bytes(mb=128)) 141 | 142 | start = time.time() 143 | deserialized_index = GetMinerIndex.parse_raw(compressed_json) 144 | deserialized_compressed_index = CompressedMinerIndex.parse_raw( 145 | deserialized_index.compressed_index_serialized 146 | ) 147 | print(f"Time to deserialize synapse: {time.time() - start}") 148 | 149 | # Verify the deserialized form is as expected. 150 | self.assertEqual(deserialized_compressed_index, maximal_index) 151 | 152 | def test_data_label_lower_validation(self): 153 | """Tests that the data label value is checked to be <32 characters even after .lower().""" 154 | with self.assertRaises(ValidationError): 155 | bad_label = DataLabel(value="#İsrailleTicaretFilistineİhanet") 156 | 157 | 158 | if __name__ == "__main__": 159 | unittest.main() 160 | -------------------------------------------------------------------------------- /tests/common/test_data_v2.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from common.data import DataEntityBucketId, DataLabel, DataSource, TimeBucket 3 | from common.data_v2 import ScorableDataEntityBucket, DataEntityBucket 4 | 5 | 6 | class TestDataV2(unittest.TestCase): 7 | def test_scorable_data_entity_bucket_to_data_entity(self): 8 | # Create a ScorableDataEntityBucket instance 9 | time_bucket_id = 123 10 | source = DataSource.REDDIT.value 11 | label = "EXAMPLE_label" 12 | size_bytes = 1000 13 | scorable_bytes = 500 14 | scorable_data_entity_bucket = ScorableDataEntityBucket( 15 | time_bucket_id=time_bucket_id, 16 | source=source, 17 | label=label, 18 | size_bytes=size_bytes, 19 | scorable_bytes=scorable_bytes, 20 | ) 21 | 22 | # Call the to_data_entity_bucket method 23 | data_entity_bucket = scorable_data_entity_bucket.to_data_entity_bucket() 24 | 25 | # Verify that the returned value is an instance of DataEntityBucket 26 | expected = DataEntityBucket( 27 | id=DataEntityBucketId( 28 | time_bucket=TimeBucket(id=time_bucket_id), 29 | source=source, 30 | label=DataLabel(value=label.casefold()), 31 | ), 32 | size_bytes=size_bytes, 33 | ) 34 | self.assertEqual(data_entity_bucket, expected) 35 | 36 | def test_scorable_data_entity_bucket_to_data_entity_none_label(self): 37 | # Create a ScorableDataEntityBucket instance 38 | time_bucket_id = 123 39 | source = DataSource.REDDIT.value 40 | size_bytes = 1000 41 | scorable_bytes = 500 42 | scorable_data_entity_bucket = ScorableDataEntityBucket( 43 | time_bucket_id=time_bucket_id, 44 | source=source, 45 | label=None, 46 | size_bytes=size_bytes, 47 | scorable_bytes=scorable_bytes, 48 | ) 49 | 50 | # Call the to_data_entity_bucket method 51 | data_entity_bucket = scorable_data_entity_bucket.to_data_entity_bucket() 52 | 53 | # Verify that the returned value is an instance of DataEntityBucket 54 | expected = DataEntityBucket( 55 | id=DataEntityBucketId( 56 | time_bucket=TimeBucket(id=time_bucket_id), 57 | source=source, 58 | label=None, 59 | ), 60 | size_bytes=size_bytes, 61 | ) 62 | self.assertEqual(data_entity_bucket, expected) 63 | 64 | def test_scorable_data_entity_bucket_equality(self): 65 | # Create two ScorableDataEntityBucket instances 66 | time_bucket_id = 123 67 | source = DataSource.REDDIT.value 68 | label = "EXAMPLE_label" 69 | size_bytes = 1000 70 | scorable_bytes = 500 71 | scorable_data_entity_bucket_1 = ScorableDataEntityBucket( 72 | time_bucket_id=time_bucket_id, 73 | source=source, 74 | label=label, 75 | size_bytes=size_bytes, 76 | 
scorable_bytes=scorable_bytes, 77 | ) 78 | scorable_data_entity_bucket_2 = ScorableDataEntityBucket( 79 | time_bucket_id=time_bucket_id, 80 | source=source, 81 | label=label, 82 | size_bytes=size_bytes, 83 | scorable_bytes=scorable_bytes, 84 | ) 85 | 86 | # Verify that the two instances are equal 87 | self.assertEqual(scorable_data_entity_bucket_1, scorable_data_entity_bucket_2) 88 | 89 | 90 | if __name__ == "__main__": 91 | unittest.main() 92 | -------------------------------------------------------------------------------- /tests/common/test_metagraph_syncer.py: -------------------------------------------------------------------------------- 1 | from curses import meta 2 | import threading 3 | from unittest import mock 4 | import unittest 5 | import bittensor as bt 6 | from common.metagraph_syncer import MetagraphSyncer 7 | 8 | 9 | class TestMetagraphSyncer(unittest.TestCase): 10 | def test_do_initial_sync(self): 11 | # Mock subtensor.metagraph() function 12 | metagraph1 = bt.metagraph(netuid=1, sync=False) 13 | metagraph2 = bt.metagraph(netuid=2, sync=False) 14 | 15 | def get_metagraph(netuid) -> bt.metagraph: 16 | if netuid == 1: 17 | return metagraph1 18 | elif netuid == 2: 19 | return metagraph2 20 | else: 21 | raise Exception("Invalid netuid") 22 | 23 | mock_subtensor = mock.MagicMock(spec=bt.subtensor) 24 | metagraph_mock = mock.MagicMock(side_effect=get_metagraph) 25 | mock_subtensor.metagraph = metagraph_mock 26 | 27 | # Create MetagraphSyncer instance with mock subtensor 28 | metagraph_syncer = MetagraphSyncer(mock_subtensor, {1: 1, 2: 1}) 29 | 30 | # Call do_initial_sync method 31 | metagraph_syncer.do_initial_sync() 32 | 33 | # Verify get_metagraph() returns the expected metagraph. 34 | # We can't check object equality because of how equality is done on bt.metagraph 35 | # so just check the netuid. 36 | self.assertEqual(metagraph_syncer.get_metagraph(1).netuid, metagraph1.netuid) 37 | self.assertEqual(metagraph_syncer.get_metagraph(2).netuid, metagraph2.netuid) 38 | 39 | def test_listener_called(self): 40 | # Mock subtensor.metagraph() function 41 | metagraph1 = bt.metagraph(netuid=1, sync=False) 42 | metagraph2 = bt.metagraph(netuid=2, sync=False) 43 | 44 | def get_metagraph(netuid) -> bt.metagraph: 45 | if netuid == 1: 46 | return metagraph1 47 | elif netuid == 2: 48 | return metagraph2 49 | else: 50 | raise Exception("Invalid netuid") 51 | 52 | mock_subtensor = mock.MagicMock(spec=bt.subtensor) 53 | metagraph_mock = mock.MagicMock(side_effect=get_metagraph) 54 | mock_subtensor.metagraph = metagraph_mock 55 | 56 | # Create MetagraphSyncer instance with mock subtensor 57 | metagraph_syncer = MetagraphSyncer(mock_subtensor, {1: 1, 2: 1}) 58 | 59 | # Call do_initial_sync method 60 | metagraph_syncer.do_initial_sync() 61 | 62 | # Register a listener for netuid 1. 63 | event = threading.Event() 64 | 65 | def listener(metagraph, netuid): 66 | self.assertEqual(metagraph.netuid, 1) 67 | self.assertEqual(netuid, 1) 68 | event.set() 69 | 70 | metagraph_syncer.register_listener(listener, [1]) 71 | 72 | # Since we sync every 1 second, verify the listener is called within 5 seconds. 
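# Note: event.wait(5) returns True only if the listener set the event before the timeout; a stricter test could assert that return value (e.g. self.assertTrue(event.wait(5))).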
73 | event.wait(5) 74 | 75 | 76 | if __name__ == "__main__": 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /tests/common/test_protocol.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Type 3 | import unittest 4 | import datetime as dt 5 | import bittensor as bt 6 | from common.data import ( 7 | CompressedEntityBucket, 8 | CompressedMinerIndex, 9 | DataEntity, 10 | DataEntityBucket, 11 | DataEntityBucketId, 12 | DataLabel, 13 | DataSource, 14 | TimeBucket, 15 | ) 16 | from common import old_protocol 17 | 18 | from common.protocol import GetDataEntityBucket, GetMinerIndex 19 | 20 | 21 | def serialize_like_dendrite(synapse: bt.Synapse) -> str: 22 | """Serializes a synapse like a Dendrite would.""" 23 | d = synapse.dict() 24 | return json.dumps(d) 25 | 26 | 27 | def serialize_like_axon(synapse: bt.Synapse) -> str: 28 | """Serializes a synapse like an Axon would.""" 29 | return serialize_like_dendrite(synapse) 30 | 31 | 32 | def deserialize(json_str: str, cls: Type) -> bt.Synapse: 33 | """Deserializes the same way a dendrite/axon does.""" 34 | d = json.loads(json_str) 35 | return cls(**d) 36 | 37 | 38 | class TestGetMinerIndex(unittest.TestCase): 39 | def test_get_miner_index_old_format_round_trip(self): 40 | """Tests that the old miner index format can be serialized/deserialized for transport.""" 41 | request = GetMinerIndex() 42 | json = request.json() 43 | print(json) 44 | deserialized = GetMinerIndex.parse_raw(json) 45 | self.assertEqual(request, deserialized) 46 | 47 | # Also check that the headers can be constructed. 48 | request.to_headers() 49 | 50 | # Now construct a response and check it. 51 | response = GetMinerIndex( 52 | data_entity_buckets=[ 53 | DataEntityBucket( 54 | id=DataEntityBucketId( 55 | time_bucket=TimeBucket(id=5), 56 | label=DataLabel(value="r/bittensor_"), 57 | source=DataSource.REDDIT, 58 | ), 59 | size_bytes=100, 60 | ), 61 | DataEntityBucket( 62 | id=DataEntityBucketId( 63 | time_bucket=TimeBucket(id=6), 64 | source=DataSource.X, 65 | ), 66 | size_bytes=200, 67 | ), 68 | ] 69 | ) 70 | 71 | serialized = serialize_like_axon(response) 72 | deserialized = deserialize(serialized, GetMinerIndex) 73 | self.assertEqual(response, deserialized) 74 | 75 | def test_get_miner_index_new_format_round_trip(self): 76 | """Tests that the compressed miner index can be serialized/deserialized for transport.""" 77 | 78 | request = GetMinerIndex() 79 | 80 | serialized = serialize_like_dendrite(request) 81 | deserialized = deserialize(serialized, GetMinerIndex) 82 | self.assertEqual(request, deserialized) 83 | 84 | # Also check that the headers can be constructed. 85 | request.to_headers() 86 | 87 | # Now construct a response and check it. 
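# The compressed form groups CompressedEntityBuckets per DataSource and is carried as a JSON string in compressed_index_serialized.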
88 | response = GetMinerIndex( 89 | compressed_index_serialized=CompressedMinerIndex( 90 | sources={ 91 | DataSource.REDDIT.value: [ 92 | CompressedEntityBucket( 93 | label="r/bittensor_", 94 | time_bucket_ids=[5, 6], 95 | sizes_bytes=[100, 200], 96 | ) 97 | ], 98 | DataSource.X.value: [ 99 | CompressedEntityBucket( 100 | time_bucket_ids=[10, 11, 12], sizes_bytes=[300, 400, 500] 101 | ), 102 | CompressedEntityBucket( 103 | label="#bittensor", time_bucket_ids=[5], sizes_bytes=[100] 104 | ), 105 | ], 106 | } 107 | ).json() 108 | ) 109 | 110 | serialized = serialize_like_axon(response) 111 | deserialized = deserialize(serialized, GetMinerIndex) 112 | self.assertEqual(response, deserialized) 113 | 114 | 115 | class TestGetDataEntityBucket(unittest.TestCase): 116 | def test_synapse_serialization(self): 117 | """Tests that the protocol messages can be serialized/deserialized for transport.""" 118 | request = GetDataEntityBucket( 119 | data_entity_bucket_id=DataEntityBucketId( 120 | time_bucket=TimeBucket.from_datetime(dt.datetime.utcnow()), 121 | label=DataLabel(value="r/bittensor_"), 122 | source=DataSource.REDDIT, 123 | ) 124 | ) 125 | json = request.json() 126 | print(json) 127 | deserialized = GetDataEntityBucket.parse_raw(json) 128 | self.assertEqual(request, deserialized) 129 | 130 | # Check that the enum is deserialized correctly 131 | self.assertEqual(deserialized.data_entity_bucket_id.source, DataSource.REDDIT) 132 | 133 | # Also check that the headers can be constructed. 134 | request.to_headers() 135 | 136 | # TODO: Add a test for the response. 137 | 138 | 139 | if __name__ == "__main__": 140 | unittest.main() 141 | -------------------------------------------------------------------------------- /tests/common/test_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import time 3 | import unittest 4 | 5 | from common.utils import run_in_thread 6 | 7 | 8 | class TestUtils(unittest.TestCase): 9 | def test_run_in_thread(self): 10 | def test_func(a: int, b: int): 11 | return a + b 12 | 13 | partial = functools.partial(test_func, 1, 2) 14 | 15 | result = run_in_thread(func=partial, ttl=5) 16 | self.assertEqual(3, result) 17 | 18 | def test_run_in_thread_timeout(self): 19 | def test_func(a: int, b: int): 20 | time.sleep(3) 21 | return a + b 22 | 23 | partial = functools.partial(test_func, 1, 2) 24 | 25 | with self.assertRaises(TimeoutError): 26 | result = run_in_thread(func=partial, ttl=1) 27 | 28 | def test_run_in_thread_no_return(self): 29 | def test_func(a: int, b: int): 30 | pass 31 | 32 | partial = functools.partial(test_func, 1, 2) 33 | 34 | result = run_in_thread(func=partial, ttl=5) 35 | self.assertIsNone(result) 36 | 37 | def test_run_in_thread_tuple_return(self): 38 | def test_func(a: int, b: int): 39 | return a, b 40 | 41 | partial = functools.partial(test_func, 1, 2) 42 | 43 | result = run_in_thread(func=partial, ttl=5) 44 | self.assertEqual((1, 2), result) 45 | 46 | def test_run_in_thread_exception(self): 47 | def test_func(a: int, b: int): 48 | raise ValueError() 49 | 50 | partial = functools.partial(test_func, 1, 2) 51 | 52 | with self.assertRaises(ValueError): 53 | result = run_in_thread(func=partial, ttl=5) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /tests/hf_validation/test_encoding_key.json: -------------------------------------------------------------------------------- 1 | {"sym_key": 
"XdcRI9sPT2e43a8fda53H13HpGqpQLTZHHeIoHPtIMI="} -------------------------------------------------------------------------------- /tests/hf_validation/test_reddit_dataset_validation.py: -------------------------------------------------------------------------------- 1 | """Module for selecting and processing random rows from Hugging Face datasets.""" 2 | 3 | import random 4 | from typing import List, Dict, Any 5 | import asyncio 6 | 7 | import bittensor as bt 8 | import requests 9 | import pandas as pd 10 | from datasets import load_dataset 11 | import itertools 12 | from huggingface_utils.encoding_system import EncodingKeyManager, decode_url 13 | from scraping.reddit.reddit_custom_scraper import RedditCustomScraper 14 | 15 | 16 | def get_parquet_files(repo_id: str) -> List[str]: 17 | """ 18 | Fetch a list of parquet files from a Hugging Face dataset repository. 19 | 20 | Args: 21 | repo_id (str): The Hugging Face dataset repository ID. 22 | 23 | Returns: 24 | List[str]: A list of parquet file paths. 25 | 26 | Raises: 27 | requests.RequestException: If the API request fails. 28 | """ 29 | api_url = f"https://huggingface.co/api/datasets/{repo_id}/tree/main/data" 30 | try: 31 | response = requests.get(api_url) 32 | response.raise_for_status() 33 | files = [item['path'] for item in response.json() if item['path'].endswith('.parquet')] 34 | return files 35 | except requests.RequestException as e: 36 | raise requests.RequestException(f"Failed to fetch file list: {e}") 37 | 38 | def select_random_rows_from_parquet(repo_id: str, num_rows: int = 10, buffer_size: int = 10_000) -> pd.DataFrame: 39 | """ 40 | Efficiently select random rows from a randomly chosen parquet file in a Hugging Face dataset 41 | using a streaming approach with shuffling. 42 | 43 | Args: 44 | repo_id (str): The Hugging Face dataset repository ID. 45 | num_rows (int, optional): Number of random rows to select. Defaults to 10. 46 | buffer_size (int, optional): Size of the buffer for shuffling. Defaults to 10,000. 47 | 48 | Returns: 49 | pd.DataFrame: A DataFrame containing the randomly selected rows. 50 | 51 | Raises: 52 | ValueError: If no parquet files are found in the dataset. 
53 | """ 54 | parquet_files = get_parquet_files(repo_id) 55 | 56 | if not parquet_files: 57 | raise ValueError("No parquet files found in the dataset.") 58 | 59 | selected_file = random.choice(parquet_files) 60 | bt.logging.trace(f"Selected file: {selected_file}") 61 | 62 | # Load the dataset in streaming mode 63 | dataset = load_dataset( 64 | repo_id, 65 | data_files={'train': selected_file}, 66 | split='train', 67 | streaming=True 68 | ) 69 | 70 | # Generate random seed 71 | random_seed = random.randint(0, 2 ** 32 - 1) 72 | # Shuffle the dataset 73 | shuffled_dataset = dataset.shuffle(buffer_size=buffer_size, seed=random_seed) 74 | 75 | # Select the specified number of rows 76 | selected_rows = list(itertools.islice(shuffled_dataset, num_rows)) 77 | 78 | # Convert to DataFrame 79 | df = pd.DataFrame(selected_rows) 80 | 81 | # Decode encrypted columns 82 | key_manager = EncodingKeyManager(key_path='/Users/volodymyrtruba/data-universe/tests/hf_validation/test_encoding_key.json') 83 | fernet = key_manager.get_fernet() 84 | 85 | for column in ['url_encoded', 'username_encoded']: 86 | if column in df.columns: 87 | df[column.replace('_encoded', '')] = df[column].apply(lambda x: decode_url(x, fernet)) 88 | df = df.drop(columns=[column]) 89 | 90 | bt.logging.trace(df) 91 | 92 | return df 93 | 94 | 95 | async def main(): 96 | """Main function to demonstrate the usage of the script.""" 97 | repo_id = "arrmlet/reddit_dataset_123456" 98 | bt.logging.set_trace(True) 99 | try: 100 | selected_rows = select_random_rows_from_parquet(repo_id) 101 | print(selected_rows) 102 | s = selected_rows.to_dict(orient='records') 103 | scrapper = RedditCustomScraper() 104 | valid = await scrapper.validate_hf(entities=s) 105 | bt.logging.info(f"Number of rows: {len(selected_rows)}") 106 | bt.logging.info(valid) 107 | 108 | except (requests.RequestException, ValueError) as e: 109 | bt.logging.trace(f"An error occurred: {e}") 110 | 111 | if __name__ == "__main__": 112 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/hf_validation/test_x_dataset_validation.py: -------------------------------------------------------------------------------- 1 | """Module for selecting and processing random rows from Hugging Face datasets.""" 2 | 3 | import random 4 | from typing import List, Dict, Any 5 | import asyncio 6 | 7 | import bittensor as bt 8 | import requests 9 | import pandas as pd 10 | from datasets import load_dataset 11 | import itertools 12 | from huggingface_utils.encoding_system import EncodingKeyManager, decode_url 13 | from scraping.x.apidojo_scraper import ApiDojoTwitterScraper 14 | 15 | def get_parquet_files(repo_id: str) -> List[str]: 16 | """ 17 | Fetch a list of parquet files from a Hugging Face dataset repository. 18 | 19 | Args: 20 | repo_id (str): The Hugging Face dataset repository ID. 21 | 22 | Returns: 23 | List[str]: A list of parquet file paths. 24 | 25 | Raises: 26 | requests.RequestException: If the API request fails. 
27 | """ 28 | api_url = f"https://huggingface.co/api/datasets/{repo_id}/tree/main/data" 29 | try: 30 | response = requests.get(api_url) 31 | response.raise_for_status() 32 | files = [item['path'] for item in response.json() if item['path'].endswith('.parquet')] 33 | return files 34 | except requests.RequestException as e: 35 | raise requests.RequestException(f"Failed to fetch file list: {e}") 36 | 37 | def select_random_rows_from_parquet(repo_id: str, num_rows: int = 10, buffer_size: int = 10_000) -> pd.DataFrame: 38 | """ 39 | Efficiently select random rows from a randomly chosen parquet file in a Hugging Face dataset 40 | using a streaming approach with shuffling. 41 | 42 | Args: 43 | repo_id (str): The Hugging Face dataset repository ID. 44 | num_rows (int, optional): Number of random rows to select. Defaults to 10. 45 | buffer_size (int, optional): Size of the buffer for shuffling. Defaults to 10,000. 46 | 47 | Returns: 48 | pd.DataFrame: A DataFrame containing the randomly selected rows. 49 | 50 | Raises: 51 | ValueError: If no parquet files are found in the dataset. 52 | """ 53 | parquet_files = get_parquet_files(repo_id) 54 | 55 | if not parquet_files: 56 | raise ValueError("No parquet files found in the dataset.") 57 | 58 | selected_file = random.choice(parquet_files) 59 | bt.logging.trace(f"Selected file: {selected_file}") 60 | 61 | # Load the dataset in streaming mode 62 | dataset = load_dataset( 63 | repo_id, 64 | data_files={'train': selected_file}, 65 | split='train', 66 | streaming=True 67 | ) 68 | 69 | # Generate random seed 70 | random_seed = random.randint(0, 2 ** 32 - 1) 71 | # Shuffle the dataset 72 | shuffled_dataset = dataset.shuffle(buffer_size=buffer_size, seed=random_seed) 73 | 74 | # Select the specified number of rows 75 | selected_rows = list(itertools.islice(shuffled_dataset, num_rows)) 76 | 77 | # Convert to DataFrame 78 | df = pd.DataFrame(selected_rows) 79 | 80 | # Decode encrypted columns 81 | key_manager = EncodingKeyManager(key_path='/Users/volodymyrtruba/data-universe/tests/hf_validation/test_encoding_key.json') 82 | fernet = key_manager.get_fernet() 83 | 84 | for column in ['url_encoded', 'username_encoded']: 85 | if column in df.columns: 86 | df[column.replace('_encoded', '')] = df[column].apply(lambda x: decode_url(x, fernet)) 87 | df = df.drop(columns=[column]) 88 | 89 | bt.logging.trace(df) 90 | 91 | return df 92 | 93 | 94 | async def main(): 95 | """Main function to demonstrate the usage of the script.""" 96 | repo_id = "arrmlet/x_dataset_123456" 97 | 98 | try: 99 | selected_rows = select_random_rows_from_parquet(repo_id) 100 | s = selected_rows.to_dict(orient='records') 101 | scrapper = ApiDojoTwitterScraper() 102 | valid = await scrapper.validate_hf(entities=s) 103 | bt.logging.info(f"Number of rows: {len(selected_rows)}") 104 | bt.logging.info(valid) 105 | 106 | except (requests.RequestException, ValueError) as e: 107 | bt.logging.trace(f"An error occurred: {e}") 108 | 109 | if __name__ == "__main__": 110 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/integration/test_on_demand.py: -------------------------------------------------------------------------------- 1 | 
import unittest 2 | import asyncio 3 | import bittensor as bt 4 | import datetime as dt 5 | import random 6 | from common.data import DataLabel, DataSource, DataEntity 7 | from common.protocol import OnDemandRequest 8 | from common.date_range import DateRange 9 | from scraping.scraper import ScrapeConfig 10 | from scraping.x.apidojo_scraper import ApiDojoTwitterScraper 11 | 12 | 13 | class TestOnDemandProtocol(unittest.TestCase): 14 | def test_on_demand_flow(self): 15 | """Test the complete on-demand data flow""" 16 | 17 | async def run_test(): 18 | # Create OnDemand request 19 | test_request = OnDemandRequest( 20 | source=DataSource.X, 21 | keywords=["#TAO"], 22 | start_date=(dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=1)).isoformat(), 23 | end_date=dt.datetime.now(dt.timezone.utc).isoformat(), 24 | limit=5 25 | ) 26 | 27 | # Set up scraper 28 | scraper = ApiDojoTwitterScraper() 29 | 30 | # Create scrape config from request 31 | scrape_config = ScrapeConfig( 32 | entity_limit=test_request.limit, 33 | date_range=DateRange( 34 | start=dt.datetime.fromisoformat(test_request.start_date), 35 | end=dt.datetime.fromisoformat(test_request.end_date) 36 | ), 37 | labels=[DataLabel(value=k) for k in test_request.keywords] 38 | ) 39 | 40 | # Get data using scraper 41 | data = await scraper.scrape(scrape_config) 42 | 43 | # Verify data was retrieved 44 | self.assertTrue(len(data) > 0, "No data returned from scraper") 45 | 46 | # Select 1 random samples for validation 47 | if data: 48 | samples = random.sample(data, min(1, len(data))) 49 | validation_results = await scraper.validate(samples) 50 | print(data) 51 | # Check if any validation passed 52 | self.assertTrue( 53 | any(result.is_valid for result in validation_results), 54 | "All validation failed for sample data" 55 | ) 56 | 57 | # Run the async test 58 | asyncio.run(run_test()) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest.main() -------------------------------------------------------------------------------- /tests/neurons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/neurons/__init__.py -------------------------------------------------------------------------------- /tests/neurons/test_miner_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from unittest.mock import patch 4 | 5 | from neurons.miner import Miner 6 | 7 | 8 | class TestMinerConfig(unittest.TestCase): 9 | def test_miner_config(self): 10 | with patch.object( 11 | sys, 12 | "argv", 13 | [ 14 | "miner.py", 15 | "--neuron.database_name", 16 | "mydb", 17 | "--subtensor.network", 18 | "test", 19 | ], 20 | ): 21 | miner = Miner() 22 | config = miner.get_config_for_test() 23 | 24 | self.assertEqual(config.neuron.database_name, "mydb") 25 | # Check the default values are still there. 
26 | self.assertEqual(config.neuron.max_database_size_gb_hint, 250) 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/neurons/test_validator_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from unittest.mock import patch 4 | from neurons.config import NeuronType, create_config 5 | 6 | 7 | class TestValidatorConfig(unittest.TestCase): 8 | def test_validator_config(self): 9 | with patch.object( 10 | sys, 11 | "argv", 12 | [ 13 | "validator.py", 14 | "--subtensor.network", 15 | "test", 16 | ], 17 | ): 18 | config = create_config(NeuronType.VALIDATOR) 19 | 20 | # Check the default values are still there. 21 | self.assertEqual(config.neuron.axon_off, False) 22 | self.assertEqual(config.subtensor.network, "test") 23 | 24 | 25 | if __name__ == "__main__": 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /tests/rewards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/rewards/__init__.py -------------------------------------------------------------------------------- /tests/scraping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/__init__.py -------------------------------------------------------------------------------- /tests/scraping/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/config/__init__.py -------------------------------------------------------------------------------- /tests/scraping/config/invalid_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "scraper_configs": [ 3 | { 4 | "scraper_id": "bogus", 5 | "cadence_seconds": 300, 6 | "labels_to_scrape": [ 7 | { 8 | "label_choices": [ 9 | "#bittensor", 10 | "#tao" 11 | ], 12 | "max_age_hint_minutes": 1440, 13 | "max_data_entities": 100 14 | } 15 | ] 16 | }, 17 | { 18 | "scraper_id": "Reddit.lite", 19 | "cadence_seconds": 900, 20 | "labels_to_scrape": [ 21 | { 22 | "label_choices": [ 23 | "r/bittensor_", 24 | "r/bitcoin" 25 | ], 26 | "max_data_entities": 50 27 | } 28 | ] 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /tests/scraping/config/test_config_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from unittest.mock import patch 4 | from common import constants 5 | from common.data import DataLabel, DataSource 6 | from scraping.config.config_reader import ConfigReader 7 | from scraping.coordinator import ( 8 | CoordinatorConfig, 9 | ScraperConfig, 10 | LabelScrapingConfig, 11 | ) 12 | from scraping.scraper import ScraperId 13 | 14 | 15 | class TestConfigReader(unittest.TestCase): 16 | def test_load_config_valid(self): 17 | """Tests a valid config is loaded correctly.""" 18 | expected_config = CoordinatorConfig( 19 | scraper_configs={ 20 | ScraperId.X_MICROWORLDS: ScraperConfig( 21 | cadence_seconds=300, 22 | labels_to_scrape=[ 23 | 
LabelScrapingConfig( 24 | label_choices=[ 25 | DataLabel(value="#bittensor"), 26 | DataLabel(value="#TAO"), 27 | ], 28 | max_age_hint_minutes=1440, 29 | max_data_entities=100, 30 | ), 31 | LabelScrapingConfig( 32 | max_age_hint_minutes=60 33 | * 24 34 | * constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS, 35 | max_data_entities=500, 36 | ), 37 | ], 38 | ), 39 | ScraperId.REDDIT_LITE: ScraperConfig( 40 | cadence_seconds=900, 41 | labels_to_scrape=[ 42 | LabelScrapingConfig( 43 | label_choices=[ 44 | DataLabel(value="r/bittensor_"), 45 | DataLabel(value="r/bitcoin"), 46 | ], 47 | max_age_hint_minutes=60 48 | * 24 49 | * constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS, 50 | max_data_entities=50, 51 | ), 52 | ], 53 | ), 54 | } 55 | ) 56 | 57 | this_dir = os.path.abspath(os.path.dirname(__file__)) 58 | filepath = os.path.join(this_dir, "valid_config.json") 59 | loaded_config = ConfigReader.load_config(filepath) 60 | 61 | self.assertEqual(loaded_config, expected_config) 62 | 63 | def test_load_config_invalid(self): 64 | """Tests that loading an invalid config raises an exception.""" 65 | this_dir = os.path.abspath(os.path.dirname(__file__)) 66 | filepath = os.path.join(this_dir, "invalid_config.json") 67 | 68 | with self.assertRaises(Exception) as e: 69 | ConfigReader.load_config(filepath) 70 | self.assertIn( 71 | "scraper_id\n value is not a valid enumeration member", str(e.exception) 72 | ) 73 | 74 | def test_load_real_config_valid(self): 75 | this_dir = os.path.abspath(os.path.dirname(__file__)) 76 | filepath = os.path.join( 77 | this_dir, "../../../scraping/config/scraping_config.json" 78 | ) 79 | loaded_config = ConfigReader.load_config(filepath) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /tests/scraping/config/test_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from scraping.config.model import ( 4 | ScraperConfig, 5 | LabelScrapingConfig, 6 | ScrapingConfig, 7 | ) 8 | from scraping.scraper import ScraperId 9 | 10 | 11 | class TestScrapingConfig(unittest.TestCase): 12 | def test_serialization_deserialization(self): 13 | """Verifies a round-trip serialization/deserialization of the ScrapingConfig""" 14 | 15 | config = ScrapingConfig( 16 | scraper_configs=[ 17 | ScraperConfig( 18 | scraper_id=ScraperId.X_MICROWORLDS, 19 | cadence_seconds=300, 20 | labels_to_scrape=[ 21 | LabelScrapingConfig( 22 | label_choices=["#bittensor", "#TAO"], 23 | max_age_hint_minutes=1440, 24 | max_data_entities=100, 25 | ), 26 | LabelScrapingConfig( 27 | max_age_hint_minutes=10080, 28 | max_data_entities=500, 29 | ), 30 | ], 31 | ), 32 | ScraperConfig( 33 | scraper_id=ScraperId.REDDIT_LITE, 34 | cadence_seconds=900, 35 | labels_to_scrape=[ 36 | LabelScrapingConfig( 37 | label_choices=["r/bittensor_"], 38 | max_data_entities=50, 39 | ), 40 | ], 41 | ), 42 | ] 43 | ) 44 | 45 | # Serialize the object to JSON 46 | json_data = config.json() 47 | print(json_data) 48 | 49 | # Deserialize the JSON back to an object 50 | deserialized_config = ScrapingConfig.parse_raw(json_data) 51 | 52 | # Verify the deserialized object is equal to the starting object 53 | self.assertEqual(config, deserialized_config) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /tests/scraping/config/valid_config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "scraper_configs": [ 3 | { 4 | "scraper_id": "X.microworlds", 5 | "cadence_seconds": 300, 6 | "labels_to_scrape": [ 7 | { 8 | "label_choices": [ 9 | "#bittensor", 10 | "#tao" 11 | ], 12 | "max_age_hint_minutes": 1440, 13 | "max_data_entities": 100 14 | }, 15 | { 16 | "max_data_entities": 500 17 | } 18 | ] 19 | }, 20 | { 21 | "scraper_id": "Reddit.lite", 22 | "cadence_seconds": 900, 23 | "labels_to_scrape": [ 24 | { 25 | "label_choices": [ 26 | "r/bittensor_", 27 | "r/bitcoin" 28 | ], 29 | "max_data_entities": 50 30 | } 31 | ] 32 | } 33 | ] 34 | } -------------------------------------------------------------------------------- /tests/scraping/reddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/reddit/__init__.py -------------------------------------------------------------------------------- /tests/scraping/reddit/test_model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import unittest 3 | 4 | from common import constants 5 | from scraping.reddit.model import RedditContent, RedditDataType 6 | 7 | 8 | class TestModel(unittest.TestCase): 9 | def test_label_truncation(self): 10 | """Tests that RedditContents correctly truncate labels to 32 characters when converting to DataEntities""" 11 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 12 | content = RedditContent( 13 | id="postId", 14 | url="https://reddit.com/123", 15 | username="user1", 16 | communityName="r/looooooooooooooooooooooooongSubreddit", 17 | body="Hello world", 18 | createdAt=timestamp, 19 | dataType=RedditDataType.POST, 20 | title="Title text", 21 | ) 22 | entity = RedditContent.to_data_entity(content=content) 23 | 24 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 25 | self.assertEqual(entity.label.value, "r/looooooooooooooooooooooooongsu") 26 | 27 | def test_label_truncation_lower(self): 28 | """Tests truncation of characters that become longer when .lower() is used on them.""" 29 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 30 | content = RedditContent( 31 | id="postId", 32 | url="https://reddit.com/123", 33 | username="user1", 34 | communityName="r/İsrailleTicaretFilistineİhanet", 35 | body="Hello world", 36 | createdAt=timestamp, 37 | dataType=RedditDataType.POST, 38 | title="Title text", 39 | ) 40 | entity = RedditContent.to_data_entity(content=content) 41 | 42 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 43 | self.assertEqual(entity.label.value, "r/i̇srailleticaretfilistinei̇han") 44 | 45 | def test_to_data_entity_obfuscated(self): 46 | timestamp = dt.datetime( 47 | year=2024, 48 | month=3, 49 | day=1, 50 | hour=1, 51 | minute=1, 52 | second=1, 53 | microsecond=1, 54 | tzinfo=dt.timezone.utc, 55 | ) 56 | content = RedditContent( 57 | id="postId", 58 | url="https://reddit.com/123", 59 | username="user1", 60 | communityName="r/bitcoin", 61 | body="Hello world", 62 | createdAt=timestamp, 63 | dataType=RedditDataType.POST, 64 | title="Title text", 65 | ) 66 | 67 | # Convert to entity and back to check granularity of the content timestamp. 
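# to_data_entity obfuscates createdAt to minute granularity, so the roundtripped content is expected to lose seconds and microseconds while the entity keeps the full timestamp.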
68 | entity = RedditContent.to_data_entity(content=content) 69 | content_roundtrip = RedditContent.from_data_entity(entity) 70 | 71 | # The entity datetime should have full granularity but the roundtripped content should not. 72 | self.assertEqual(entity.datetime, timestamp) 73 | self.assertEqual( 74 | content_roundtrip.created_at, 75 | dt.datetime( 76 | year=2024, 77 | month=3, 78 | day=1, 79 | hour=1, 80 | minute=1, 81 | second=0, 82 | microsecond=0, 83 | tzinfo=dt.timezone.utc, 84 | ), 85 | ) 86 | 87 | def test_to_data_entity_content_serialization(self): 88 | """Verifies that the content is serialized correctly when converting to a DataEntity.""" 89 | content = RedditContent( 90 | id="postId", 91 | url="https://reddit.com/123", 92 | username="user1", 93 | communityName="r/bitcoin", 94 | body="Hello world", 95 | createdAt=dt.datetime(2024, 3, 30, 1, 2, 3, tzinfo=dt.timezone.utc), 96 | dataType=RedditDataType.POST, 97 | title="Title text", 98 | ) 99 | 100 | # Convert to entity and back to check granularity of the content timestamp. 101 | entity = RedditContent.to_data_entity(content=content) 102 | 103 | self.assertEqual( 104 | entity.content, 105 | b'{"id": "postId", "url": "https://reddit.com/123", "username": "user1", "communityName": "r/bitcoin", "body": "Hello world", "createdAt": "2024-03-30T01:02:00+00:00", "dataType": "post", "title": "Title text", "parentId": null}', 106 | ) 107 | 108 | 109 | if __name__ == "__main__": 110 | unittest.main() 111 | -------------------------------------------------------------------------------- /tests/scraping/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import datetime as dt 3 | 4 | from scraping import utils 5 | 6 | 7 | class TestUtils(unittest.TestCase): 8 | def test_obfuscate_datetime_to_minute(self): 9 | test_date = dt.datetime( 10 | year=2024, 11 | month=1, 12 | day=2, 13 | hour=3, 14 | minute=4, 15 | second=5, 16 | microsecond=6, 17 | tzinfo=dt.timezone.utc, 18 | ) 19 | 20 | obfuscated_date = utils.obfuscate_datetime_to_minute(test_date) 21 | 22 | self.assertEqual( 23 | obfuscated_date, 24 | dt.datetime( 25 | year=2024, 26 | month=1, 27 | day=2, 28 | hour=3, 29 | minute=4, 30 | second=0, 31 | microsecond=0, 32 | tzinfo=dt.timezone.utc, 33 | ), 34 | ) 35 | -------------------------------------------------------------------------------- /tests/scraping/x/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/x/__init__.py -------------------------------------------------------------------------------- /tests/scraping/x/test_model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import unittest 3 | from common import constants 4 | 5 | from sympy import timed 6 | from scraping.x.model import XContent 7 | 8 | 9 | class TestModel(unittest.TestCase): 10 | def test_equality(self): 11 | """Tests validation of equivalent XContent instances.""" 12 | timestamp = dt.datetime.now() 13 | # Create two XContent instances with the same values 14 | xcontent1 = XContent( 15 | username="user1", 16 | text="Hello world", 17 | url="https://twitter.com/123", 18 | timestamp=timestamp, 19 | tweet_hashtags=["#bittensor", "$TAO"], 20 | ) 21 | xcontent2 = XContent( 22 | username="user1", 23 | text="Hello world", 24 | url="https://twitter.com/123", 25 | timestamp=timestamp, 26 | 
tweet_hashtags=["#bittensor", "$TAO"], 27 | ) 28 | 29 | # Check if the two instances are equivalent 30 | self.assertTrue(xcontent1 == xcontent2) 31 | self.assertTrue(xcontent2 == xcontent1) 32 | 33 | def test_equality_not_equivalent(self): 34 | """Tests validation of non-equivalent XContent instances.""" 35 | timestamp = dt.datetime.now() 36 | content = XContent( 37 | username="user1", 38 | text="Hello world", 39 | url="https://twitter.com/123", 40 | timestamp=timestamp, 41 | tweet_hashtags=["#bittensor", "$TAO"], 42 | ) 43 | 44 | non_matching_content = [ 45 | content.copy(update={"username": "user2"}), 46 | content.copy(update={"text": "Hello world!"}), 47 | content.copy(update={"url": "https://twitter.com/456"}), 48 | content.copy(update={"timestamp": timestamp + dt.timedelta(seconds=1)}), 49 | # Hashtag ordering needs to be deterministic. Verify changing the order of the hashtags makes the content non-equivalent. 50 | content.copy(update={"tweet_hashtags": ["#TAO", "#bittensor"]}), 51 | ] 52 | 53 | for c in non_matching_content: 54 | self.assertFalse(content == c) 55 | self.assertFalse(c == content) 56 | 57 | def test_label_truncation(self): 58 | """Tests that XContents correctly truncate labels to 32 characters when converting to DataEntities""" 59 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 60 | content = XContent( 61 | username="user1", 62 | text="Hello world", 63 | url="https://twitter.com/123", 64 | timestamp=timestamp, 65 | tweet_hashtags=["#loooooooooooooooooooooooonghashtag", "$TAO"], 66 | ) 67 | entity = XContent.to_data_entity(content=content) 68 | 69 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 70 | self.assertEqual(entity.label.value, "#loooooooooooooooooooooooonghash") 71 | 72 | def test_label_truncation_lower(self): 73 | """Tests truncation of characters that become longer when .lower() is used on them.""" 74 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 75 | content = XContent( 76 | username="user1", 77 | text="Hello world", 78 | url="https://twitter.com/123", 79 | timestamp=timestamp, 80 | tweet_hashtags=["#İsrailleTicaretFilistineİhanet", "$TAO"], 81 | ) 82 | entity = XContent.to_data_entity(content=content) 83 | 84 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 85 | self.assertEqual(entity.label.value, "#i̇srailleticaretfilistinei̇hane") 86 | 87 | def test_to_data_entity_obfuscated(self): 88 | timestamp = dt.datetime( 89 | year=2024, 90 | month=3, 91 | day=1, 92 | hour=1, 93 | minute=1, 94 | second=1, 95 | microsecond=1, 96 | tzinfo=dt.timezone.utc, 97 | ) 98 | content = XContent( 99 | username="user1", 100 | text="Hello world", 101 | url="https://twitter.com/123", 102 | timestamp=timestamp, 103 | tweet_hashtags=["#bittensor", "$TAO"], 104 | ) 105 | 106 | # Convert to entity and back to check granularity of the content timestamp. 107 | entity = XContent.to_data_entity(content=content) 108 | content_roundtrip = XContent.from_data_entity(entity) 109 | 110 | # The entity datetime should have full granularity but the roundtripped content should not. 
111 | self.assertEqual(entity.datetime, timestamp) 112 | self.assertEqual( 113 | content_roundtrip.timestamp, 114 | dt.datetime( 115 | year=2024, 116 | month=3, 117 | day=1, 118 | hour=1, 119 | minute=1, 120 | second=0, 121 | microsecond=0, 122 | tzinfo=dt.timezone.utc, 123 | ), 124 | ) 125 | 126 | def test_to_data_entity_content_serialization(self): 127 | """Verifies that the content is serialized correctly when converting to a DataEntity.""" 128 | content = XContent( 129 | username="user1", 130 | text="Hello world", 131 | url="https://twitter.com/123", 132 | timestamp=dt.datetime(2024, 3, 30, 1, 2, 3, tzinfo=dt.timezone.utc), 133 | tweet_hashtags=["#bittensor", "$TAO"], 134 | ) 135 | 136 | # Convert to entity and back to check granularity of the content timestamp. 137 | entity = XContent.to_data_entity(content=content) 138 | 139 | # The content should not contain the model_config field. 140 | self.assertEqual( 141 | entity.content, 142 | b'{"username": "user1", "text": "Hello world", "url": "https://twitter.com/123", "timestamp": "2024-03-30T01:02:00+00:00", "tweet_hashtags": ["#bittensor", "$TAO"]}', 143 | ) 144 | 145 | 146 | if __name__ == "__main__": 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /tests/storage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/storage/__init__.py -------------------------------------------------------------------------------- /tests/storage/miner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/storage/miner/__init__.py -------------------------------------------------------------------------------- /tests/storage/validator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/storage/validator/__init__.py -------------------------------------------------------------------------------- /tests/test_all.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | def create_test_suite(): 5 | test_suite = unittest.TestSuite() 6 | 7 | # Find all tests in the current directory and subdirectories 8 | loader = unittest.TestLoader() 9 | suite = loader.discover("./tests", pattern="test_*.py") 10 | print(loader.errors) 11 | 12 | # Add the discovered tests to the test suite 13 | test_suite.addTest(suite) 14 | 15 | return test_suite 16 | 17 | 18 | # TODO: Fix this. 19 | # The tests fail because of a ModuleNotFoundError. 
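# Possible workaround (editor's assumption, untested): the ModuleNotFoundError typically means the
# repository root is not on sys.path, so packages such as `common` and `scraping` cannot be imported.
# Running discovery from the repo root, e.g.
#   python -m unittest discover ./tests -p "test_*.py"
# or exporting PYTHONPATH to the repo root before invoking this script, may resolve it.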
20 | if __name__ == "__main__": 21 | suite = create_test_suite() 22 | 23 | # Run the tests 24 | runner = unittest.TextTestRunner() 25 | result = runner.run(suite) 26 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Any, Callable, Iterable, Tuple 3 | import time 4 | import datetime as dt 5 | 6 | from common.data import ( 7 | CompressedMinerIndex, 8 | DataSource, 9 | ) 10 | from common.data_v2 import ScorableDataEntityBucket, ScorableMinerIndex 11 | 12 | 13 | def get_only_element_matching_filter( 14 | iterable: Iterable[Any], filter: Callable[[Any], bool] 15 | ) -> Any: 16 | """Returns the only element in the iterable that matches the filter, or raises an exception if there are zero or more than one elements.""" 17 | results = [x for x in iterable if filter(x)] 18 | if len(results) != 1: 19 | raise Exception( 20 | f"Expected exactly one element matching filter, but found {len(results)}" 21 | ) 22 | return results[0] 23 | 24 | 25 | def wait_for_condition(condition: Callable[[], bool], timeout: float = 10.0): 26 | """Waits until the provided condition is true, or until the timeout is reached.""" 27 | start_time = time.time() 28 | while not condition(): 29 | if time.time() - start_time > timeout: 30 | raise Exception("Timed out waiting for condition to be true.") 31 | time.sleep(0.1) 32 | 33 | 34 | def convert_compressed_index_to_scorable_miner_index( 35 | index: CompressedMinerIndex, last_updated: dt.datetime 36 | ) -> ScorableMinerIndex: 37 | """Converts a CompressedMinerIndex to a ScorableMinerIndex, assuming size_bytes are fully scorable.""" 38 | 39 | return ScorableMinerIndex( 40 | scorable_data_entity_buckets=[ 41 | ScorableDataEntityBucket( 42 | time_bucket_id=time_bucket_id, 43 | source=source, 44 | label=bucket.label, 45 | size_bytes=size_bytes, 46 | scorable_bytes=size_bytes, 47 | ) 48 | for source in index.sources 49 | for bucket in index.sources[source] 50 | for time_bucket_id, size_bytes in zip( 51 | bucket.time_bucket_ids, bucket.sizes_bytes 52 | ) 53 | ], 54 | last_updated=last_updated, 55 | ) 56 | 57 | 58 | def are_scorable_indexes_equal( 59 | index1: ScorableMinerIndex, index2: ScorableMinerIndex 60 | ) -> Tuple[bool, str]: 61 | """Compares two ScorableMinerIndex instances for equality.""" 62 | 63 | # Compare the last_updated fields. 64 | if index1.last_updated != index2.last_updated: 65 | return ( 66 | False, 67 | f"last_updated fields do not match. {index1.last_updated} != {index2.last_updated}", 68 | ) 69 | 70 | def sort_key(bucket: ScorableDataEntityBucket): 71 | return ( 72 | bucket.time_bucket_id, 73 | bucket.source, 74 | bucket.label if bucket.label else "NULL", 75 | ) 76 | 77 | index1_sorted = sorted(index1.scorable_data_entity_buckets, key=sort_key) 78 | index2_sorted = sorted(index2.scorable_data_entity_buckets, key=sort_key) 79 | for bucket1, bucket2 in zip(index1_sorted, index2_sorted): 80 | if bucket1 != bucket2: 81 | return ( 82 | False, 83 | f"Buckets do not match. {bucket1} != {bucket2}", 84 | ) 85 | 86 | return True, None 87 | 88 | 89 | def are_compressed_indexes_equal( 90 | index1: CompressedMinerIndex, index2: CompressedMinerIndex 91 | ) -> bool: 92 | """Compares two CompressedMinerIndex instances for equality.""" 93 | 94 | # Iterate both indexes, in order of sources. 
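# Editor's note: zip() stops at the shorter of the two source lists, so an index that contains extra
# sources would still compare as equal here. A stricter variant (suggestion, not part of the original
# helper) could first check: if set(index1.sources) != set(index2.sources): return False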
95 | for source1, source2 in zip(sorted(index1.sources), sorted(index2.sources)): 96 | if source1 != source2: 97 | print(f"Sources do not match. {source1} != {source2}") 98 | return False 99 | 100 | # For a given source, compare the buckets. 101 | buckets1 = sorted( 102 | index1.sources[source1], key=lambda b: b.label if b.label else "NULL" 103 | ) 104 | buckets2 = sorted( 105 | index2.sources[source2], key=lambda b: b.label if b.label else "NULL" 106 | ) 107 | if buckets1 != buckets2: 108 | print(f"Buckets do not match. {buckets1} != {buckets2}") 109 | return False 110 | 111 | return True 112 | 113 | 114 | def create_scorable_index(num_buckets: int) -> ScorableMinerIndex: 115 | """Creates a ScorableMinerIndex with ~ the specified number of buckets.""" 116 | assert num_buckets > 1000 117 | 118 | labels = [f"label{i}" for i in range(num_buckets // 2 // 500)] 119 | time_buckets = [i for i in range(1, (num_buckets // 2 // len(labels)) + 1)] 120 | 121 | # Split the buckets equally between the Reddit and X sources, reusing the same labels and time buckets for each. 122 | buckets = [] 123 | for source in [DataSource.REDDIT.value, DataSource.X.value]: 124 | for time_bucket in time_buckets: 125 | for label in labels: 126 | size = random.randint(50, 1000) 127 | scorable_bytes = int(random.random() * size) 128 | buckets.append( 129 | ScorableDataEntityBucket( 130 | time_bucket_id=time_bucket, 131 | source=source, 132 | label=label, 133 | size_bytes=size, 134 | scorable_bytes=scorable_bytes, 135 | ) 136 | ) 137 | return ScorableMinerIndex( 138 | scorable_data_entity_buckets=buckets, last_updated=dt.datetime.now() 139 | ) 140 | -------------------------------------------------------------------------------- /tests/vali_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/vali_utils/__init__.py -------------------------------------------------------------------------------- /tests/vali_utils/test_miner_iterator.py: -------------------------------------------------------------------------------- 1 | from vali_utils.miner_iterator import MinerIterator 2 | import unittest 3 | 4 | 5 | class TestMinerIterator(unittest.TestCase): 6 | def test_miner_uids_are_sorted(self): 7 | """Creates a MinerIterator with unsorted miner UIDs and verifies that the miner UIDs are sorted.""" 8 | uids = [2, 5, 1, 0] 9 | iterator = MinerIterator(uids) 10 | 11 | # The iterator starts at a random position. Move it until we're pointing to 0. 12 | while iterator.peek() != 0: 13 | next(iterator) 14 | 15 | # Now verify the UIDs are iterated in sorted order.
16 | iterated_uids = [next(iterator) for _ in range(len(uids))] 17 | self.assertEqual(iterated_uids, sorted(uids)) 18 | 19 | def test_iterator_is_infinite(self): 20 | """Creates a MinerIterator and verifies calling it more times than the number of miner UIDs cycles the UIDs.""" 21 | uids = [3, 2, 1] 22 | expected = [1, 2, 3] * 10 23 | iterator = MinerIterator(uids) 24 | iterated_uids = [next(iterator) for _ in range(30)] 25 | self.assertEqual(sorted(iterated_uids), sorted(expected)) 26 | 27 | def test_peek(self): 28 | """Creates a MinerIterator and verifies that peek returns the next UID without advancing the iterator.""" 29 | uids = [1, 2, 3] 30 | iterator = MinerIterator(uids) 31 | 32 | peeked = iterator.peek() 33 | self.assertEqual(peeked, iterator.peek()) 34 | self.assertEqual(peeked, next(iterator)) 35 | self.assertNotEqual(peeked, iterator.peek()) 36 | 37 | def test_set_miner_uids(self): 38 | """Verifies the iterator position is maintained when the miner UIDs are updated.""" 39 | initial_miner_uids = [1, 2, 3, 4, 5] 40 | iterator = MinerIterator(initial_miner_uids) 41 | 42 | # Advance the iterator so it should now point to 3 43 | # The iterator starts at a random position. Advance it until it returns 2. 44 | while next(iterator) != 2: 45 | pass 46 | 47 | iterator.set_miner_uids([1, 4, 6]) 48 | 49 | # Verify the iterator picks up from the next UID greater than or equal to 3. 50 | self.assertEqual(next(iterator), 4) 51 | self.assertEqual(next(iterator), 6) 52 | self.assertEqual(next(iterator), 1) 53 | 54 | def test_set_miner_uids_edge_case(self): 55 | """Verifies the iterator position is reset when the miner UIDs are updated and the current position is no longer valid.""" 56 | # Create a MinerIterator with initial miner UIDs 57 | initial_miner_uids = [1, 2, 3, 4, 5] 58 | iterator = MinerIterator(initial_miner_uids) 59 | 60 | # Advance the iterator so it should now point to 5 61 | while iterator.peek() != 5: 62 | next(iterator) 63 | 64 | iterator.set_miner_uids([1, 2, 3, 4]) 65 | 66 | self.assertEqual(next(iterator), 1) 67 | self.assertEqual(next(iterator), 2) 68 | self.assertEqual(next(iterator), 3) 69 | self.assertEqual(next(iterator), 4) 70 | 71 | 72 | if __name__ == "__main__": 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /tests/vali_utils/test_validator_s3_access.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import sys 4 | import bittensor as bt 5 | from pathlib import Path 6 | import json 7 | from vali_utils.validator_s3_access import ValidatorS3Access 8 | 9 | 10 | def main(): 11 | # Parse arguments 12 | parser = argparse.ArgumentParser(description="Test S3 access for validators") 13 | parser.add_argument("--wallet", type=str, required=True, help="Wallet name") 14 | parser.add_argument("--hotkey", type=str, required=True, help="Hotkey name") 15 | parser.add_argument("--s3_auth_url", type=str, default="https://sn13-data.api.macrocosmos.ai", 16 | help="S3 authentication URL") 17 | parser.add_argument("--netuid", type=int, default=13, help="Network UID") 18 | parser.add_argument("--network", type=str, default="finney", help="Network name") 19 | parser.add_argument("--action", type=str, choices=['auth', 'list_sources', 'list_miners', 'list_files'], 20 | default='auth', help="Action to perform") 21 | parser.add_argument("--source", type=str, help="Data source (x or reddit)") 22 | parser.add_argument("--miner", type=str, help="Miner ID 
(coldkey)") 23 | 24 | args = parser.parse_args() 25 | 26 | # Create config 27 | config = bt.config() 28 | config.netuid = args.netuid 29 | config.s3_auth_url = args.s3_auth_url 30 | 31 | # Create wallet and S3 access 32 | wallet = bt.wallet(name=args.wallet, hotkey=args.hotkey) 33 | s3_access = ValidatorS3Access( 34 | wallet=wallet, 35 | s3_auth_url=args.s3_auth_url 36 | ) 37 | 38 | # Perform requested action 39 | if args.action == 'auth': 40 | # Test authentication 41 | if s3_access.ensure_access(): 42 | print("✅ Authentication successful") 43 | print(f"Access data received:") 44 | 45 | # Print readable summary of access data 46 | access_data = s3_access.access_data 47 | print(f" Bucket: {access_data.get('bucket')}") 48 | print(f" Region: {access_data.get('region')}") 49 | print(f" Expiry: {access_data.get('expiry')}") 50 | 51 | # Print URLs structure 52 | urls = access_data.get('urls', {}) 53 | sources = urls.get('sources', {}) 54 | print(f" Available sources: {list(sources.keys())}") 55 | 56 | return 0 57 | else: 58 | print("❌ Authentication failed") 59 | return 1 60 | 61 | elif args.action == 'list_sources': 62 | # List available sources 63 | sources = s3_access.list_sources() 64 | if sources: 65 | print(f"✅ Available sources: {sources}") 66 | return 0 67 | else: 68 | print("❌ Failed to list sources or none available") 69 | return 1 70 | 71 | elif args.action == 'list_miners': 72 | # List miners for a source 73 | if not args.source: 74 | print("❌ --source is required for list_miners action") 75 | return 1 76 | 77 | miners = s3_access.list_miners(args.source) 78 | if miners: 79 | print(f"✅ Found {len(miners)} miners for source {args.source}:") 80 | for m in miners[:20]: # Show first 20 81 | print(f" - {m}") 82 | if len(miners) > 20: 83 | print(f" ... and {len(miners) - 20} more") 84 | return 0 85 | else: 86 | print(f"❌ No miners found for source {args.source} or listing failed") 87 | return 1 88 | 89 | elif args.action == 'list_files': 90 | # List files for a miner 91 | if not args.source or not args.miner: 92 | print("❌ --source and --miner are required for list_files action") 93 | return 1 94 | 95 | files = s3_access.list_files(args.source, args.miner) 96 | if files: 97 | print(f"✅ Found {len(files)} files for miner {args.miner} in source {args.source}:") 98 | for i, f in enumerate(files[:10]): # Show first 10 99 | print(f" {i + 1}. {f['filename']} ({f['size']} bytes, modified: {f['last_modified']})") 100 | if len(files) > 10: 101 | print(f" ... 
and {len(files) - 10} more") 102 | return 0 103 | else: 104 | print(f"❌ No files found for miner {args.miner} or listing failed") 105 | return 1 106 | 107 | return 0 108 | 109 | 110 | if __name__ == "__main__": 111 | sys.exit(main()) -------------------------------------------------------------------------------- /vali_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/vali_utils/__init__.py -------------------------------------------------------------------------------- /vali_utils/api/auth/key_routes.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends 2 | from .auth import require_master_key, key_manager 3 | from pydantic import BaseModel 4 | from typing import List 5 | from vali_utils.api.utils import endpoint_error_handler 6 | 7 | class APIKeyCreate(BaseModel): 8 | name: str 9 | 10 | 11 | class APIKeyResponse(BaseModel): 12 | key: str 13 | name: str 14 | 15 | 16 | router = APIRouter(tags=["key management"]) 17 | 18 | 19 | @router.post("", response_model=APIKeyResponse) 20 | @endpoint_error_handler 21 | async def create_api_key( 22 | request: APIKeyCreate, 23 | _: bool = Depends(require_master_key) 24 | ): 25 | """Create new API key (requires master key)""" 26 | key = key_manager.create_api_key(request.name) 27 | return {"key": key, "name": request.name} 28 | 29 | 30 | @router.get("") 31 | @endpoint_error_handler 32 | async def list_api_keys(_: bool = Depends(require_master_key)): 33 | """List all API keys (requires master key)""" 34 | return {"keys": key_manager.list_api_keys()} 35 | 36 | 37 | @router.post("/{key}/deactivate") 38 | @endpoint_error_handler 39 | async def deactivate_api_key( 40 | key: str, 41 | _: bool = Depends(require_master_key) 42 | ): 43 | """Deactivate an API key (requires master key)""" 44 | key_manager.deactivate_api_key(key) 45 | return {"status": "success"} -------------------------------------------------------------------------------- /vali_utils/api/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field, field_validator 2 | from typing import List, Optional, Dict, Any 3 | import datetime as dt 4 | from common.data import DataSource, StrictBaseModel 5 | 6 | 7 | class DesirabilityRequest(BaseModel): 8 | desirabilities: List[Dict[str, Any]] = Field( 9 | description="List of source items with label weights" 10 | ) 11 | 12 | 13 | class QueryRequest(StrictBaseModel): 14 | """Request model for data queries""" 15 | source: str = Field( 16 | ..., # Required field 17 | description="Data source (x or reddit)" 18 | ) 19 | usernames: List[str] = Field( 20 | default_factory=list, 21 | description="List of usernames to fetch data from", 22 | max_length=10 23 | ) 24 | keywords: List[str] = Field( 25 | default_factory=list, 26 | description="List of keywords to search for", 27 | max_length=5 28 | ) 29 | # Change to optional strings for ISO format 30 | start_date: Optional[str] = Field( 31 | default=None, 32 | description="Start date (ISO format)" 33 | ) 34 | end_date: Optional[str] = Field( 35 | default=None, 36 | description="End date (ISO format)" 37 | ) 38 | limit: int = Field( 39 | default=100, 40 | ge=1, 41 | le=1000, 42 | description="Maximum number of items to return" 43 | ) 44 | 45 | @field_validator('source') 46 | @classmethod 47 | def validate_source(cls, v: str) 
-> str: 48 | try: 49 | source = DataSource[v.upper()] 50 | if source.weight == 0: # Check if it's an active source 51 | raise ValueError(f"Source {v} is not currently active") 52 | return v.upper() # Return uppercase to match enum 53 | except KeyError: 54 | valid_sources = [s.name.lower() for s in DataSource if s.weight > 0] 55 | raise ValueError(f"Invalid source. Must be one of: {valid_sources}") 56 | 57 | 58 | class QueryResponse(StrictBaseModel): 59 | """Response model for data queries""" 60 | status: str = Field(description="Request status (success/error)") 61 | data: List[Dict[str, Any]] = Field(default_factory=list) 62 | meta: Dict[str, Any] = Field( 63 | default_factory=dict, 64 | description="Additional metadata about the request" 65 | ) 66 | 67 | 68 | class DataItem(StrictBaseModel): 69 | """Single data item in response""" 70 | content: bytes 71 | datetime: dt.datetime 72 | uri: str 73 | source: DataSource 74 | label: Optional[str] = None 75 | 76 | 77 | class HfReposResponse(BaseModel): 78 | count: int 79 | repo_names: List[str] 80 | 81 | 82 | class HealthResponse(StrictBaseModel): 83 | """Response model for health check""" 84 | status: str = Field(description="Service status") 85 | timestamp: dt.datetime = Field(description="Current UTC timestamp") 86 | miners_available: int = Field(description="Number of available miners") 87 | version: str = Field(default="1.0.0", description="API version") 88 | netuid: int = Field(description="Network UID") 89 | hotkey: str = Field(description="Validator hotkey address") 90 | 91 | 92 | class MinerInfo(BaseModel): 93 | """Information about a miner's current data""" 94 | hotkey: str 95 | credibility: float 96 | bucket_count: int 97 | content_size_bytes_reddit: int 98 | content_size_bytes_twitter: int 99 | last_updated: dt.datetime 100 | 101 | 102 | class LabelSize(BaseModel): 103 | """Content size information for a specific label""" 104 | label_value: str 105 | content_size_bytes: int 106 | adj_content_size_bytes: int 107 | 108 | 109 | class AgeSize(BaseModel): 110 | """Content size information for a specific time bucket""" 111 | time_bucket_id: int 112 | content_size_bytes: int 113 | adj_content_size_bytes: int 114 | 115 | 116 | class LabelBytes(BaseModel): 117 | """Byte size information for a particular label""" 118 | label: str 119 | total_bytes: int 120 | adj_total_bytes: float 121 | -------------------------------------------------------------------------------- /vali_utils/api/server.py: -------------------------------------------------------------------------------- 1 | import time 2 | from fastapi import FastAPI, Depends, HTTPException, Request 3 | from fastapi.middleware.cors import CORSMiddleware 4 | from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html 5 | import uvicorn 6 | from threading import Thread 7 | import bittensor as bt 8 | from typing import Optional 9 | from .routes import router, get_validator 10 | from vali_utils.api.auth.key_routes import router as key_router 11 | from vali_utils.api.auth.auth import APIKeyManager, key_manager, require_master_key 12 | from vali_utils.api.utils import endpoint_error_handler 13 | 14 | 15 | class ValidatorAPI: 16 | """API server for validator on-demand queries""" 17 | 18 | def __init__(self, validator, port: int = 8000): 19 | """ 20 | Initialize API server 21 | 22 | Args: 23 | validator: Validator instance 24 | port: Port number to run API on 25 | """ 26 | self.validator = validator 27 | self.port = port 28 | self.key_manager = key_manager 29 | self.app = self._create_app() 
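# Note: key_manager backs both API-key authentication and the rate-limit headers added by the
# middleware in _create_app(); the uvicorn server itself is only launched by start(), on a daemon
# thread bound to 0.0.0.0 at the configured port.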
30 | self.server_thread: Optional[Thread] = None 31 | 32 | def _create_app(self) -> FastAPI: 33 | """Create and configure FastAPI application""" 34 | app = FastAPI( 35 | title="Data Universe Validator API", 36 | description="API for on-demand data queries from the Data Universe network", 37 | version="1.0.0", 38 | docs_url=None, # Disable default docs routes 39 | ) 40 | 41 | # Add CORS middleware 42 | app.add_middleware( 43 | CORSMiddleware, 44 | allow_origins=["*"], 45 | allow_credentials=True, 46 | allow_methods=["*"], 47 | allow_headers=["*"], 48 | ) 49 | 50 | # Protected Swagger UI docs endpoint 51 | @app.get("/docs", include_in_schema=False) 52 | async def get_docs(_: bool = Depends(require_master_key)): 53 | return get_swagger_ui_html( 54 | openapi_url="/openapi.json", 55 | title="API Documentation" 56 | ) 57 | 58 | # Protected ReDoc docs endpoint using default styling 59 | @app.get("/redoc", include_in_schema=False) 60 | async def get_redoc(_: bool = Depends(require_master_key)): 61 | return get_redoc_html( 62 | openapi_url="/openapi.json", 63 | title="API Documentation" 64 | ) 65 | 66 | # Protected OpenAPI JSON schema endpoint 67 | @app.get("/openapi.json", include_in_schema=False) 68 | @endpoint_error_handler 69 | async def openapi_schema(_: bool = Depends(require_master_key)): 70 | try: 71 | if not app.openapi_schema: 72 | from fastapi.openapi.utils import get_openapi 73 | app.openapi_schema = get_openapi( 74 | title=app.title, 75 | version=app.version, 76 | description=app.description, 77 | routes=app.routes, 78 | ) 79 | # Remove sensitive security information if needed 80 | for path in app.openapi_schema.get("paths", {}).values(): 81 | for operation in path.values(): 82 | if "security" in operation: 83 | del operation["security"] 84 | return app.openapi_schema 85 | except Exception as e: 86 | bt.logging.error(f"Failed to generate OpenAPI schema: {str(e)}") 87 | raise HTTPException(status_code=500, detail="Could not generate API documentation") 88 | 89 | # Rate limit headers middleware 90 | @app.middleware("http") 91 | async def add_rate_limit_headers(request: Request, call_next): 92 | response = await call_next(request) 93 | api_key = request.headers.get("X-API-Key") 94 | if api_key and self.key_manager.is_valid_key(api_key): 95 | _, headers = self.key_manager.check_rate_limit(api_key) 96 | for header_name, header_value in headers.items(): 97 | response.headers[header_name] = header_value 98 | return response 99 | 100 | # Set validator instance for dependency injection 101 | get_validator.api = self 102 | 103 | # Include API routes 104 | app.include_router(router, prefix="/api/v1") 105 | app.include_router(key_router, prefix="/api/v1/keys") 106 | 107 | return app 108 | 109 | def start(self): 110 | """Start API server with better error handling""" 111 | if self.server_thread and self.server_thread.is_alive(): 112 | bt.logging.warning("API server already running") 113 | return 114 | 115 | def run_server(): 116 | try: 117 | bt.logging.info(f"Starting API server on port {self.port}") 118 | uvicorn.run( 119 | self.app, 120 | host="0.0.0.0", 121 | port=self.port, 122 | log_level="info" 123 | ) 124 | except Exception as e: 125 | bt.logging.error(f"API server error: {str(e)}") 126 | 127 | self.server_thread = Thread(target=run_server, daemon=True) 128 | self.server_thread.start() 129 | bt.logging.success(f"API server started on port {self.port}") 130 | 131 | def stop(self): 132 | """Stop API server""" 133 | bt.logging.info("Stopping API server") 134 | # The uvicorn server will stop 
when the thread is terminated 135 | self.server_thread = None # Allow for garbage collection 136 | 137 | def restart(self): 138 | """Restart API server""" 139 | bt.logging.info("Restarting API server") 140 | self.stop() 141 | time.sleep(2) # Give it a moment to fully stop 142 | self.start() 143 | def stop(self): 144 | """Stop API server""" 145 | if self.server_thread and self.server_thread.is_alive(): 146 | self.server_thread.join(timeout=5) 147 | bt.logging.info("API server stopped") -------------------------------------------------------------------------------- /vali_utils/api/utils.py: -------------------------------------------------------------------------------- 1 | import bittensor as bt 2 | import random 3 | from fastapi import HTTPException 4 | from functools import wraps 5 | from common.organic_protocol import OrganicRequest 6 | 7 | def select_validation_samples(data, sample_size: int = 1): 8 | """Select random samples from the data for validation""" 9 | if not data: 10 | return [] 11 | 12 | # Select up to sample_size random items, or all items if less than sample_size 13 | sample_count = min(sample_size, len(data)) 14 | return random.sample(data, sample_count) 15 | 16 | 17 | def endpoint_error_handler(func): 18 | """Return 500 status code if endpoint failed""" 19 | @wraps(func) 20 | async def wrapper(*args, **kwargs): 21 | try: 22 | return await func(*args, **kwargs) 23 | except HTTPException: 24 | # Re-raise FastAPI HTTP exceptions 25 | raise 26 | except Exception as e: 27 | bt.logging.error(f"API endpoint error: {str(e)}") 28 | raise HTTPException( 29 | status_code=500, 30 | detail="Internal server error" 31 | ) 32 | return wrapper 33 | 34 | async def query_validator( 35 | wallet: bt.wallet, 36 | validator_host: str, 37 | validator_port: int, 38 | validator_hotkey: str, 39 | source: str, 40 | keywords: list = [], 41 | usernames: list = [], 42 | start_date: str = None, 43 | end_date: str = None, 44 | limit: int = 1000 45 | ): 46 | """ 47 | Query a validator using the OrganicRequest protocol 48 | 49 | Args: 50 | wallet: Bittensor wallet for signing the request 51 | validator_host: Validator IP address or hostname 52 | validator_port: Validator port number 53 | validator_hotkey: Validator hotkey (str) 54 | source: Data source (X or REDDIT) 55 | keywords: List of keywords to search for 56 | usernames: List of usernames to search for 57 | start_date: ISO-formatted start date 58 | end_date: ISO-formatted end date 59 | limit: Maximum number of results to return 60 | 61 | Returns: 62 | OrganicRequest response with data or error information 63 | """ 64 | bt.logging.info(f"Querying validator at {validator_host}:{validator_port} for {source} data") 65 | 66 | # Create an AxonInfo with required fields 67 | axon_info = bt.AxonInfo( 68 | ip=validator_host, 69 | port=validator_port, 70 | ip_type=0, # v4 71 | hotkey=validator_hotkey, 72 | coldkey="", # Not needed 73 | protocol=0, 74 | version=1 75 | ) 76 | 77 | # Prepare the OrganicRequest synapse 78 | synapse = OrganicRequest( 79 | source=source.upper(), 80 | usernames=usernames, 81 | keywords=keywords, 82 | start_date=start_date, 83 | end_date=end_date, 84 | limit=limit 85 | ) 86 | 87 | # Send the request to the validator 88 | try: 89 | async with bt.dendrite(wallet=wallet) as dendrite: 90 | response = await dendrite.forward( 91 | axons=[axon_info], 92 | synapse=synapse, 93 | timeout=180 # 3 minute timeout 94 | ) 95 | 96 | if not response or len(response) == 0: 97 | bt.logging.error("No response received from validator") 98 | return None 99 
| 100 | return response[0] 101 | except Exception as e: 102 | bt.logging.error(f"Error querying validator at {validator_host}:{validator_port}: {str(e)}") 103 | raise 104 | -------------------------------------------------------------------------------- /vali_utils/load_balancer/validator_registry.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | from collections import deque 4 | from typing import List, Tuple, Optional, ClassVar 5 | import bittensor as bt 6 | import numpy as np 7 | from pydantic import BaseModel, Field, model_validator 8 | from common.data import DataSource 9 | from common.organic_protocol import OrganicRequest 10 | 11 | class Validator(BaseModel): 12 | uid: int 13 | stake: float 14 | axon: str 15 | hotkey: str 16 | timeout: int = 1 # starting cooldown in seconds; doubles on failure (capped at 86400) 17 | available_at: float = 0.0 # Unix timestamp indicating when the validator is next available 18 | 19 | def update_failure(self, status: str) -> int: 20 | """ 21 | Update the validator's timeout based on failure status. 22 | """ 23 | current_time = time.time() 24 | if status != "error": 25 | self.timeout = 1 26 | self.available_at = current_time 27 | else: 28 | self.timeout = min(self.timeout * 4, 86400) 29 | self.available_at = current_time + self.timeout 30 | 31 | def is_available(self): 32 | """ 33 | Check if the validator is available based on its cooldown. 34 | """ 35 | return time.time() >= self.available_at 36 | 37 | 38 | class ValidatorRegistry(BaseModel): 39 | """ 40 | Class to store the success of forwards to validator axons. 41 | Validators that routinely fail to respond to requests are timed out. 42 | """ 43 | 44 | # Using a default factory ensures validators is always a dict. 45 | validators: dict[int, Validator] = Field(default_factory=dict) 46 | current_index: int = Field(default=0) 47 | 48 | def __init__(self, metagraph: bt.metagraph = None, organic_whitelist: List[str] = None, **data): 49 | super().__init__(**data) 50 | # Initialize with empty dict first 51 | self.validators = {} 52 | 53 | # If metagraph is provided, create validator list immediately 54 | if metagraph is not None: 55 | organic_whitelist = organic_whitelist or [] 56 | validator_uids = np.where(metagraph.stake >= 50_000)[0].tolist() 57 | validator_axons = [metagraph.axons[uid].ip_str().split("/")[2] for uid in validator_uids] 58 | validator_stakes = [metagraph.stake[uid] for uid in validator_uids] 59 | validator_hotkeys = [metagraph.hotkeys[uid] for uid in validator_uids] 60 | self.validators = { 61 | uid: Validator(uid=uid, stake=stake, axon=axon, hotkey=hotkey) 62 | for uid, stake, axon, hotkey in zip(validator_uids, validator_stakes, validator_axons, validator_hotkeys) 63 | if hotkey in organic_whitelist 64 | } 65 | bt.logging.info(f"Validator registry for organics: {self.validators}") 66 | 67 | def get_available_validators(self) -> List[int]: 68 | """ 69 | Get a list of available validators, starting from the current index for cycling. 
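        Validators still on cooldown (see Validator.update_failure) are excluded, and the result is
        rotated so that repeated calls walk through the available validators round-robin rather than
        always starting from the lowest UID.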
70 | """ 71 | available = [uid for uid, validator in self.validators.items() if validator.is_available()] 72 | 73 | if not available: 74 | return [] 75 | available.sort() 76 | 77 | # Reorder the list to start from current_index for cycling 78 | if self.current_index >= len(available): 79 | self.current_index = 0 80 | 81 | # If current_index points to a validator that's no longer available, 82 | # just start from the beginning 83 | if self.current_index >= len(available): 84 | ordered_validators = available 85 | else: 86 | # Start the list from current_index 87 | ordered_validators = available[self.current_index:] + available[:self.current_index] 88 | self.current_index = (self.current_index + 1) % max(1, len(available)) 89 | 90 | return ordered_validators 91 | 92 | def update_validators(self, uid: int, response_code: int) -> None: 93 | """ 94 | Update a specific validator's failure count based on the response code. 95 | If the validator's failure count exceeds the maximum allowed failures, 96 | the validator is removed from the registry. 97 | """ 98 | if uid in self.validators: 99 | self.validators[uid].update_failure(response_code) -------------------------------------------------------------------------------- /vali_utils/miner_iterator.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import copy 3 | import threading 4 | from typing import List 5 | 6 | import random 7 | 8 | 9 | class MinerIterator: 10 | """A thread safe infinite iterator to cyclically enumerate the current set of miner UIDs. 11 | 12 | Why? To perform miner evaluations, the validator will enumerate through the miners in order to help ensure 13 | each miner is evaluated at least once per epoch. 14 | """ 15 | 16 | def __init__(self, miner_uids: List[int]): 17 | self.miner_uids = sorted(copy.deepcopy(miner_uids)) 18 | # Start the index at a random position. This helps ensure that miners with high UIDs aren't penalized if 19 | # the validator restarts frequently. 20 | self.index = random.randint(0, len(self.miner_uids) - 1) 21 | self.lock = threading.Lock() 22 | 23 | def __iter__(self): 24 | return self 25 | 26 | def __next__(self) -> int: 27 | with self.lock: 28 | if len(self.miner_uids) == 0: 29 | # This iterator should be infinite. If there are no miner UIDs, raise an error. 30 | raise IndexError("No miner UIDs.") 31 | 32 | uid = self.miner_uids[self.index] 33 | self.index += 1 34 | if self.index >= len(self.miner_uids): 35 | self.index = 0 36 | return uid 37 | 38 | def peek(self) -> int: 39 | """Returns the next miner UID without advancing the iterator.""" 40 | with self.lock: 41 | if len(self.miner_uids) == 0: 42 | # This iterator should be infinite. If there are no miner UIDs, raise an error. 43 | raise IndexError("No miner UIDs.") 44 | 45 | return self.miner_uids[self.index] 46 | 47 | def set_miner_uids(self, miner_uids: List[int]): 48 | """Updates the miner UIDs to iterate. 49 | 50 | The iterator will be updated to the first miner uid that is greater than or equal to UID that would be next 51 | returned by the iterator. This helps ensure that frequent updates to the miner_uids does not cause too much 52 | churn in the sequence of UIDs returned by the iterator. 
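        For example (mirroring test_set_miner_uids): if the iterator would next return 3 and the UIDs
        are updated to [1, 4, 6], the subsequent values yielded are 4, 6, 1, ...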
53 | """ 54 | sorted_uids = sorted(copy.deepcopy(miner_uids)) 55 | with self.lock: 56 | next_uid = self.miner_uids[self.index] 57 | new_index = bisect.bisect_left(sorted_uids, next_uid) 58 | if new_index >= len(sorted_uids): 59 | new_index = 0 60 | self.index = new_index 61 | self.miner_uids = sorted_uids 62 | --------------------------------------------------------------------------------