├── .gitignore ├── LICENSE ├── README.md ├── assets ├── access_1.png ├── access_2.png ├── creating_1.png ├── creating_2.png ├── creating_3.png ├── creating_4.png ├── macrocosmos-black.png ├── macrocosmos-white.png ├── retrieval_1.png ├── retrieval_2.png └── retrieval_3.png ├── common ├── __init__.py ├── constants.py ├── data.py ├── data_v2.py ├── date_range.py ├── metagraph_syncer.py ├── old_protocol.py ├── organic_protocol.py ├── protocol.py └── utils.py ├── docs ├── apify.md ├── dd_validator_instructions.md ├── dynamic_desirability.md ├── hugging_face_validation.md ├── huggingface_setup.md ├── miner.md ├── miner_policy.md ├── on_demand.md ├── reddit.md ├── scoring.md ├── validator.md └── youtube.md ├── dynamic_desirability ├── chain_utils.py ├── constants.py ├── data.py ├── default.json ├── desirability_retrieval.py └── desirability_uploader.py ├── huggingface_utils ├── dataset_card.py ├── encoding_system.py ├── huggingface_uploader.py ├── s3_utils.py └── utils.py ├── neurons ├── __init__.py ├── config.py ├── miner.py └── validator.py ├── requirements.txt ├── rewards ├── __init__.py ├── data.py ├── data_desirability_lookup.py ├── data_value_calculator.py └── miner_scorer.py ├── scraping ├── __init__.py ├── apify.py ├── config │ ├── __init__.py │ ├── config_reader.py │ ├── model.py │ └── scraping_config.json ├── coordinator.py ├── provider.py ├── reddit │ ├── __init__.py │ ├── model.py │ ├── reddit_custom_scraper.py │ ├── reddit_lite_scraper.py │ └── utils.py ├── scraper.py ├── utils.py ├── x │ ├── __init__.py │ ├── apidojo_scraper.py │ ├── enhanced_apidojo_scraper.py │ ├── microworlds_scraper.py │ ├── model.py │ ├── on_demand_model.py │ ├── quacker_url_scraper.py │ └── utils.py └── youtube │ ├── model.py │ ├── utils.py │ └── youtube_custom_scraper.py ├── scripts └── start_validator.py ├── setup.py ├── storage ├── miner │ ├── miner_storage.py │ └── sqlite_miner_storage.py └── validator │ ├── hf_validator_storage.py │ ├── sqlite_memory_validator_storage.py │ └── validator_storage.py ├── tests ├── __init__.py ├── common │ ├── __init__.py │ ├── test_data.py │ ├── test_data_v2.py │ ├── test_metagraph_syncer.py │ ├── test_protocol.py │ └── test_utils.py ├── dynamic_desirability │ └── test_lookup_conversion.py ├── hf_validation │ ├── test_decode_url_protocol.py │ ├── test_encoding_key.json │ ├── test_reddit_dataset_validation.py │ └── test_x_dataset_validation.py ├── integration │ ├── __init__.py │ ├── test_on_demand.py │ └── test_protocol.py ├── neurons │ ├── __init__.py │ ├── test_miner_config.py │ └── test_validator_config.py ├── rewards │ ├── __init__.py │ ├── test_data_value_calculator.py │ └── test_miner_scorer.py ├── scraping │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── invalid_config.json │ │ ├── test_config_reader.py │ │ ├── test_model.py │ │ └── valid_config.json │ ├── reddit │ │ ├── __init__.py │ │ ├── test_model.py │ │ └── test_utils.py │ ├── test_coordinator.py │ ├── test_utils.py │ ├── x │ │ ├── __init__.py │ │ ├── test_model.py │ │ └── test_utils.py │ └── youtube │ │ └── test_compression.py ├── storage │ ├── __init__.py │ ├── miner │ │ ├── __init__.py │ │ └── test_sqlite_miner_storage.py │ └── validator │ │ ├── __init__.py │ │ └── test_sqlite_memory_validator_storage.py ├── test_all.py ├── utils.py └── vali_utils │ ├── __init__.py │ ├── test_miner_iterator.py │ ├── test_vali_utils.py │ └── test_validator_s3_access.py └── vali_utils ├── __init__.py ├── api ├── auth │ ├── auth.py │ └── key_routes.py ├── models.py ├── routes.py ├── server.py └── utils.py ├── 
hf_utils.py ├── load_balancer └── validator_registry.py ├── miner_evaluator.py ├── miner_iterator.py ├── utils.py └── validator_s3_access.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # VS code 7 | .vscode/ 8 | .env 9 | 10 | # Playground notebooks 11 | *Playground.ipynb 12 | 13 | # Test created DB 14 | myDb 15 | mydb 16 | *.sqlite 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # poetry 110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 114 | #poetry.lock 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | #pdm.lock 119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 120 | # in version control. 121 | # https://pdm.fming.dev/#use-with-ide 122 | .pdm.toml 123 | 124 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | 143 | # Spyder project settings 144 | .spyderproject 145 | .spyproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | # mkdocs documentation 151 | /site 152 | 153 | # mypy 154 | .mypy_cache/ 155 | .dmypy.json 156 | dmypy.json 157 | 158 | # Pyre type checker 159 | .pyre/ 160 | 161 | # pytype static type analyzer 162 | .pytype/ 163 | 164 | # Cython debug symbols 165 | cython_debug/ 166 | 167 | # PyCharm 168 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 169 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 170 | # and can be added to the global gitignore or merged into this file. For a more nuclear 171 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 172 | #.idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 RusticLuftig 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /assets/access_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/access_1.png -------------------------------------------------------------------------------- /assets/access_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/access_2.png -------------------------------------------------------------------------------- /assets/creating_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_1.png -------------------------------------------------------------------------------- /assets/creating_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_2.png -------------------------------------------------------------------------------- /assets/creating_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_3.png -------------------------------------------------------------------------------- /assets/creating_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/creating_4.png -------------------------------------------------------------------------------- /assets/macrocosmos-black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/macrocosmos-black.png -------------------------------------------------------------------------------- /assets/macrocosmos-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/macrocosmos-white.png -------------------------------------------------------------------------------- /assets/retrieval_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/retrieval_1.png -------------------------------------------------------------------------------- /assets/retrieval_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/retrieval_2.png -------------------------------------------------------------------------------- /assets/retrieval_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/assets/retrieval_3.png -------------------------------------------------------------------------------- /common/__init__.py: 
-------------------------------------------------------------------------------- 1 | # A package for common code shared between miners and validators. -------------------------------------------------------------------------------- /common/constants.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from . import utils 3 | 4 | # Collection of constants for use throughout the codebase. 5 | 6 | # How big any one data entity bucket can be to limit size over the wire. 7 | DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES = utils.mb_to_bytes(128) 8 | 9 | # How many data entity buckets any one miner index can have to limit necessary storage on the validators. 10 | DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX = 200_000 11 | DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_3 = 250_000 12 | DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_4 = 350_000 13 | 14 | # How big the collection of contents can be to limit size over the wire. 15 | BULK_CONTENTS_SIZE_LIMIT_BYTES = utils.mb_to_bytes(128) 16 | BULK_CONTENTS_COUNT_LIMIT = 200_000 17 | 18 | # How many different buckets can be requests at once. 19 | BULK_BUCKETS_COUNT_LIMIT = 100 20 | 21 | # How old a data entity bucket can be before the validators do not assign any value for them. 22 | DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS = 30 23 | 24 | # The maximum number of characters a label can have. 25 | MAX_LABEL_LENGTH = 140 26 | 27 | # The current protocol version (int) 28 | PROTOCOL_VERSION = 4 29 | 30 | # Min evaluation period that must pass before a validator re-evaluates a miner. 31 | MIN_EVALUATION_PERIOD = dt.timedelta(minutes=60) 32 | 33 | # Miner compressed index cache freshness. 34 | MINER_CACHE_FRESHNESS = dt.timedelta(minutes=20) 35 | 36 | # Date after which only x.com URLs are accepted 37 | NO_TWITTER_URLS_DATE = dt.datetime(2024, 12, 28, tzinfo=dt.timezone.utc) # December 28, 2024 UTC 38 | 39 | # Date after which media content is required for tweets that contain media 40 | MEDIA_REQUIRED_DATE = dt.datetime(2025, 5, 23, tzinfo=dt.timezone.utc) # May 23, 2025 UTC 41 | BYTE_ALLOWANCE_DATE = dt.datetime(2025, 6, 7, tzinfo=dt.timezone.utc) # June 7, 2025 UTC 42 | EVALUATION_ON_STARTUP = 15 43 | -------------------------------------------------------------------------------- /common/data_v2.py: -------------------------------------------------------------------------------- 1 | """data.py contains the original data structures used for the project. 2 | 3 | data_v2.py contains the newer data structures used, which are more performant. 4 | 5 | From the original data structures we learned: 6 | 1. Pydantic adds a huge overhead for performance, particularly when creating > 1M objects 7 | 2. Object nesting has notable performance overhead 8 | 9 | Hence, with the V2 models, we make trade-off the nicer coding symantics in exchange for better performance. 10 | 11 | If a class needs to be included as a Field in a pydantic BaseModel, it should be a dataclass (which adds a small overhead), 12 | because pydantic know how to serialize dataclasses, as long as all fields are themselves JSON serializable. 13 | 14 | As a rule of thumb: 15 | 1. If the class needs to perform validation on fields, use a class with a custom __init__, __eq__, and __hash__. 16 | 2. Always use __slots__. 
17 | """ 18 | 19 | import datetime as dt 20 | from pydantic import BaseModel, Field, ConfigDict 21 | from typing import List, Optional 22 | 23 | from common import constants 24 | from common.data import ( 25 | DataEntityBucket, 26 | DataEntityBucketId, 27 | DataLabel, 28 | DataSource, 29 | TimeBucket, 30 | ) 31 | 32 | 33 | class ScorableDataEntityBucket: 34 | """Composes both a DataEntityBucket and additional information required for scoring. 35 | 36 | Attributes: 37 | scorable_bytes: Scorable bytes are the bytes that can be credited to this miner for scoring. 38 | This is always less than or equal to the total size of the chunk. 39 | This scorable bytes are computed as: 40 | 1 byte for every byte in size_bytes that no other miner has in their index. 41 | 1 byte / # of miners that have this chunk in their index for every byte in size_bytes 42 | that at least one other miner has in their index. 43 | """ 44 | 45 | __slots__ = "time_bucket_id", "source", "label", "size_bytes", "scorable_bytes" 46 | 47 | def __init__( 48 | self, 49 | time_bucket_id: int, 50 | source: DataSource, 51 | label: Optional[str], 52 | size_bytes: int, 53 | scorable_bytes: int, 54 | ): 55 | if label and len(label) > constants.MAX_LABEL_LENGTH: 56 | raise ValueError("Label value cannot be longer than 140 characters.") 57 | if not 0 <= size_bytes <= constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES: 58 | raise ValueError( 59 | f"Size must be between 0 and {constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES}." 60 | ) 61 | if not 0 <= scorable_bytes <= constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES: 62 | raise ValueError( 63 | f"Scorable bytes must be between 0 and {constants.DATA_ENTITY_BUCKET_SIZE_LIMIT_BYTES}." 64 | ) 65 | if scorable_bytes > size_bytes: 66 | raise ValueError( 67 | f"Scorable bytes cannot be greater than size bytes. Scorable bytes: {scorable_bytes}, size bytes: {size_bytes}." 68 | ) 69 | 70 | self.time_bucket_id = time_bucket_id 71 | self.source = source 72 | self.label = label.casefold() if label else None 73 | self.size_bytes = size_bytes 74 | self.scorable_bytes = scorable_bytes 75 | 76 | def __repr__(self): 77 | return f"ScorableDataEntityBucket(time_bucket_id={self.time_bucket_id}, source={self.source}, label={self.label}, size_bytes={self.size_bytes}, scorable_bytes={self.scorable_bytes})" 78 | 79 | def __eq__(self, other): 80 | return ( 81 | self.time_bucket_id == other.time_bucket_id 82 | and self.source == other.source 83 | and self.label == other.label 84 | and self.size_bytes == other.size_bytes 85 | and self.scorable_bytes == other.scorable_bytes 86 | ) 87 | 88 | def __hash__(self): 89 | return hash( 90 | ( 91 | self.time_bucket_id, 92 | self.source, 93 | self.label, 94 | self.size_bytes, 95 | self.scorable_bytes, 96 | ) 97 | ) 98 | 99 | def to_data_entity_bucket(self) -> DataEntityBucket: 100 | return DataEntityBucket( 101 | id=DataEntityBucketId( 102 | time_bucket=TimeBucket(id=self.time_bucket_id), 103 | source=self.source, 104 | label=DataLabel(value=self.label) if self.label else None, 105 | ), 106 | size_bytes=self.size_bytes, 107 | ) 108 | 109 | 110 | class ScorableMinerIndex(BaseModel): 111 | """The Miner index, with additional information required for scoring. 112 | 113 | Use a pydantic model for this class, because we only create 1 per miner, 114 | so the additional overhead is acceptable. 
115 | """ 116 | 117 | model_config = ConfigDict( 118 | arbitrary_types_allowed=True, 119 | frozen=True 120 | ) 121 | 122 | scorable_data_entity_buckets: List[ScorableDataEntityBucket] = Field( 123 | description="DataEntityBuckets the miner is serving, scored on uniqueness.", 124 | max_length=constants.DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_4, 125 | ) 126 | last_updated: dt.datetime = Field(description="Time last updated in UTC.") -------------------------------------------------------------------------------- /common/date_range.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import datetime as dt 3 | 4 | 5 | @dataclasses.dataclass(frozen=True) 6 | class DateRange: 7 | """Represents a specific time range from start time inclusive to end time exclusive.""" 8 | 9 | # The start time inclusive of the time range. 10 | start: dt.datetime 11 | 12 | # The end time exclusive of the time range. 13 | end: dt.datetime 14 | 15 | def contains(self, datetime: dt.datetime) -> bool: 16 | """Returns True if the provided datetime is within this DateRange.""" 17 | return self.start <= datetime < self.end 18 | -------------------------------------------------------------------------------- /common/metagraph_syncer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import dataclasses 3 | from dataclasses import field 4 | from datetime import datetime 5 | import functools 6 | import bittensor as bt 7 | from typing import Dict, List, Callable, Optional 8 | import threading 9 | import traceback 10 | 11 | from common import utils 12 | 13 | 14 | class MetagraphSyncer: 15 | @dataclasses.dataclass 16 | class _State: 17 | metagraph: Optional[bt.metagraph] = None 18 | last_synced_time: Optional[datetime] = None 19 | listeners: List = field(default_factory=list) 20 | 21 | def __init__(self, subtensor: bt.subtensor, config: Dict[int, int]): 22 | """Constructs a new MetagraphSyncer, that periodically refreshes metagraph defined in the config. 23 | 24 | Args: 25 | subtensor (bt.subtensor): The subtensor used to fetch the metagraphs. 26 | config (Dict[int, int]): A mapping of netuid to the cadence (in seconds) to sync the metagraph. 27 | """ 28 | self.subtensor = subtensor 29 | self.config = config 30 | self.metagraph_map: Dict[int, MetagraphSyncer._State] = { 31 | netuid: MetagraphSyncer._State() for netuid in config.keys() 32 | } 33 | self.is_running = False 34 | self.done_initial_sync = False 35 | self.lock = threading.RLock() 36 | 37 | bt.logging.info(f"MetagraphSyncer created with config: {config}") 38 | 39 | def do_initial_sync(self): 40 | """Performs an initial sync of all metagraphs. 41 | 42 | Unlike regular syncs, this will not notify listeners of the updated metagraph. 
43 | """ 44 | bt.logging.debug("Metagraph syncer do_initial_sync called") 45 | 46 | for netuid in self.config.keys(): 47 | fn = functools.partial(self.subtensor.metagraph, netuid) 48 | metagraph = utils.run_in_thread(fn, ttl=120, name=f"InitalSync-{netuid}") 49 | with self.lock: 50 | state = self.metagraph_map[netuid] 51 | state.metagraph = metagraph 52 | state.last_synced_time = datetime.now() 53 | 54 | bt.logging.debug(f"Successfully loaded metagraph for {netuid}") 55 | 56 | self.done_initial_sync = True 57 | 58 | def start(self): 59 | bt.logging.debug("Metagraph syncer start called") 60 | 61 | assert self.done_initial_sync, "Must call do_initial_sync before starting" 62 | 63 | self.is_running = True 64 | thread = threading.Thread(target=self._run, daemon=True) 65 | thread.start() 66 | 67 | async def _sync_metagraph_loop(self, netuid: int, cadence: int): 68 | while self.is_running: 69 | # On start, wait cadence before the first sync. 70 | bt.logging.trace(f"Syncing metagraph for {netuid} in {cadence} seconds.") 71 | await asyncio.sleep(cadence) 72 | 73 | try: 74 | # Intentionally block the shared thread so that we only 75 | # sync 1 metagraph at a time. 76 | bt.logging.trace(f"Syncing metagraph for {netuid}.") 77 | metagraph = utils.run_in_thread( 78 | functools.partial(self.subtensor.metagraph, netuid), 79 | ttl=120, 80 | name=f"Sync-{netuid}", 81 | ) 82 | bt.logging.trace(f"Successfully synced metagraph for {netuid}.") 83 | state = None 84 | with self.lock: 85 | # Store metagraph and sync time 86 | state = self.metagraph_map[netuid] 87 | state.metagraph = metagraph 88 | state.last_synced_time = datetime.now() 89 | 90 | self._notify_listeners(state, netuid) 91 | except (BaseException, Exception) as e: 92 | bt.logging.error( 93 | f"Error when syncing metagraph for {netuid}: {e}. Retrying in 60 seconds." 94 | ) 95 | await asyncio.sleep(60) 96 | 97 | async def _run_async(self): 98 | # For each netuid we should sync metagraphs for, spawn a Task to sync it. 99 | await asyncio.wait( 100 | [ 101 | asyncio.create_task(self._sync_metagraph_loop(netuid, cadence)) 102 | for netuid, cadence in self.config.items() 103 | ], 104 | return_when=asyncio.ALL_COMPLETED, 105 | ) 106 | 107 | def _run(self): 108 | try: 109 | asyncio.run(self._run_async()) 110 | finally: 111 | bt.logging.info("MetagraphSyncer _run complete.") 112 | 113 | def register_listener( 114 | self, listener: Callable[[bt.metagraph, int], None], netuids: List[int] 115 | ): 116 | """Registers a listener to be notified when a metagraph for any netuid in netuids is updated. 117 | 118 | The listener will be called from a different thread, so it must be thread-safe. 119 | """ 120 | if not netuids: 121 | raise ValueError("Must provide at least 1 netuid") 122 | 123 | with self.lock: 124 | for netuid in netuids: 125 | if netuid not in self.metagraph_map: 126 | raise ValueError( 127 | f"Metagraph for {netuid} not being tracked in MetagraphSyncer." 128 | ) 129 | self.metagraph_map[netuid].listeners.append(listener) 130 | 131 | def get_metagraph(self, netuid: int) -> bt.metagraph: 132 | """Returns the last synced version of the metagraph for netuid.""" 133 | with self.lock: 134 | if netuid not in self.metagraph_map: 135 | raise ValueError( 136 | f"Metagraph for {netuid} not known to MetagraphSyncer." 
137 | ) 138 | metagraph = self.metagraph_map[netuid].metagraph 139 | if not metagraph: 140 | raise ValueError(f"Metagraph for {netuid} has not been synced yet.") 141 | return metagraph 142 | 143 | def _notify_listeners(self, state: _State, netuid: int): 144 | """Notifies listeners of a new metagraph for netuid.""" 145 | bt.logging.debug(f"Notifying listeners of update to metagraph for {netuid}.") 146 | 147 | for listener in state.listeners: 148 | try: 149 | listener(state.metagraph, netuid) 150 | except Exception: 151 | bt.logging.error( 152 | f"Exception caught notifying {netuid} listener of metagraph update.\n{traceback.format_exc()}" 153 | ) 154 | -------------------------------------------------------------------------------- /common/old_protocol.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2023 data-universe 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import bittensor as bt 19 | import pydantic 20 | from common import constants 21 | from common.data import DataEntityBucket, DataEntity, DataEntityBucketId 22 | from typing import List, Optional 23 | 24 | 25 | class GetMinerIndex(bt.Synapse): 26 | """ 27 | Protocol by which Validators can retrieve the Index from a Miner. 28 | 29 | Attributes: 30 | - data_entity_buckets: A list of DataEntityBucket objects that the Miner can serve. 31 | """ 32 | 33 | # Required request output, filled by receiving axon. 34 | data_entity_buckets: List[DataEntityBucket] = pydantic.Field( 35 | title="data_entity_buckets", 36 | description="All of the data entity buckets that a Miner can serve.", 37 | frozen=False, 38 | repr=False, 39 | max_items=constants.DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX, 40 | default_factory=list, 41 | ) 42 | 43 | 44 | class GetDataEntityBucket(bt.Synapse): 45 | """ 46 | Protocol by which Validators can retrieve the DataEntities of a Bucket from a Miner. 47 | 48 | Attributes: 49 | - bucket_id: The id of the bucket that the requester is asking for. 50 | - data_entities: A list of DataEntity objects that make up the requested DataEntityBucket. 51 | """ 52 | 53 | # Required request input, filled by sending dendrite caller. 
54 | data_entity_bucket_id: Optional[DataEntityBucketId] = pydantic.Field( 55 | title="data_entity_bucket_id", 56 | description="The identifier for the requested DataEntityBucket.", 57 | frozen=True, 58 | repr=False, 59 | default=None, 60 | ) 61 | 62 | # Required request output, filled by recieving axon. 63 | data_entities: List[DataEntity] = pydantic.Field( 64 | title="data_entities", 65 | description="All of the data that makes up the requested DataEntityBucket.", 66 | frozen=False, 67 | repr=False, 68 | default_factory=list, 69 | ) 70 | 71 | 72 | # TODO Protocol for Users to Query Data which will accept query parameters such as a startDatetime, endDatetime. 73 | -------------------------------------------------------------------------------- /common/organic_protocol.py: -------------------------------------------------------------------------------- 1 | import bittensor as bt 2 | from typing import List, Dict, Any, Optional 3 | from common.data import DataSource 4 | 5 | 6 | class OrganicRequest(bt.Synapse): 7 | """Direct query synapse for organic data requests""" 8 | 9 | # Input fields 10 | source: str 11 | usernames: List[str] = [] 12 | keywords: List[str] = [] 13 | start_date: Optional[str] = None 14 | end_date: Optional[str] = None 15 | limit: int = 100 16 | 17 | # Output fields 18 | data: List[Dict[str, Any]] = [] 19 | meta: Dict[str, Any] = {} 20 | status: str = "pending" 21 | 22 | def deserialize(self) -> Dict[str, Any]: 23 | """Convert synapse to dictionary for response""" 24 | return { 25 | "status": self.status, 26 | "data": self.data, 27 | "meta": self.meta 28 | } -------------------------------------------------------------------------------- /common/protocol.py: -------------------------------------------------------------------------------- 1 | import bittensor as bt 2 | from pydantic import Field, ConfigDict, field_validator 3 | from common.data import ( 4 | DataSource, 5 | DataEntityBucket, 6 | DataEntity, 7 | DataEntityBucketId, 8 | HuggingFaceMetadata 9 | ) 10 | from typing import Dict, List, Optional, Tuple 11 | 12 | 13 | class BaseProtocol(bt.Synapse): 14 | model_config = ConfigDict( 15 | arbitrary_types_allowed=True, 16 | validate_assignment=True 17 | ) 18 | 19 | version: Optional[int] = Field( 20 | description="Protocol version", 21 | default=None 22 | ) 23 | 24 | 25 | class GetMinerIndex(BaseProtocol): 26 | """ 27 | Protocol by which Validators can retrieve the Index from a Miner. 28 | 29 | Attributes: 30 | - data_entity_buckets: A list of DataEntityBucket objects that the Miner can serve. 31 | """ 32 | 33 | # We opt to send the compressed index in pre-serialized form to have full control 34 | # over serialization and deserialization, rather than relying on fastapi and bittensors 35 | # interactions with pydantic serialization, which can be problematic for certain types. 36 | compressed_index_serialized: Optional[str] = Field( 37 | description="The compressed index of the Miner of type CompressedMinerIndex.", 38 | frozen=False, 39 | repr=False, 40 | default=None, 41 | ) 42 | 43 | 44 | class GetDataEntityBucket(BaseProtocol): 45 | """ 46 | Protocol by which Validators can retrieve the DataEntities of a Bucket from a Miner. 47 | 48 | Attributes: 49 | - bucket_id: The id of the bucket that the requester is asking for. 50 | - data_entities: A list of DataEntity objects that make up the requested DataEntityBucket. 
51 | """ 52 | 53 | data_entity_bucket_id: Optional[DataEntityBucketId] = Field( 54 | title="data_entity_bucket_id", 55 | description="The identifier for the requested DataEntityBucket.", 56 | frozen=True, 57 | repr=False, 58 | default=None, 59 | ) 60 | 61 | data_entities: List[DataEntity] = Field( 62 | title="data_entities", 63 | description="All of the data that makes up the requested DataEntityBucket.", 64 | frozen=False, 65 | repr=False, 66 | default_factory=list, 67 | ) 68 | 69 | 70 | class GetContentsByBuckets(BaseProtocol): 71 | """ 72 | Protocol by which Validators can retrieve contents from one or more Miner Buckets. 73 | After March 1st all contents have their creation timestamp obfuscated to the minute. 74 | 75 | Attributes: 76 | - bucket_ids: The ids of the buckets that the requester is asking for. 77 | - bucket_ids_to_contents: A dict of DataEntityBucketId objects to a list of contained contents. 78 | """ 79 | 80 | data_entity_bucket_ids: Optional[List[DataEntityBucketId]] = Field( 81 | title="data_entity_bucket_ids", 82 | description="The identifiers for the requested DataEntityBuckets.", 83 | frozen=True, 84 | repr=False, 85 | default=None, 86 | ) 87 | 88 | bucket_ids_to_contents: List[Tuple[DataEntityBucketId, List[bytes]]] = Field( 89 | title="bucket_ids_to_contents", 90 | description="A list of bucket ids to the contents contained by that bucket. Each DataEntityBucketId appears at most once. This is just a flattened dictionary.", 91 | frozen=False, 92 | repr=False, 93 | default_factory=list, 94 | ) 95 | 96 | 97 | class GetHuggingFaceMetadata(BaseProtocol): 98 | """ 99 | Protocol by which Validators can retrieve HuggingFace metadata from a Miner. 100 | """ 101 | 102 | metadata: List[HuggingFaceMetadata] = Field( 103 | title="metadata", 104 | description="List of HuggingFace metadata entries.", 105 | default_factory=list 106 | ) 107 | 108 | 109 | class DecodeURLRequest(BaseProtocol): 110 | """ 111 | Protocol by which Validators can request URL decoding from a Miner. 
112 | 113 | Attributes: 114 | - encoded_urls: A list of encoded URL strings to be decoded 115 | - decoded_urls: A list of decoded URL strings returned by the miner 116 | """ 117 | 118 | encoded_urls: List[str] = Field( 119 | title="encoded_urls", 120 | description="List of encoded URLs that need to be decoded", 121 | frozen=True, 122 | repr=False, 123 | default_factory=list, 124 | max_length=10 # Changed from validator to direct Field constraint 125 | ) 126 | 127 | decoded_urls: List[str] = Field( 128 | title="decoded_urls", 129 | description="List of decoded URLs corresponding to the encoded URLs", 130 | frozen=False, 131 | repr=False, 132 | default_factory=list, 133 | ) 134 | 135 | 136 | class OnDemandRequest(BaseProtocol): 137 | """Protocol for on-demand data retrieval requests""" 138 | 139 | # Request parameters 140 | source: Optional[DataSource] = Field( 141 | default=None, 142 | description="Source to query (X or Reddit)" 143 | ) 144 | 145 | usernames: List[str] = Field( 146 | default_factory=list, 147 | description="Usernames to fetch data from", 148 | max_length=10 149 | ) 150 | 151 | keywords: List[str] = Field( 152 | default_factory=list, 153 | description="Keywords/hashtags to search for", 154 | max_length=5 155 | ) 156 | 157 | start_date: Optional[str] = Field( 158 | default=None, 159 | description="Start date (ISO format)" 160 | ) 161 | 162 | end_date: Optional[str] = Field( 163 | default=None, 164 | description="End date (ISO format)" 165 | ) 166 | 167 | limit: int = Field( 168 | default=100, 169 | ge=1, 170 | le=1000, 171 | description="Maximum items to return" 172 | ) 173 | 174 | # Response fields 175 | data: List[DataEntity] = Field( 176 | default_factory=list, 177 | description="Retrieved data" 178 | ) 179 | 180 | version: Optional[int] = Field( 181 | default=None, 182 | description="Protocol version" 183 | ) 184 | 185 | # How many times validators can send requests per validation period. 186 | REQUEST_LIMIT_BY_TYPE_PER_PERIOD = { 187 | GetMinerIndex: 1, 188 | GetDataEntityBucket: 1, 189 | GetContentsByBuckets: 5, 190 | DecodeURLRequest: 2, 191 | GetHuggingFaceMetadata: 1, 192 | OnDemandRequest: 5, 193 | } -------------------------------------------------------------------------------- /docs/apify.md: -------------------------------------------------------------------------------- 1 | # Apify 2 | 3 | [Apify](http://apify.com) is a popular platform and market place for web scraping tools. 4 | 5 | Data Universe uses Apify to scrape certain DataSources. At this time, all Validators and Miners are required to use Apify. In future, Apify will become optional for Miners, depending on the DataSources they scrape from. 6 | 7 | ## Setting your API Token 8 | 9 | 1. Create an Apify account 10 | 2. Got to your Console -> Settings -> Integrations and copy your Personal API token 11 | 3. Create a file named `.env` in the `data-universe` directory if it doesn't already exist and add the following to it: 12 | ```py 13 | APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" 14 | ``` -------------------------------------------------------------------------------- /docs/hugging_face_validation.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Dataset Validation on Bittensor Subnet 13 2 | 3 | This document outlines the validation process for Hugging Face datasets in the context of Bittensor Subnet 13. 
Validators on Subnet 13 are responsible for ensuring that miners are providing accurate and up-to-date datasets, specifically for X (formerly Twitter) and Reddit. The validation process is crucial for maintaining the integrity and utility of the network. 4 | 5 | **Note:** The functionality described here will be integrated into the `MinerEvaluator` in the future. 6 | 7 | ## Overview 8 | 9 | The validation process involves the following key steps: 10 | 11 | 1. **Querying Hugging Face Metadata from Miners** 12 | 2. **Selecting Random Files from the Latest Data Commit** 13 | 3. **Checking Dataset Updates and Size Changes** 14 | 4. **Validating Data Samples** 15 | 5. **Adjusting Miner Credibility Based on Validation Results** 16 | 17 | ## Validation Process Details 18 | 19 | ### 1. Querying Hugging Face Metadata from Miners 20 | 21 | Every **55,000 blocks**, validators query the `HuggingFaceMetadata` table from each miner. This metadata includes information about the datasets that miners have uploaded to Hugging Face, specifically focusing on the newest datasets for X and Reddit. 22 | 23 | ```python 24 | # Example of querying Hugging Face metadata 25 | async def _query_huggingface_metadata(self, hotkey: str, uid: int, miner_axon: bt.AxonInfo) -> Optional[List[HuggingFaceMetadata]]: 26 | # ... code to query metadata ... 27 | ``` 28 | 29 | ### 2. Selecting Random Files from the Latest Data Commit 30 | 31 | Validators select **10 random rows** from the latest data commit of the miner's Hugging Face dataset. This selection ensures that the validation covers recent data and that miners are continually updating their datasets. 32 | 33 | ```python 34 | # Function to select random rows from a dataset 35 | def select_random_rows_from_parquet(repo_id: str, num_rows: int = 10) -> pd.DataFrame: 36 | # ... code to select random rows ... 37 | ``` 38 | 39 | ### 3. Checking Dataset Updates and Size Changes 40 | 41 | Validators assess how frequently the dataset is updated and monitor changes in its size over time. This step ensures that miners are actively accumulating data in their repositories and contributing to the network's growth. 42 | 43 | - **Update Frequency:** Validators check the timestamps of data commits to verify regular updates. 44 | - **Size Changes:** Validators compare the sizes of successive data commits to confirm that new data is being added. 45 | 46 | ### 4. Validating Data Samples 47 | 48 | For each selected file, validators perform the following: 49 | 50 | - **Data Retrieval:** Fetch the data samples from the selected files. 51 | - **Data Verification:** Use appropriate scrapers to validate the correctness of the data. 52 | 53 | The validation criteria are: 54 | 55 | - **Reddit Dataset:** A validation ratio of **0.5**. If at least 5 out of 10 samples are valid, the validation is considered successful. 56 | - **X Dataset:** A validation ratio of **0.6**. If at least 6 out of 10 samples are valid, the validation is considered successful. 57 | 58 | ```python 59 | # Example of validating data samples 60 | async def main(): 61 | # ... code to validate data samples ... 62 | valid = await scraper.validate_hf(entities=selected_rows) 63 | # ... process validation results ... 64 | ``` 65 | 66 | ### 5. Adjusting Miner Credibility Based on Validation Results 67 | 68 | Based on the validation outcome, validators adjust the miner's credibility: 69 | 70 | - **Successful Validation:** Increase the miner's credibility score by **10%**. 
71 | - **Failed Validation:** Decrease the miner's credibility score by **10%**. 72 | 73 | This adjustment incentivizes miners to provide high-quality, up-to-date datasets. 74 | 75 | ```python 76 | # Adjusting miner credibility 77 | if validation_successful: 78 | self.scorer.increase_credibility(uid, percentage=10) 79 | else: 80 | self.scorer.decrease_credibility(uid, percentage=10) 81 | ``` 82 | 83 | ## Future Integration into MinerEvaluator 84 | 85 | The `MinerEvaluator` will be updated to include this validation process. The planned changes involve: 86 | 87 | - Implementing the Hugging Face dataset validation as a separate component within the `MinerEvaluator`. 88 | - Scheduling the validation process to occur every **55,000 blocks**. 89 | - Incorporating the credibility adjustments based on validation outcomes. 90 | 91 | **Note:** The existing validation steps for data entity buckets will remain, but the Hugging Face dataset validation will be handled separately to ensure a focused and efficient validation process. 92 | 93 | ## Code Structure 94 | 95 | - **MinerEvaluator Class:** Responsible for evaluating miners and updating their scores. 96 | - **Hugging Face Validation Module:** Contains functions to select random rows from datasets and validate them. 97 | - **ScraperProvider:** Supplies the appropriate scraper for data validation (e.g., X or Reddit scrapers). 98 | 99 | ## Conclusion 100 | 101 | This validation process is designed to ensure that miners on Bittensor Subnet 13 contribute valuable and accurate datasets to the network. By regularly validating datasets and adjusting miner credibility accordingly, the network maintains high data quality standards. 102 | 103 | ## References 104 | 105 | - **Bittensor Documentation:** [https://bittensor.com/](https://bittensor.com/) 106 | - **Hugging Face Datasets:** [https://huggingface.co/datasets](https://huggingface.co/datasets) 107 | 108 | -------------------------------------------------------------------------------- /docs/huggingface_setup.md: -------------------------------------------------------------------------------- 1 | 2 | ### README: Configuring Access for Uploading Data to Hugging Face Datasets 3 | 4 | #### Creating and Configuring Your Hugging Face Access Token 5 | 6 | To upload datasets to Hugging Face, you'll need an access token with the appropriate permissions. Follow these steps to create and configure your token: 7 | 8 | #### Step 1: Create a Hugging Face Account 9 | If you haven't already, create an account at [Hugging Face's website](https://huggingface.co/join). 10 | 11 | #### Step 2: Generate an Access Token 12 | 1. Log into your Hugging Face account. 13 | 2. Navigate to your account settings by clicking on your profile picture in the upper right corner, then select 'Settings'. 14 | 3. Go to the 'Access Tokens' section. 15 | 4. Click on 'New Token'. 16 | 5. Name your token and select the appropriate role. To upload datasets, choose the "Write" role which allows you to upload and modify datasets. 17 | 6. Click 'Create a token'. 18 | 19 | #### Step 3: Configure the Token in Your Environment 20 | 1. Copy the generated token. 21 | 2. Open or create a `.env` file in the root directory of your project. 22 | 3. Add the following line to your `.env` file: 23 | 24 | ``` 25 | HUGGINGFACE_TOKEN=<your_token_here> 26 | ``` 27 | 28 | Replace `<your_token_here>` with the token you copied in the previous step.
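Before moving on, you can optionally sanity-check that the token loads and authenticates correctly. This is a minimal illustrative sketch (not part of the official setup), assuming `huggingface_hub` and `python-dotenv` are installed:

```python
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi

load_dotenv()  # reads HUGGINGFACE_TOKEN from the .env file created above

# whoami() raises an error if the token is missing or invalid.
api = HfApi(token=os.getenv("HUGGINGFACE_TOKEN"))
info = api.whoami()
print(f"Authenticated to Hugging Face as: {info.get('name')}")
```

If this prints your Hugging Face username, the token is configured correctly and the upload code in Step 4 will be able to use it.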
29 | 30 | #### Step 4: Utilize the Token in Your Software 31 | Ensure that your software is configured to read the `HF_TOKEN` from the environment variables. This is typically handled in your Python script as follows: 32 | 33 | ```python 34 | import os 35 | from huggingface_hub import HfApi, HfFolder 36 | from dotenv import load_dotenv 37 | 38 | load_dotenv() 39 | # Ensure the token is loaded from the .env file 40 | api = HfApi(token=os.getenv('HUGGINGFACE_TOKEN')) 41 | ``` 42 | 43 | #### Finalizing Setup 44 | After configuring the token in your `.env` file, your miner should be able to authenticate with Hugging Face and upload datasets without requiring further login steps. 45 | 46 | ### Additional Information 47 | - Keep your token secure and do not share it publicly. 48 | - If you need to regenerate your token, repeat the steps above to generate a new one and update your `.env` file accordingly. 49 | -------------------------------------------------------------------------------- /docs/on_demand.md: -------------------------------------------------------------------------------- 1 | # On-Demand Data Request Implementation 2 | 3 | ## Overview 4 | On-demand data retrieval is ALREADY IMPLEMENTED in both validator and miner templates. This enhanced version now provides richer metadata for X/Twitter content while maintaining the original Reddit implementation. 5 | 6 | ## For Miners 7 | 8 | ### X/Twitter Scraping (Enhanced) 9 | The enhanced implementation uses `EnhancedApiDojoTwitterScraper` for X/Twitter which provides: 10 | 11 | - **Rich User Metadata** 12 | - User ID, display name, verification status 13 | - Follower/following counts 14 | 15 | - **Complete Tweet Information** 16 | - Engagement metrics (likes, retweets, replies, quotes, views) 17 | - Tweet type classification (reply, quote, retweet) 18 | - Conversation context and threading information 19 | 20 | - **Media Content** 21 | - Media URLs and content types 22 | - Support for photos and videos 23 | 24 | - **Advanced Formatting** 25 | - Properly ordered hashtags and cashtags 26 | - Full conversation context 27 | 28 | ### Reddit Scraping (Unchanged) 29 | The Reddit implementation remains the same, using the Reddit API. 30 | 31 | ## Implementation Options 32 | 33 | You can: 34 | - Use the enhanced implementation as-is (recommended) 35 | - Modify `handle_on_demand` in miner.py to use your own scrapers 36 | - Build custom scraping logic while maintaining the same request/response format 37 | 38 | ### Integration Steps: 39 | 40 | 1. **Simple Integration**: Import the enhanced scraper and provider: 41 | ```python 42 | from scraping.x.enhanced_apidojo_scraper import EnhancedApiDojoTwitterScraper 43 | from scraping.x.on_demand_model import EnhancedXContent 44 | ``` 45 | 46 | 2. **Update your scraper provider**: 47 | ```python 48 | # Create enhanced scraper provider 49 | scraper_provider = EnhancedScraperProvider() 50 | ``` 51 | 52 | 3. 
**Enjoy richer data**: The enhanced content is automatically used for X/Twitter requests 53 | 54 | ## Rewards 55 | - Top 50% of miners by stake participate in validation 56 | - 50% chance of validation per request 57 | - Successful validation: +1% credibility 58 | - Failed validation: Proportional credibility decrease 59 | 60 | ## Response Format Example 61 | 62 | ```json 63 | { 64 | "uri": "https://x.com/username/status/123456789", 65 | "datetime": "2025-03-17T12:34:56+00:00", 66 | "source": "X", 67 | "label": "#bitcoin", 68 | "content": "Tweet text content...", 69 | "user": { 70 | "username": "@username", 71 | "display_name": "User Display Name", 72 | "id": "12345678", 73 | "verified": true, 74 | "followers_count": 10000, 75 | "following_count": 1000 76 | }, 77 | "tweet": { 78 | "id": "123456789", 79 | "like_count": 500, 80 | "retweet_count": 100, 81 | "reply_count": 50, 82 | "quote_count": 25, 83 | "hashtags": ["#bitcoin", "#crypto"], 84 | "is_retweet": false, 85 | "is_reply": false, 86 | "is_quote": true, 87 | "conversation_id": "123456789" 88 | }, 89 | "media": [ 90 | {"url": "https://pbs.twimg.com/media/image1.jpg", "type": "photo"}, 91 | {"url": "https://video.twimg.com/video1.mp4", "type": "video"} 92 | ] 93 | } 94 | ``` 95 | 96 | That's it! The enhanced system is ready to use, providing significantly richer data while maintaining compatibility with existing implementations. 🚀 -------------------------------------------------------------------------------- /docs/reddit.md: -------------------------------------------------------------------------------- 1 | # Reddit 2 | 3 | [Reddit](https://reddit.com) is one source that Data Universe can pull from. 4 | 5 | In addition to the [Apify actor based scraping](apify.md) we also support using a personal reddit account. 6 | 7 | ## Getting a reddit account. 8 | 9 | If you already have a reddit account you can use that one. Otherwise [sign up](https://www.reddit.com/register/) for one (must support password based auth). 10 | 11 | ## Setting up your account for use with a script type app. 12 | 13 | Follow the [OAuth2 First Steps guide](https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps) to add a script type app to your account and find the associated app client id and app client secret. Do not share your client secret with anyone. 14 | 15 | ## Providing your information to your miner or validator. 16 | 17 | Create a file named`.env` in the `data-universe` directory if it doesn't already exist and add the following to it: 18 | ```py 19 | REDDIT_CLIENT_ID="YOUR_REDDIT_CLIENT_ID" 20 | REDDIT_CLIENT_SECRET="YOUR_REDDIT_CLIENT_SECRET" 21 | REDDIT_USERNAME="YOUR_REDDIT_USERNAME" 22 | REDDIT_PASSWORD="YOUR_REDDIT_PASSWORD" 23 | ``` -------------------------------------------------------------------------------- /docs/scoring.md: -------------------------------------------------------------------------------- 1 | # Bittensor Miner Evaluation System 2 | 3 | ## Overview 4 | 5 | This document outlines the key components of the Bittensor miner evaluation system: Credibility, Score, and Incentive. These components work together to create a fair and effective reward mechanism for miners in the network. 6 | 7 | ## Key Components 8 | 9 | ### 1. 
Credibility 10 | 11 | | Aspect | Description | 12 | |--------|-------------| 13 | | Range | 0 to 1 | 14 | | Purpose | Measures the miner's long-term reliability and consistency | 15 | | Calculation | `new_credibility = α * current_validation_result + (1 - α) * old_credibility` | 16 | | Characteristics | Slowly changes over time, reflecting consistent performance | 17 | 18 | ### 2. Score 19 | 20 | | Aspect | Description | 21 | |--------|-------------| 22 | | Range | Any non-negative number | 23 | | Purpose | Represents the value of data provided in a single evaluation | 24 | | Calculation | `raw_score = data_type_scale_factor * time_scalar * scorable_bytes`
`final_score = raw_score * (credibility ^ 2.5)` | 25 | | Characteristics | Can vary significantly between evaluations | 26 | 27 | ### 3. Incentive (Reward) 28 | 29 | | Aspect | Description | 30 | |--------|-------------| 31 | | Range | Proportional to the miner's share of the total network score | 32 | | Purpose | Determines the actual reward (e.g., tokens) given to the miner | 33 | | Calculation | `miner_reward = (miner_score / total_network_score) * total_reward_pool` | 34 | | Characteristics | Directly affects the miner's earnings | 35 | 36 | ## Relationships 37 | 38 | ### Credibility → Score 39 | - Credibility acts as a multiplier for the raw score 40 | - Higher credibility significantly boosts the final score due to the exponential factor (2.5) 41 | 42 | ### Score → Incentive 43 | - The miner's score determines their share of the total reward pool 44 | - Higher scores lead to higher rewards, but it's relative to other miners' scores 45 | 46 | ### Credibility → Incentive 47 | - Credibility indirectly affects incentives by boosting scores 48 | - Miners with higher credibility can earn more rewards even with the same raw data value 49 | 50 | ### HuggingFace validation 51 | - Credibility change is not enabled at this moment, but you can take a look how it is going to be implemented here: [hugging_face_validation.md](/docs/hugging_face_validation.md) file. 52 | ## System Flow 53 | 54 | 1. **Credibility Evaluation**: 55 | - Miner's current credibility is assessed based on past performance. 56 | 57 | 2. **Raw Score Calculation**: 58 | - Data Value is determined based on content, source, and timeliness. 59 | - Raw Score is computed using the Data Value and other factors. 60 | 61 | 3. **Final Score Computation**: 62 | - Credibility is applied as a multiplier to the Raw Score. 63 | - Final Score = Raw Score * (Credibility ^ 2.5) 64 | 65 | 4. **Incentive/Reward Allocation**: 66 | - Miner's Final Score is compared to the Total Network Score. 67 | - Reward is proportionally allocated based on this comparison. 68 | 69 | 5. **Feedback Loop**: 70 | - The allocated reward indirectly motivates the miner to maintain and improve their Credibility for future evaluations. 71 | 72 | Note: Credibility has a direct influence on the Final Score, while the Incentive/Reward indirectly influences future Credibility through miner behavior. 73 | 74 | ## Key Parameters 75 | 76 | - Starting Credibility: 0 77 | - Credibility Exponent: 2.5 78 | - Credibility Alpha (α): 0.15 79 | - Max Data Entity Bucket Size: 128 MB 80 | - Max Data Entity Bucket Count per Miner Index: 350,000 81 | - Data Age Limit: 30 days 82 | - Min Evaluation Period: 60 minutes 83 | 84 | ## Data Source Weights 85 | - Reddit: 55% (weight: 0.55) 86 | - X (Twitter): 35% (weight: 0.35) 87 | - Youtube: 10% (weight: 0.1) 88 | 89 | ## Desirable Data 90 | 91 | For the current list of desirable data sources and their jobs, run with the `--gravity` tag. 92 | 93 | ## Important Notes 94 | 95 | - Scores are relative to other miners in the network 96 | - Credibility builds over time, rewarding consistent good performance 97 | - Recent data is valued more highly than older data 98 | - The system adapts to changes in data desirability through configurable lookup tables 99 | - Negative scale factors can penalize undesirable data 100 | 101 | This scoring system is designed to be fair while also being resistant to gaming attempts, encouraging miners to consistently provide high-quality, relevant, and timely data to the Bittensor network. 
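To make the relationships above concrete, here is a small illustrative sketch of the scoring arithmetic described in this document. The constants and formulas are taken from the tables above; this is a simplified model for intuition, not the subnet's actual implementation:

```python
CREDIBILITY_ALPHA = 0.15     # α from the Key Parameters table
CREDIBILITY_EXPONENT = 2.5   # exponent applied to credibility


def update_credibility(old_credibility: float, validation_result: float) -> float:
    """EMA update: new_credibility = α * result + (1 - α) * old_credibility."""
    return CREDIBILITY_ALPHA * validation_result + (1 - CREDIBILITY_ALPHA) * old_credibility


def final_score(data_type_scale_factor: float, time_scalar: float,
                scorable_bytes: int, credibility: float) -> float:
    """Raw score scaled by credibility ** 2.5."""
    raw_score = data_type_scale_factor * time_scalar * scorable_bytes
    return raw_score * (credibility ** CREDIBILITY_EXPONENT)


def miner_reward(miner_score: float, total_network_score: float, total_reward_pool: float) -> float:
    """The miner's proportional share of the total reward pool."""
    return (miner_score / total_network_score) * total_reward_pool
```

For example, a miner with credibility 0.8 keeps `0.8 ** 2.5 ≈ 0.57` of its raw score, while a miner with credibility 0.4 keeps only about `0.4 ** 2.5 ≈ 0.10`, which is why consistent validation results matter far more than any single high-value submission.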
102 | -------------------------------------------------------------------------------- /docs/validator.md: -------------------------------------------------------------------------------- 1 | # Validator 2 | 3 | The Validator is responsible for validating the Miners and scoring them according to the [incentive mechanism](../README.md#incentive-mechanism). It runs a loop to enumerate all Miners in the network, and for each, it performs the following sequence: 4 | 1. It requests the latest [MinerIndex](../README.md#terminology) from the miner, which it stores in an in-memory database. 5 | 2. It chooses a random (sampled by size) DataEntityBucket from the MinerIndex to sample. 6 | 3. It gets that DataEntityBucket from the Miner. 7 | 4. It chooses N DataEntities from the DataEntityBucket to validate. It then scrapes the content from the appropriate DataSource to get those DataEntities. 8 | 5. It then compares those retrieved DataEntities against the ones provided by the Miner and updates the Miner Credibility, based on the result. 9 | 6. Finally, it updates the Miner's score. This is based on the total MinerIndex scaled by Freshness/Desirability/Duplication/Credibility. 10 | 11 | Once this sequence has been performed for all Miners, the Validator waits a period of time before starting the next loop to ensure it does not evaluate a Miner more often than once per N minutes. This helps ensure the cost of running a Validator is not too high, and also protects the network against high amounts of traffic. 12 | 13 | As of Jan 13th 2024, the expected number of DataItems queried via Apify is roughly: `225 Miners * 1 evals per hour * 2 sample per period * 24 hours = 10800`. Assuming this is ~50% Reddit (Free with Custom Scraper) and ~50% X ($1 per 1000), the total cost is roughly $5.40 per day. 14 | 15 | # System Requirements 16 | 17 | Validators require at least 32 GB of RAM but do not require a GPU. We recommend a decent CPU (4+ cores) and sufficient network bandwidth to handle protocol traffic. Python >= 3.10 is required. 18 | 19 | # Getting Started 20 | 21 | ## Prerequisites 22 | 1. As of Jan 13th 2024, we support Twitter and Reddit scraping via Apify, so you'll need to [set up your Apify API token](apify.md). 23 | We also support Reddit scraping via a [personal reddit account](reddit.md), which is completely free. 24 | Validators will default to using the personal reddit account for reliability, but this can be changed by editing the PREFERRED_SCRAPERS map in validator.py locally. 25 | We also support YouTube scraping via the [official YouTube API](youtube.md), which is completely free. 26 | 27 | 2. Clone the repo 28 | 29 | ```shell 30 | git clone https://github.com/RusticLuftig/data-universe.git 31 | ``` 32 | 33 | 3. Set up your Python [virtual environment](https://docs.python.org/3/library/venv.html) or [Conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands). 34 | 35 | 4. Install the requirements. From your virtual environment, run 36 | ```shell 37 | cd data-universe 38 | python -m pip install -e . 39 | ``` 40 | 41 | 5. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 42 | 43 | 6. (Optional) Set up a wandb account and log in so your validator can store logs beyond 7 days.
From your virtual environment, run 44 | ```shell 45 | wandb login 46 | ``` 47 | 48 | This will prompt you to navigate to https://wandb.ai/authorize and copy your API key back into the terminal. 49 | 50 | ## Running the Validator 51 | 52 | ### With auto-updates 53 | 54 | We highly recommend running the validator with auto-updates. This will help ensure your validator is always running the latest release, helping to maintain a high vtrust. 55 | 56 | Prerequisites: 57 | 1. To run with auto-update, you will need to have [pm2](https://pm2.keymetrics.io/) installed. 58 | 2. Make sure your virtual environment is activated. This is important because the auto-updater will automatically update the package dependencies with pip. 59 | 3. Make sure you're using the main branch: `git checkout main`. 60 | 61 | From the data-universe folder: 62 | ```shell 63 | pm2 start --name net13-vali-updater --interpreter python scripts/start_validator.py -- --pm2_name net13-vali --wallet.name cold_wallet --wallet.hotkey hotkey_wallet [other vali flags] 64 | ``` 65 | 66 | This will start a process called `net13-vali-updater`. This process periodically checks for a new git commit on the current branch. When one is found, it performs a `pip install` for the latest packages and restarts the validator process (whose name is given by the `--pm2_name` flag). 67 | 68 | 69 | ### Without auto-updates 70 | 71 | If you'd prefer to manage your own validator updates... 72 | 73 | From the data-universe folder: 74 | ```shell 75 | pm2 start python -- ./neurons/validator.py --wallet.name your-wallet --wallet.hotkey your-hotkey 76 | ``` 77 | 78 | # Configuring the Validator 79 | 80 | ## Flags 81 | 82 | The Validator offers several flags to customize its behavior. 83 | 84 | You can view the full set of flags by running 85 | ```shell 86 | python ./neurons/validator.py -h 87 | ``` 88 | 89 | # Coming Soon 90 | 91 | We are working hard to add more features to the Subnet. For the Validators, we have plans to: 92 | 93 | 1. Have the Validator serve an Axon on the network, so neurons on other Subnets can retrieve data. 94 | 2. Add scrapers for other DataSources. 95 | 3. Add other (and cheaper) scrapers for the Validators to use. -------------------------------------------------------------------------------- /docs/youtube.md: -------------------------------------------------------------------------------- 1 | # Step-by-Step Instructions 2 | 1. Create a Google Cloud Project 3 | - Visit [Google Cloud Console](https://console.cloud.google.com/). 4 | - Click on the project drop-down and select “New Project”. 5 | - Enter a name for your project (e.g., YouTubeScraper) and click “Create”. 6 | 7 | 2. Enable the YouTube Data API v3 8 | - Search for YouTube Data API v3 9 | - Click on it and then click “Enable”. 10 | 11 | 3. Generate an API Key 12 | - Go to the Credentials page. 13 | - Click “Create Credentials” > “API Key”. 14 | - A new API key will be generated. Copy and save it. 15 | 16 | 4. 
Set the API Key in Environment Variables 17 | - Add the following to your .env file in the root directory of your project: 18 | `YOUTUBE_API_KEY=your_actual_api_key_here` 19 | 20 | 21 | ### Working with proxies (Webshare) 22 | If you are running your validator with a popular cloud provider (AWS, Google Cloud Platform, Azure, DigitalOcean, etc.), 23 | 24 | you will need to add a Webshare proxy, since requests from cloud provider IP ranges are frequently blocked by YouTube. 25 | Once you have created a Webshare account and purchased a "Residential" proxy package that suits your workload (5 GB is enough for a validator; make sure NOT to purchase "Proxy Server" or "Static Residential"!), 26 | open the Webshare Proxy Settings to retrieve your "Proxy Username" and "Proxy Password". Using this information, you can initialize the validator as follows: 27 | `WEB_SHARE_PROXY_USERNAME=your_proxy_username` 28 | `WEB_SHARE_PROXY_PASSWORD=your_proxy_password` 29 | 30 | ### Working with any other proxies 31 | 32 | If you prefer not to use Webshare, any other HTTP proxy also works. Define the following variables instead: 33 | 34 | YTT_PROXY_HOST=127.0.0.1 35 | YTT_PROXY_PORT=7777 36 | YTT_PROXY_USERNAME=myusername 37 | YTT_PROXY_PASSWORD=mypassword 38 | -------------------------------------------------------------------------------- /dynamic_desirability/chain_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | from typing import Dict, Optional, Any 4 | import bittensor as bt 5 | import multiprocessing 6 | import argparse 7 | 8 | def add_args(parser: argparse.ArgumentParser, is_upload: bool): 9 | """Add arguments to the parser""" 10 | parser.add_argument('--wallet', type=str, required=True, help='Name of the wallet') 11 | parser.add_argument('--hotkey', type=str, required=True, help='Name of the hotkey') 12 | parser.add_argument('--network', type=str, required=True, help='Name of the subtensor network', default='finney') 13 | parser.add_argument('--netuid', type=int, required=True, help='UID of the subnet', default=13) 14 | 15 | if is_upload: 16 | parser.add_argument('--file_path', type=str, required=True, help='Path to the JSON file containing preferences') 17 | 18 | def _sync_retrieve_metadata(netuid: int, hotkey: str, network: str = "finney"): 19 | """Standalone function that can be pickled""" 20 | try: 21 | # Create a fresh subtensor instance for each call 22 | fresh_subtensor = bt.subtensor(network=network) 23 | 24 | metadata = bt.core.extrinsics.serving.get_metadata( 25 | fresh_subtensor, 26 | netuid, 27 | hotkey 28 | ) 29 | 30 | if not metadata: 31 | return None 32 | 33 | commitment = metadata["info"]["fields"][0] 34 | hex_data = commitment[list(commitment.keys())[0]][2:] 35 | return bytes.fromhex(hex_data).decode() 36 | except Exception as e: 37 | bt.logging.error(f"Error retrieving metadata for {hotkey}: {str(e)}") 38 | return None 39 | 40 | 41 | def _wrapped_func(func: functools.partial, queue: multiprocessing.Queue): 42 | try: 43 | result = func() 44 | queue.put(result) 45 | except Exception as e: 46 | queue.put(None) # Return None instead of raising on error 47 | 48 | 49 | def run_in_subprocess(func: functools.partial, ttl: int = 10) -> Any: 50 | """Runs with shorter timeout and better error handling""" 51 | ctx = multiprocessing.get_context('fork') 52 | queue = ctx.Queue() 53 | process = ctx.Process(target=_wrapped_func, args=[func, queue]) 54 | 55 | process.start() 56 | process.join(timeout=ttl) 57 | 58 | if process.is_alive(): 59 | process.terminate() 60 | process.join() 61 | return None # Return None on timeout instead of 
raising 62 | 63 | try: 64 | result = queue.get(block=False) 65 | return result 66 | except Exception: 67 | return None 68 | 69 | 70 | class ChainPreferenceStore: 71 | def __init__( 72 | self, 73 | subtensor: bt.subtensor, 74 | netuid: int, 75 | wallet: Optional[bt.wallet] = None, 76 | ): 77 | self.subtensor = subtensor 78 | self.wallet = wallet 79 | self.netuid = netuid 80 | 81 | async def store_preferences( 82 | self, 83 | data: str, 84 | wait_for_inclusion: bool = True, 85 | wait_for_finalization: bool = True, 86 | ): 87 | """Stores preferences on this subnet for a specific wallet.""" 88 | if self.wallet is None: 89 | raise ValueError("No wallet available to write to the chain.") 90 | if not data: 91 | raise ValueError("No data provided to store on the chain.") 92 | 93 | def sync_store(): 94 | return bt.core.extrinsics.serving.publish_metadata( 95 | self.subtensor, 96 | self.wallet, 97 | self.netuid, 98 | f"Raw{len(data)}", 99 | data.encode(), 100 | wait_for_inclusion, 101 | wait_for_finalization, 102 | ) 103 | 104 | partial = functools.partial(sync_store) 105 | bt.logging.info("Writing to chain...") 106 | return run_in_subprocess(partial, 60) 107 | 108 | async def retrieve_preferences(self, hotkey: str) -> Optional[str]: 109 | """Single retrieval with shorter timeout""" 110 | partial = functools.partial(_sync_retrieve_metadata, self.netuid, hotkey) 111 | return run_in_subprocess(partial, ttl=10) # Shorter timeout per validator 112 | 113 | async def batch_retrieve_preferences(self, hotkeys: list[str], chunk_size: int = 5) -> Dict[str, Optional[str]]: 114 | """Retrieve preferences for multiple validators in chunks""" 115 | results = {} 116 | 117 | # Process in chunks to avoid overwhelming the system 118 | for i in range(0, len(hotkeys), chunk_size): 119 | chunk = hotkeys[i:i + chunk_size] 120 | chunk_tasks = [] 121 | 122 | # Create tasks for each hotkey in the chunk 123 | for hotkey in chunk: 124 | task = asyncio.create_task(self.retrieve_preferences(hotkey)) 125 | chunk_tasks.append((hotkey, task)) 126 | 127 | # Wait for all tasks in chunk to complete 128 | for hotkey, task in chunk_tasks: 129 | try: 130 | result = await task 131 | results[hotkey] = result 132 | except Exception as e: 133 | bt.logging.error(f"Error processing {hotkey}: {str(e)}") 134 | results[hotkey] = None 135 | 136 | # Small delay between chunks 137 | await asyncio.sleep(0.1) 138 | 139 | return results 140 | 141 | 142 | if __name__ == "__main__": 143 | # Example usage 144 | parser = argparse.ArgumentParser() 145 | add_args(parser, is_upload=False) 146 | args = parser.parse_args() 147 | 148 | subtensor = bt.subtensor(network=args.network) 149 | wallet = bt.wallet(name=args.wallet, hotkey=args.hotkey) 150 | 151 | store = ChainPreferenceStore(subtensor, args.netuid, wallet) 152 | 153 | async def test(): 154 | result = await store.retrieve_preferences(args.hotkey) 155 | print(f"Retrieved preferences: {result}") 156 | 157 | asyncio.run(test()) -------------------------------------------------------------------------------- /dynamic_desirability/constants.py: -------------------------------------------------------------------------------- 1 | # The link to the github repo where preferences JSONs are uploaded. 2 | REPO_URL: str = 'https://github.com/macrocosm-os/gravity.git' 3 | BRANCH_NAME: str = 'main' 4 | PREFERENCES_FOLDER: str = 'validator_preferences' 5 | 6 | # Total weight of all validators. Subnet (default) voting weight = 1-TOTAL_VALI_WEIGHT. 
7 | TOTAL_VALI_WEIGHT: float = 0.7 8 | DEFAULT_SCALE_FACTOR: float = 0.3 # number is subject to change 9 | AMPLICATION_FACTOR: float = 250 / TOTAL_VALI_WEIGHT * (1 - TOTAL_VALI_WEIGHT) 10 | 11 | # Paths of subnet preferences (default) and overall subnet + validator preferences. 12 | DEFAULT_JSON_PATH: str = 'default.json' 13 | AGGREGATE_JSON_PATH: str = 'total.json' 14 | 15 | VALID_SOURCES: dict[str, str] = { 16 | "reddit": "r/", 17 | "x": "#", 18 | "youtube": "", 19 | } -------------------------------------------------------------------------------- /huggingface_utils/encoding_system.py: -------------------------------------------------------------------------------- 1 | """Module for URL encoding and decoding using Fernet encryption.""" 2 | 3 | import base64 4 | import json 5 | import os 6 | import time 7 | from typing import Tuple, Optional 8 | 9 | import pandas as pd 10 | from cryptography.fernet import Fernet 11 | 12 | class EncodingKeyManager: 13 | """Manages the encryption key for URL encoding and decoding.""" 14 | 15 | def __init__(self, key_path: str = 'encoding_key.json'): 16 | """Initialize the EncodingKeyManager with a key file path.""" 17 | self.key_path = key_path 18 | self.sym_key = self._load_or_generate_key() 19 | self.fernet = Fernet(self.sym_key) 20 | 21 | def _load_or_generate_key(self) -> bytes: 22 | """Load an existing key or generate a new one if it doesn't exist.""" 23 | if os.path.exists(self.key_path): 24 | with open(self.key_path, 'r', encoding='utf-8') as f: 25 | key_data = json.load(f) 26 | return key_data['sym_key'].encode() 27 | else: 28 | sym_key = Fernet.generate_key() 29 | self._save_key(sym_key) 30 | return sym_key 31 | 32 | def _save_key(self, sym_key: bytes) -> None: 33 | """Save the symmetric key to a JSON file.""" 34 | key_data = { 35 | 'sym_key': sym_key.decode() 36 | } 37 | with open(self.key_path, 'w', encoding='utf-8') as f: 38 | json.dump(key_data, f) 39 | 40 | def get_fernet(self) -> Fernet: 41 | """Get the Fernet instance for encryption/decryption.""" 42 | return self.fernet 43 | 44 | 45 | class SymKeyEncodingKeyManager(EncodingKeyManager): 46 | """A subclass of EncodingKeyManager that uses a symmetric key directly.""" 47 | 48 | def __init__(self, sym_key: str): 49 | """ 50 | Initialize the SymKeyEncodingKeyManager with a symmetric key. 51 | 52 | Args: 53 | sym_key (str): A base64-encoded symmetric key string. 
54 | """ 55 | self.sym_key = self._validate_and_encode_key(sym_key) 56 | self.fernet = Fernet(self.sym_key) 57 | 58 | def _validate_and_encode_key(self, sym_key: str) -> bytes: 59 | """Validate the provided key and return it as bytes.""" 60 | try: 61 | # Attempt to create a Fernet instance to validate the key 62 | Fernet(sym_key.encode()) 63 | return sym_key.encode() 64 | except Exception as e: 65 | raise ValueError(f"Invalid symmetric key provided: {str(e)}") 66 | 67 | def _load_or_generate_key(self) -> bytes: 68 | """Override to return the provided symmetric key.""" 69 | return self.sym_key 70 | 71 | def _save_key(self, sym_key: bytes) -> None: 72 | """Override to do nothing, as we don't want to save the key to a file.""" 73 | pass 74 | 75 | 76 | def encode_url(url: str, fernet: Fernet) -> Optional[str]: 77 | """Encode a URL using Fernet encryption.""" 78 | try: 79 | encoded = fernet.encrypt(url.encode()) 80 | return base64.urlsafe_b64encode(encoded).decode() 81 | except Exception as e: 82 | print(f"Encryption failed for URL: {url}") 83 | print(f"Error: {str(e)}") 84 | return None 85 | 86 | 87 | def decode_url(encoded_url: str, fernet: Fernet) -> Optional[str]: 88 | """Decode an encoded URL using Fernet decryption.""" 89 | try: 90 | decoded = fernet.decrypt(base64.urlsafe_b64decode(encoded_url.encode())) 91 | return decoded.decode() 92 | except Exception as e: 93 | print(f"Decryption failed for encoded URL: {encoded_url}") 94 | print(f"Error: {str(e)}") 95 | return None 96 | 97 | 98 | def encode_dataframe_column(df: pd.DataFrame, column_name: str, key_manager: EncodingKeyManager) -> pd.DataFrame: 99 | """Encode a column of URLs in a DataFrame.""" 100 | fernet = key_manager.get_fernet() 101 | df[f'{column_name}_encoded'] = df[column_name].apply(lambda url: encode_url(url, fernet)) 102 | return df 103 | 104 | 105 | def decode_dataframe_column(df: pd.DataFrame, column_name: str, key_manager: EncodingKeyManager) -> pd.DataFrame: 106 | """Decode a column of encoded URLs in a DataFrame.""" 107 | fernet = key_manager.get_fernet() 108 | original_column_name = column_name.replace('_encoded', '') 109 | df[original_column_name] = df[column_name].apply(lambda url: decode_url(url, fernet)) 110 | return df 111 | 112 | 113 | def main(): 114 | """Main function to demonstrate URL encoding and decoding.""" 115 | # Initialize EncodingKeyManager 116 | key_manager = EncodingKeyManager() 117 | 118 | # Create a larger sample DataFrame (1 million rows) 119 | n_rows = 1_000_000 120 | urls = [ 121 | 'https://example.com/short_url', 122 | 'https://example.com/medium_length_url_with_some_parameters?param1=value1¶m2=value2', 123 | 'https://example.com/very_long_url_with_many_parameters_and_some_special_characters?param1=value1¶m2=value2¶m3=value3¶m4=value4&special=!@#$%^&*()' 124 | ] 125 | df = pd.DataFrame({ 126 | 'url': urls * (n_rows // len(urls) + 1) 127 | }).head(n_rows) 128 | 129 | # Measure encoding time 130 | start_time = time.time() 131 | df_encoded = encode_dataframe_column(df, 'url', key_manager) 132 | encode_time = time.time() - start_time 133 | print(f"Encoding time for {n_rows} rows: {encode_time:.2f} seconds") 134 | 135 | # Measure decoding time 136 | start_time = time.time() 137 | df_decoded = decode_dataframe_column(df_encoded, 'url_encoded', key_manager) 138 | decode_time = time.time() - start_time 139 | print(f"Decoding time for {n_rows} rows: {decode_time:.2f} seconds") 140 | 141 | # Verify that the decoded URLs match the original 142 | print("\nVerification:") 143 | 
print(df['url'].equals(df_decoded['url'])) 144 | 145 | # Calculate and print rows processed per second 146 | print(f"\nEncoding speed: {n_rows / encode_time:.2f} rows/second") 147 | print(f"Decoding speed: {n_rows / decode_time:.2f} rows/second") 148 | 149 | 150 | if __name__ == "__main__": 151 | main() -------------------------------------------------------------------------------- /huggingface_utils/s3_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import requests 4 | import bittensor as bt 5 | from typing import Dict, Any, Optional 6 | 7 | 8 | class S3Auth: 9 | """Handles S3 authentication with blockchain commitments and Keypair signatures""" 10 | 11 | def __init__(self, s3_auth_url: str): 12 | self.s3_auth_url = s3_auth_url 13 | 14 | def get_credentials(self, 15 | wallet: bt.wallet, 16 | source_name: str, 17 | subtensor: bt.subtensor) -> Optional[Dict[str, Any]]: 18 | """Get S3 credentials using blockchain commitments and hotkey signature""" 19 | try: 20 | coldkey = wallet.get_coldkeypub().ss58_address 21 | hotkey = wallet.hotkey.ss58_address 22 | timestamp = int(time.time()) 23 | 24 | commitment = f"s3:access:{coldkey}:{source_name}:{timestamp}" 25 | # bt.logging.info(f"\ud83d\ude80 Committing to blockchain: {commitment}") todo add if it's going to be necessary 26 | # success = subtensor.commit(wallet=wallet, netuid=netuid, data=commitment) 27 | 28 | # Sign the commitment 29 | signature = wallet.hotkey.sign(commitment.encode()) 30 | signature_hex = signature.hex() 31 | 32 | payload = { 33 | "coldkey": coldkey, 34 | "hotkey": hotkey, 35 | "source": source_name, 36 | "timestamp": timestamp, 37 | "signature": signature_hex 38 | } 39 | 40 | response = requests.post( 41 | f"{self.s3_auth_url.rstrip('/')}/get-folder-access", 42 | json=payload, 43 | timeout=30 44 | ) 45 | 46 | if response.status_code != 200: 47 | try: 48 | error_detail = response.json().get("detail", "Unknown error") 49 | except Exception: 50 | error_detail = response.text or "Unknown error" 51 | bt.logging.error(f"\u274c Failed to get S3 credentials: {error_detail}") 52 | return None 53 | 54 | return response.json() 55 | 56 | except Exception as e: 57 | bt.logging.error(f"\u274c Error getting S3 credentials: {str(e)}") 58 | return None 59 | 60 | def upload_file(self, file_path: str, creds: Dict[str, Any]) -> bool: 61 | try: 62 | key = f"{creds['folder']}{os.path.basename(file_path)}" 63 | post_data = dict(creds['fields']) # clone all fields (V4-compatible) 64 | post_data['key'] = key # overwrite key with actual file key 65 | 66 | with open(file_path, 'rb') as f: 67 | files = {'file': f} 68 | response = requests.post(creds['url'], data=post_data, files=files) 69 | 70 | if response.status_code == 204: 71 | bt.logging.info(f"✅ Upload success: {key}") 72 | return True 73 | else: 74 | bt.logging.error(f"❌ Upload failed: {response.status_code} — {response.text}") 75 | return False 76 | 77 | except Exception as e: 78 | bt.logging.error(f"❌ S3 Upload Exception for {file_path}: {e}") 79 | return False 80 | -------------------------------------------------------------------------------- /neurons/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.3.8" 2 | version_split = __version__.split(".") 3 | __spec_version__ = ( 4 | (1000 * int(version_split[0])) 5 | + (10 * int(version_split[1])) 6 | + (1 * int(version_split[2])) 7 | ) 8 | 
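For reference, here is a minimal, hypothetical sketch of how a miner-side script might call the S3Auth helper defined in huggingface_utils/s3_utils.py above; the auth URL, wallet names, and file path are placeholders, not values shipped with the repository:

```python
# Hypothetical usage of S3Auth (placeholders only; not part of the repository).
import bittensor as bt
from huggingface_utils.s3_utils import S3Auth

wallet = bt.wallet(name="my_coldkey", hotkey="my_hotkey")    # your own wallet names
subtensor = bt.subtensor(network="finney")
s3_auth = S3Auth(s3_auth_url="https://s3-auth.example.com")  # placeholder endpoint

# Request temporary, folder-scoped upload credentials signed with the hotkey.
creds = s3_auth.get_credentials(wallet=wallet, source_name="reddit", subtensor=subtensor)
if creds is not None:
    # Upload a local file into the folder returned by the auth service.
    s3_auth.upload_file("data/reddit_batch.parquet", creds)
```

The helper signs a commitment string with the hotkey, so the auth service can verify who is uploading without any key material other than the signature ever leaving the machine.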
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apify-client==1.6.1 2 | asyncpraw==7.8.0 3 | bittensor==9.7.0 4 | jupyter==1.0.0 5 | numpy==2.0.1 6 | pydantic==2.10.1 7 | python-dotenv==1.0.0 8 | pytz==2023.3.post1 9 | rich==13.7.0 10 | torch==2.5.1 11 | wandb==0.18.7 12 | pandas~=2.2.2 13 | cryptography==43.0.3 14 | requests==2.32.3 15 | huggingface-hub==0.27.1 16 | datasets~=2.20.0 17 | pyarrow==17.0.0 18 | fsspec==2024.5.0 19 | psutil==5.9.8 20 | loguru==0.7.3 21 | google-api-python-client==2.167.0 22 | youtube-transcript-api==1.0.3 23 | isodate==0.7.2 24 | -------------------------------------------------------------------------------- /rewards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/rewards/__init__.py -------------------------------------------------------------------------------- /rewards/data_desirability_lookup.py: -------------------------------------------------------------------------------- 1 | from common import constants 2 | from common.data import DataLabel, DataSource 3 | from rewards.data import DataSourceDesirability, DataDesirabilityLookup 4 | 5 | ################################################################# 6 | 7 | # This list is outdated and is only used as a backup to Dynamic Desirability. 8 | # Please see the folder dynamic_desirability for more information on how reward 9 | # scale factors are constructed. 10 | 11 | ################################################################# 12 | 13 | LOOKUP = DataDesirabilityLookup( 14 | distribution={ 15 | DataSource.REDDIT: DataSourceDesirability( 16 | weight=0.6, 17 | default_scale_factor=0.5, 18 | label_scale_factors={ 19 | DataLabel(value="r/Bitcoin"): 1.0, 20 | DataLabel(value="r/BitcoinCash"): 1.0, 21 | DataLabel(value="r/Bittensor_"): 1.0, 22 | DataLabel(value="r/Btc"): 1.0, 23 | DataLabel(value="r/Cryptocurrency"): 1.0, 24 | DataLabel(value="r/Cryptomarkets"): 1.0, 25 | DataLabel(value="r/EthereumClassic"): 1.0, 26 | DataLabel(value="r/Ethtrader"): 1.0, 27 | DataLabel(value="r/Filecoin"): 1.0, 28 | DataLabel(value="r/Monero"): 1.0, 29 | DataLabel(value="r/Polkadot"): 1.0, 30 | DataLabel(value="r/Solana"): 1.0, 31 | DataLabel(value="r/WallstreetBets"): 1.0, 32 | }, 33 | ), 34 | DataSource.X: DataSourceDesirability( 35 | weight=0.4, 36 | default_scale_factor=0.5, 37 | label_scale_factors={ 38 | DataLabel(value="#bitcoin"): 1.0, 39 | DataLabel(value="#bitcoincharts"): 1.0, 40 | DataLabel(value="#bitcoiner"): 1.0, 41 | DataLabel(value="#bitcoinexchange"): 1.0, 42 | DataLabel(value="#bitcoinmining"): 1.0, 43 | DataLabel(value="#bitcoinnews"): 1.0, 44 | DataLabel(value="#bitcoinprice"): 1.0, 45 | DataLabel(value="#bitcointechnology"): 1.0, 46 | DataLabel(value="#bitcointrading"): 1.0, 47 | DataLabel(value="#bittensor"): 1.0, 48 | DataLabel(value="#btc"): 1.0, 49 | DataLabel(value="#cryptocurrency"): 1.0, 50 | DataLabel(value="#crypto"): 1.0, 51 | DataLabel(value="#defi"): 1.0, 52 | DataLabel(value="#decentralizedfinance"): 1.0, 53 | DataLabel(value="#tao"): 1.0, 54 | }, 55 | ), 56 | }, 57 | max_age_in_hours=constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS * 24, 58 | ) 59 | -------------------------------------------------------------------------------- /rewards/data_value_calculator.py: 
-------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from typing import Optional, List, Dict, Tuple 3 | from common.data import DataSource, TimeBucket, DateRange 4 | from common.data_v2 import ScorableDataEntityBucket 5 | from rewards.data import DataDesirabilityLookup 6 | from scraping.scraper import HFValidationResult 7 | from rewards import data_desirability_lookup 8 | from common import utils 9 | 10 | class DataValueCalculator: 11 | """Calculates how rewards are distributed across DataSources and DataLabels.""" 12 | 13 | def __init__(self, model: DataDesirabilityLookup = data_desirability_lookup.LOOKUP): 14 | # Convert to primitive version for performance optimization 15 | self.model = model.to_primitive_data_desirability_lookup() 16 | 17 | 18 | def get_score_for_data_entity_bucket( 19 | self, 20 | scorable_data_entity_bucket: ScorableDataEntityBucket, 21 | current_time_bucket: TimeBucket 22 | ) -> float: 23 | """Returns the score for the given data entity bucket.""" 24 | # Extract frequently used values 25 | time_bucket_id = scorable_data_entity_bucket.time_bucket_id 26 | label = scorable_data_entity_bucket.label 27 | source = scorable_data_entity_bucket.source 28 | 29 | # Calculate time scalar 30 | time_scalar = self._scale_factor_for_age(time_bucket_id, current_time_bucket.id) 31 | if time_scalar == 0.0: 32 | return 0.0 # No need to do further processing 33 | 34 | # Find matching jobs directly using time bucket ID 35 | # Currently only finds matching jobs where keyword is None. 36 | matching_jobs = self.model.find_matching_jobs(source, None, label, time_bucket_id) 37 | 38 | # Rest of method remains the same... 39 | 40 | data_source_weight = self.model.get_data_source_weight(scorable_data_entity_bucket.source) 41 | 42 | if matching_jobs: 43 | # Calculate score based on matching jobs 44 | total_score = 0.0 45 | for job in matching_jobs: 46 | # Get job weight 47 | job_weight = job["job_weight"] 48 | 49 | # Calculate time scalar 50 | if job["start_timebucket"] or job["end_timebucket"]: 51 | # For jobs with date constraints, if we've reached here, the time bucket 52 | # overlaps with the job's date range, so use full time scalar of 1.0 53 | time_scalar = 1.0 54 | else: 55 | # For jobs without date constraints, use linear depreciation 56 | time_scalar = self._scale_factor_for_age( 57 | scorable_data_entity_bucket.time_bucket_id, 58 | current_time_bucket.id 59 | ) 60 | 61 | # Add this job's contribution to total score 62 | contribution = data_source_weight * job_weight * time_scalar * scorable_data_entity_bucket.scorable_bytes 63 | total_score += contribution 64 | 65 | return total_score 66 | else: 67 | # No matching jobs - use default scale factor 68 | default_scale_factor = self.model.get_default_scale_factor(scorable_data_entity_bucket.source) 69 | time_scalar = self._scale_factor_for_age( 70 | scorable_data_entity_bucket.time_bucket_id, 71 | current_time_bucket.id 72 | ) 73 | 74 | return ( 75 | data_source_weight 76 | * default_scale_factor 77 | * time_scalar 78 | * scorable_data_entity_bucket.scorable_bytes 79 | ) 80 | 81 | 82 | def _scale_factor_for_age( 83 | self, time_bucket_id: int, current_time_bucket_id: int 84 | ) -> float: 85 | """Returns the score scalar for data age. 
86 | 87 | Uses a linear depreciation function: 88 | - Current data is scored 1.0 89 | - Data at max_age_in_hours is scored 0.5 90 | - Older data is scored 0.0 91 | """ 92 | # Data age is scored using a linear depreciation function, where data from now is scored 1 and data 93 | # that is max_age_in_hours old is scored 0.5. 94 | # All data older than max_age_in_hours is scored 0. 95 | 96 | # Note: This makes the assumption that TimeBuckets are 1 hour buckets, which isn't ideal, 97 | # but we make the trade-off because it has a notable impact on perf vs. constructing TimeBuckets 98 | # to compute the age in hours. 99 | data_age_in_hours = current_time_bucket_id - time_bucket_id 100 | 101 | # Safe guard against future data. 102 | data_age_in_hours = max(0, data_age_in_hours) 103 | 104 | if data_age_in_hours > self.model.max_age_in_hours: 105 | return 0.0 106 | return 1.0 - (data_age_in_hours / (2 * self.model.max_age_in_hours)) 107 | 108 | -------------------------------------------------------------------------------- /scraping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/__init__.py -------------------------------------------------------------------------------- /scraping/apify.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Optional 3 | from apify_client import ApifyClientAsync 4 | from pydantic import BaseModel, Field, PositiveInt 5 | import bittensor as bt 6 | 7 | from dotenv import load_dotenv 8 | 9 | from common.data import StrictBaseModel 10 | 11 | load_dotenv() 12 | 13 | 14 | class RunConfig(StrictBaseModel): 15 | """Configuration parameters for a single Apify Actor run.""" 16 | 17 | api_key: str = Field( 18 | description="The Apify API token.", 19 | default=os.getenv("APIFY_API_TOKEN"), 20 | min_length=1, # Can't be empty. 21 | ) 22 | 23 | actor_id: str = Field( 24 | description="The ID of the actor to run.", 25 | min_length=1, # Can't be empty. 26 | ) 27 | 28 | timeout_secs: PositiveInt = Field( 29 | description="The timeout for the actor run.", 30 | default=180, 31 | ) 32 | 33 | max_data_entities: PositiveInt = Field( 34 | description="The maximum number of items to be returned by the actor. The client will not be charged for more items than this value.", 35 | default=100, 36 | ) 37 | 38 | debug_info: str = Field( 39 | description="Optional debug info to include in logs relating to this run." 40 | ) 41 | 42 | memory_mb: Optional[int] = Field( 43 | description="The amount of memory in mb to use for this run.", default=None 44 | ) 45 | 46 | 47 | class ActorRunError(Exception): 48 | """Exception raised when an actor run fails.""" 49 | 50 | def __init__(self, message: str): 51 | self.message = message 52 | super().__init__(self.message) 53 | 54 | 55 | class ActorRunner: 56 | def __init__(self): 57 | pass 58 | 59 | async def run(self, config: RunConfig, run_input: dict) -> List[dict]: 60 | """ 61 | Run an Apify actor and return the json results. 62 | 63 | Args: 64 | config (ActorConfig): The configuration to use for running the actor. 65 | run_input (dict): The input parameters for the actor run. 66 | 67 | Raises: 68 | ActorRunError: If the actor run fails, raises an exception, with the run details in the exception message. 69 | 70 | Returns: 71 | list[dict]: List of items fetched from the dataset. 
72 | """ 73 | 74 | client = ApifyClientAsync(config.api_key) 75 | 76 | run = await client.actor(config.actor_id).call( 77 | run_input=run_input, 78 | max_items=config.max_data_entities, 79 | timeout_secs=config.timeout_secs, 80 | # If not set, the client will wait indefinitely for the run to finish. Ensure we don't wait forever. 81 | wait_secs=config.timeout_secs + 5, 82 | memory_mbytes=config.memory_mb, 83 | ) 84 | 85 | # We want a success status. Timeout is also okay because it will return partial results. 86 | if "status" not in run or not ( 87 | run["status"].casefold() == "SUCCEEDED".casefold() 88 | or run["status"].casefold() == "TIMED-OUT".casefold() 89 | ): 90 | raise ActorRunError( 91 | f"Actor ({config.actor_id}) [{config.debug_info}] failed: {run}" 92 | ) 93 | iterator = client.dataset(run["defaultDatasetId"]).iterate_items() 94 | items = [i async for i in iterator] 95 | 96 | return items 97 | -------------------------------------------------------------------------------- /scraping/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/config/__init__.py -------------------------------------------------------------------------------- /scraping/config/config_reader.py: -------------------------------------------------------------------------------- 1 | from scraping.config import model 2 | from scraping import coordinator 3 | 4 | class ConfigReader: 5 | """A class to read the scraping config from a json file.""" 6 | 7 | @classmethod 8 | def load_config(cls, filepath: str) -> coordinator.CoordinatorConfig: 9 | """Loads the scraping config from json and returns it as a CoordinatorConfig. 10 | 11 | Raises: 12 | ValidationError: if the file content is not valid. 13 | """ 14 | 15 | print(f"Loading file: {filepath}") 16 | parsed_file = model.ScrapingConfig.parse_file(path=filepath) 17 | print(f"Got parsed file: {parsed_file}") 18 | return parsed_file.to_coordinator_config() -------------------------------------------------------------------------------- /scraping/config/model.py: -------------------------------------------------------------------------------- 1 | """This file contains the pydantic classes for the scraping config JSON file. 2 | 3 | We use JSON for the configuring the scraping distribution config to make it easier 4 | for miner's to customize their miners, while still being able to take advantage of 5 | auto-updates, in future. 6 | 7 | The classes here are ~identical to their sibling classes in scraping/scraper.py, except 8 | they contain natively serializable/deseriazable fields. All code should use the classes 9 | in scraping/scraper.py. These classes are only intended to be used for deserializing 10 | the scraping_config JSON file. 11 | """ 12 | 13 | from typing import List, Optional 14 | from pydantic import BaseModel, Field, PositiveInt, ConfigDict 15 | from common import constants 16 | from common.data import DataLabel, StrictBaseModel 17 | from scraping import coordinator 18 | from scraping.scraper import ScraperId 19 | 20 | 21 | class LabelScrapingConfig(StrictBaseModel): 22 | """Describes what labels to scrape.""" 23 | 24 | model_config = ConfigDict() 25 | 26 | label_choices: Optional[List[str]] = Field( 27 | description="""The collection of labels to choose from when performing a scrape. 28 | On a given scrape, 1 label will be chosen at random from this list. 
29 | 30 | An empty list is treated as a non-existant label. In that case, no filter is applied when scraping data from this source. 31 | """ 32 | ) 33 | 34 | max_age_hint_minutes: int = Field( 35 | description="""The maximum age of data that this scrape should fetch. A random TimeBucket (currently hour block), 36 | will be chosen within the time frame (now - max_age_hint_minutes, now), using a probality distribution aligned 37 | with how validators score data freshness. 38 | 39 | Note: not all data sources provide date filters, so this property should be thought of as a hint to the scraper, not a rule. 40 | """, 41 | default=60 * 24 * constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS, 42 | ) 43 | 44 | max_data_entities: Optional[PositiveInt] = Field( 45 | default=None, 46 | description="The maximum number of items to fetch in a single scrape for this label. If None, the scraper will fetch as many items possible.", 47 | ) 48 | 49 | def to_coordinator_label_scrape_config(self) -> coordinator.LabelScrapingConfig: 50 | """Returns the internal LabelScrapingConfig representation""" 51 | labels = ( 52 | [DataLabel(value=val) for val in self.label_choices] 53 | if self.label_choices 54 | else None 55 | ) 56 | return coordinator.LabelScrapingConfig( 57 | label_choices=labels, 58 | max_age_hint_minutes=self.max_age_hint_minutes, 59 | max_data_entities=self.max_data_entities, 60 | ) 61 | 62 | 63 | class ScraperConfig(StrictBaseModel): 64 | """Configures a specific scraper.""" 65 | 66 | model_config = ConfigDict() 67 | 68 | scraper_id: ScraperId = Field(description="The scraper being configured.") 69 | 70 | cadence_seconds: PositiveInt = Field( 71 | description="""Configures how often to scrape from this data source, measured in seconds.""" 72 | ) 73 | 74 | labels_to_scrape: List[LabelScrapingConfig] = Field( 75 | description="""Describes the type of data to scrape from this source. 76 | 77 | The scraper will perform one scrape per entry in this list every 'cadence_seconds'. 78 | """ 79 | ) 80 | 81 | def to_coordinator_scraper_config(self) -> coordinator.ScraperConfig: 82 | """Returns the internal ScraperConfig representation""" 83 | return coordinator.ScraperConfig( 84 | cadence_seconds=self.cadence_seconds, 85 | labels_to_scrape=[ 86 | label.to_coordinator_label_scrape_config() 87 | for label in self.labels_to_scrape 88 | ], 89 | ) 90 | 91 | 92 | class ScrapingConfig(StrictBaseModel): 93 | """Configuration for all scrapers.""" 94 | 95 | model_config = ConfigDict() 96 | 97 | scraper_configs: List[ScraperConfig] = Field( 98 | description="The list of scrapers (and their scraping config) this miner should scrape from. Only scrapers in this list will be used." 
99 | ) 100 | 101 | def to_coordinator_config(self) -> coordinator.CoordinatorConfig: 102 | """Returns the CoordinatorConfig.""" 103 | ids_and_configs = [ 104 | [config.scraper_id, config.to_coordinator_scraper_config()] 105 | for config in self.scraper_configs 106 | ] 107 | return coordinator.CoordinatorConfig( 108 | scraper_configs={id: config for id, config in ids_and_configs} 109 | ) -------------------------------------------------------------------------------- /scraping/config/scraping_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "scraper_configs": [ 3 | { 4 | "scraper_id": "X.apidojo", 5 | "cadence_seconds": 300, 6 | "labels_to_scrape": [ 7 | { 8 | "label_choices": [ 9 | "#bitcoin", 10 | "#bitcoincharts", 11 | "#bitcoiner", 12 | "#bitcoinexchange", 13 | "#bitcoinmining", 14 | "#bitcoinnews", 15 | "#bitcoinprice", 16 | "#bitcointechnology", 17 | "#bitcointrading", 18 | "#bittensor", 19 | "#btc", 20 | "#cryptocurrency", 21 | "#crypto", 22 | "#defi", 23 | "#decentralizedfinance", 24 | "#tao" 25 | ], 26 | "max_data_entities": 75 27 | } 28 | ] 29 | }, 30 | { 31 | "scraper_id": "Reddit.custom", 32 | "cadence_seconds": 60, 33 | "labels_to_scrape": [ 34 | { 35 | "label_choices": [ 36 | "r/bittensor_", 37 | "r/bitcoin", 38 | "r/BitcoinCash", 39 | "r/Bittensor_", 40 | "r/Btc", 41 | "r/Cryptocurrency", 42 | "r/Cryptomarkets", 43 | "r/EthereumClassic", 44 | "r/Ethtrader", 45 | "r/Filecoin", 46 | "r/Monero", 47 | "r/Polkadot", 48 | "r/Solana", 49 | "r/WallstreetBets" 50 | ], 51 | "max_data_entities": 100 52 | } 53 | ] 54 | }, 55 | { 56 | "scraper_id": "YouTube.transcript", 57 | "cadence_seconds": 100, 58 | "labels_to_scrape": [ 59 | { 60 | "label_choices": [ 61 | "#ytc_c_UCAuUUnT6oDeKwE6v1NGQxug", 62 | "#ytc_c_UCYO_jab_esuFRV4b17AJtAw", 63 | "#ytc_c_UCsXVk37bltHxD1rDPwtNM8Q", 64 | "#ytc_c_UCSHZKyawb77ixDdsGog4iWA", 65 | "#ytc_c_UCR93yACeNzxMSk6Y1cHM2pA", 66 | "#ytc_c_UCbLhGKVY-bJPcawebgtNfbw" 67 | ], 68 | "max_data_entities": 50, 69 | "max_age_hint_minutes": 43200 70 | } 71 | ] 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /scraping/provider.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from typing import Callable, Dict 3 | from common.data import DataSource 4 | from scraping.reddit.reddit_lite_scraper import RedditLiteScraper 5 | from scraping.reddit.reddit_custom_scraper import RedditCustomScraper 6 | from scraping.scraper import Scraper, ScraperId 7 | from scraping.x.microworlds_scraper import MicroworldsTwitterScraper 8 | from scraping.x.apidojo_scraper import ApiDojoTwitterScraper 9 | from scraping.x.quacker_url_scraper import QuackerUrlScraper 10 | from scraping.youtube.youtube_custom_scraper import YouTubeTranscriptScraper 11 | 12 | 13 | DEFAULT_FACTORIES = { 14 | ScraperId.REDDIT_LITE: RedditLiteScraper, 15 | # For backwards compatibility with old configs, remap x.flash to x.apidojo. 
16 | ScraperId.X_FLASH: MicroworldsTwitterScraper, 17 | ScraperId.REDDIT_CUSTOM: RedditCustomScraper, 18 | ScraperId.X_MICROWORLDS: MicroworldsTwitterScraper, 19 | ScraperId.X_APIDOJO: ApiDojoTwitterScraper, 20 | ScraperId.X_QUACKER: QuackerUrlScraper, 21 | ScraperId.YOUTUBE_TRANSCRIPT: YouTubeTranscriptScraper 22 | } 23 | 24 | 25 | class ScraperProvider: 26 | """A scraper provider will provide the correct scraper based on the source to be scraped.""" 27 | 28 | def __init__( 29 | self, factories: Dict[DataSource, Callable[[], Scraper]] = DEFAULT_FACTORIES 30 | ): 31 | self.factories = factories 32 | 33 | def get(self, scraper_id: ScraperId) -> Scraper: 34 | """Returns a scraper for the given scraper id.""" 35 | 36 | assert scraper_id in self.factories, f"Scraper id {scraper_id} not supported." 37 | 38 | return self.factories[scraper_id]() 39 | -------------------------------------------------------------------------------- /scraping/reddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/reddit/__init__.py -------------------------------------------------------------------------------- /scraping/reddit/model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from enum import Enum 3 | from typing import Optional 4 | 5 | # Use v1 for these models to keep serialization consistent. 6 | # Pydantic v2 doesn't include spaces in its serialization. 7 | from pydantic.v1 import BaseModel, Field 8 | 9 | 10 | from common import constants 11 | from common.data import DataEntity, DataLabel, DataSource 12 | from scraping import utils 13 | 14 | # The username used for deleted users. 15 | # This is the value returned by the Apify lite scraper. 16 | # Other scrapers may need to adapt their code to use this value. 17 | DELETED_USER = "[deleted]" 18 | 19 | 20 | class RedditDataType(str, Enum): 21 | POST = "post" 22 | COMMENT = "comment" 23 | 24 | 25 | class RedditContent(BaseModel): 26 | """The content model for Reddit data. 27 | 28 | Useful to standardize the representation of Reddit data, that could be scraped from different sources. 29 | """ 30 | 31 | class Config: 32 | extra = "forbid" 33 | 34 | id: str = Field(description="The unique ID of the post/comment") 35 | url: str = Field( 36 | description="URL of the post/comment", 37 | ) 38 | username: str 39 | community: str = Field( 40 | alias="communityName", description="The subreddit. Includes the 'r/' prefix" 41 | ) 42 | body: str = Field() 43 | created_at: dt.datetime = Field(alias="createdAt") 44 | data_type: RedditDataType = Field(alias="dataType") 45 | 46 | # Post-only fields. 47 | title: Optional[str] = Field( 48 | description="Title of the post. Empty for comments", default=None 49 | ) 50 | 51 | # Comment-only fields. 52 | parent_id: Optional[str] = Field( 53 | description="The ID of the parent comment. 
Only applicable to comments.", 54 | alias="parentId", 55 | default=None, 56 | ) 57 | 58 | @classmethod 59 | def to_data_entity(cls, content: "RedditContent") -> DataEntity: 60 | """Converts the RedditContent to a DataEntity.""" 61 | entity_created_at = content.created_at 62 | content.created_at = utils.obfuscate_datetime_to_minute(entity_created_at) 63 | content_bytes = content.json(by_alias=True).encode("utf-8") 64 | 65 | return DataEntity( 66 | uri=content.url, 67 | datetime=entity_created_at, 68 | source=DataSource.REDDIT, 69 | label=DataLabel( 70 | value=content.community.lower()[: constants.MAX_LABEL_LENGTH] 71 | ), 72 | content=content_bytes, 73 | content_size_bytes=len(content_bytes), 74 | ) 75 | 76 | @classmethod 77 | def from_data_entity(cls, data_entity: DataEntity) -> "RedditContent": 78 | """Converts a DataEntity to a RedditContent.""" 79 | 80 | return RedditContent.parse_raw(data_entity.content.decode("utf-8")) 81 | -------------------------------------------------------------------------------- /scraping/scraper.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from enum import Enum 3 | from typing import Dict, List, Optional 4 | from pydantic import BaseModel, Field, PositiveInt, ConfigDict 5 | 6 | from common.data import DataEntity, DataLabel, DataSource, StrictBaseModel 7 | from common.date_range import DateRange 8 | from storage.miner.miner_storage import MinerStorage 9 | 10 | 11 | class ScraperId(str, Enum): 12 | """The id for each of the scrapers.""" 13 | 14 | REDDIT_LITE = "Reddit.lite" 15 | X_FLASH = "X.flash" 16 | REDDIT_CUSTOM = "Reddit.custom" 17 | X_MICROWORLDS = "X.microworlds" 18 | X_APIDOJO = "X.apidojo" 19 | X_QUACKER = "X.quacker" 20 | YOUTUBE_TRANSCRIPT = "YouTube.transcript" 21 | 22 | 23 | class ValidationResult(StrictBaseModel): 24 | """Data class to contain the result of a scraping validation.""" 25 | 26 | model_config = ConfigDict(frozen=True) 27 | 28 | is_valid: bool 29 | content_size_bytes_validated: int = Field( 30 | description="The content size in bytes validated as part of this check", ge=0 31 | ) 32 | reason: str = Field( 33 | description="An optional reason for the validation result.", 34 | default="", 35 | ) 36 | 37 | 38 | class HFValidationResult(StrictBaseModel): 39 | """Data class to contain the result of a validation for a miner's Hugging Face dataset. """ 40 | 41 | class Config: 42 | frozen = True 43 | 44 | is_valid: bool 45 | 46 | validation_percentage: float = Field( 47 | description="The percentage of successfully validated HF rows. " 48 | ) 49 | 50 | reason: str = Field( 51 | description="An optional reason for the validation result. ", 52 | default="" 53 | ) 54 | 55 | 56 | class ScrapeConfig(StrictBaseModel): 57 | """Data class to contain the configuration to be used for scraping.""" 58 | 59 | model_config = ConfigDict(frozen=True) 60 | 61 | entity_limit: Optional[PositiveInt] 62 | date_range: DateRange 63 | labels: Optional[List[DataLabel]] = Field( 64 | default=None, 65 | description="Optional labels to filter the scrape by. 
If none are provided, the data source will issue a scrape for 'all' data, without any label filters applied", 66 | ) 67 | 68 | 69 | class LabelScrapingFrequency(StrictBaseModel): 70 | """Data class to contain the frequency distribution for a set of labels.""" 71 | 72 | model_config = ConfigDict(frozen=True) 73 | 74 | labels: List[DataLabel] 75 | frequency: float 76 | 77 | 78 | class SourceScrapingFrequency(StrictBaseModel): 79 | """Data class to contain the frequency distribution for a source across labels.""" 80 | 81 | model_config = ConfigDict(frozen=True) 82 | 83 | source: DataSource 84 | frequency: float 85 | label_frequencies: List[LabelScrapingFrequency] 86 | 87 | 88 | class ScrapingDistribution(StrictBaseModel): 89 | """A relative distribution across sources and labels.""" 90 | 91 | model_config = ConfigDict(frozen=True) 92 | 93 | distribution: List[SourceScrapingFrequency] 94 | 95 | 96 | class Scraper(abc.ABC): 97 | """An abstract base class for scrapers across all data sources.""" 98 | 99 | @abc.abstractmethod 100 | async def validate(self, entities: List[DataEntity]) -> List[ValidationResult]: 101 | """Validate the correctness of a list of DataEntities by URI.""" 102 | pass 103 | 104 | @abc.abstractmethod 105 | async def scrape(self, scrape_config: ScrapeConfig) -> List[DataEntity]: 106 | """Scrapes a batch of data based on the specified ScrapeConfig.""" 107 | pass 108 | 109 | @abc.abstractmethod 110 | async def validate_hf(self, entities) -> bool: 111 | """Validate the correctness of a list of HF retrieved data""" 112 | pass -------------------------------------------------------------------------------- /scraping/utils.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | 3 | 4 | def obfuscate_datetime_to_minute(datetime_to_obfuscate: dt.datetime) -> dt.datetime: 5 | """_summary_ 6 | 7 | Args: 8 | datetime_to_obfuscate (dt.datetime): Datetime to generate an obfuscated version of. 9 | 10 | Returns: 11 | dt.datetime: obfuscated datetime. 12 | """ 13 | return datetime_to_obfuscate.replace(second=0, microsecond=0) 14 | -------------------------------------------------------------------------------- /scraping/x/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/scraping/x/__init__.py -------------------------------------------------------------------------------- /scraping/x/model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import json 3 | from typing import Dict, List, Optional 4 | # Use v1 for these models to keep serialization consistent. 5 | # Pydantic v2 doesn't include spaces in its serialization. 6 | from pydantic.v1 import BaseModel, Field 7 | 8 | from common import constants 9 | from common.data import DataEntity, DataLabel, DataSource 10 | from scraping import utils 11 | 12 | 13 | class XContent(BaseModel): 14 | """The content model for tweets. 15 | 16 | The model helps standardize the data format for tweets, even if they're scraped using different methods. 17 | """ 18 | 19 | class Config: 20 | extra = "forbid" 21 | 22 | # model_config should NOT be set by Miners. 23 | # In the near future, Validators will penalized Miners who set this field. 
24 | model_config: Dict[str, str] = Field(default=None) 25 | 26 | username: str 27 | text: str 28 | url: str 29 | timestamp: dt.datetime 30 | tweet_hashtags: List[str] = Field( 31 | default_factory=list, 32 | description="A list of hashtags associated with the tweet, in order they appear in the tweet. Note: it's critical this ordering is respected as the first tag is used as the DataLabel for the index.", 33 | ) 34 | media: Optional[List[str]] = Field( 35 | default=None, 36 | description="A list of media URLs associated with the tweet. Can be None if no media is present.", 37 | ) 38 | 39 | # Enhanced fields 40 | user_id: Optional[str] = None 41 | user_display_name: Optional[str] = None 42 | user_verified: Optional[bool] = None 43 | 44 | # Non-dynamic tweet metadata 45 | tweet_id: Optional[str] = None 46 | is_reply: Optional[bool] = None 47 | is_quote: Optional[bool] = None 48 | 49 | # Additional metadata 50 | conversation_id: Optional[str] = None 51 | in_reply_to_user_id: Optional[str] = None 52 | 53 | @classmethod 54 | def to_data_entity(cls, content: "XContent") -> DataEntity: 55 | """Converts the XContent to a DataEntity.""" 56 | entity_timestamp = content.timestamp 57 | content.timestamp = utils.obfuscate_datetime_to_minute(entity_timestamp) 58 | content_bytes = content.json(exclude_none=True).encode("utf-8") 59 | 60 | return DataEntity( 61 | uri=content.url, 62 | datetime=entity_timestamp, 63 | source=DataSource.X, 64 | label=( 65 | DataLabel( 66 | value=content.tweet_hashtags[0].lower()[ 67 | : constants.MAX_LABEL_LENGTH 68 | ] 69 | ) 70 | if content.tweet_hashtags 71 | else None 72 | ), 73 | content=content_bytes, 74 | content_size_bytes=len(content_bytes), 75 | ) 76 | 77 | @classmethod 78 | def from_data_entity(cls, data_entity: DataEntity) -> "XContent": 79 | """Converts a DataEntity to an XContent.""" 80 | content_str = data_entity.content.decode("utf-8") 81 | return XContent.parse_raw(content_str) -------------------------------------------------------------------------------- /scraping/youtube/model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | from typing import Dict, List, Optional 3 | from pydantic.v1 import BaseModel, Field 4 | from common.data import DataEntity, DataLabel, DataSource 5 | 6 | 7 | class YouTubeContent(BaseModel): 8 | """The content model for YouTube transcripts. 9 | 10 | This model standardizes how YouTube transcript data is stored, 11 | regardless of how it was scraped. 
12 | """ 13 | 14 | class Config: 15 | extra = "forbid" 16 | 17 | video_id: str = Field( 18 | description="The YouTube video ID (e.g., 'dQw4w9WgXcQ')" 19 | ) 20 | 21 | title: str = Field( 22 | description="The title of the YouTube video" 23 | ) 24 | 25 | channel_id: str = Field( 26 | description="The YouTube channel ID" 27 | ) 28 | 29 | channel_name: str = Field( 30 | description="The name of the YouTube channel" 31 | ) 32 | 33 | upload_date: dt.datetime = Field( 34 | description="The date the video was uploaded" 35 | ) 36 | 37 | transcript: List[Dict] = Field( 38 | description="The transcript of the video, as a list of dictionaries with 'text', 'start', and 'duration' keys", 39 | default_factory=list 40 | ) 41 | 42 | url: str = Field( 43 | description="The URL of the YouTube video" 44 | ) 45 | 46 | language: str = Field( 47 | description="The language of the transcript", 48 | default="en" 49 | ) 50 | 51 | duration_seconds: int = Field( 52 | description="The duration of the video in seconds", 53 | default=0 54 | ) 55 | 56 | @classmethod 57 | def to_data_entity(cls, content: "YouTubeContent", original_label: Optional[str] = None) -> DataEntity: 58 | """Converts the YouTubeContent to a DataEntity. 59 | 60 | Args: 61 | content: The YouTubeContent object to convert 62 | original_label: The original label type that was used for scraping (optional) 63 | 64 | Returns: 65 | A DataEntity with the appropriate label 66 | """ 67 | entity_timestamp = content.upload_date 68 | content_bytes = content.json(exclude_none=True).encode("utf-8") 69 | 70 | # Create a DataLabel - ALWAYS use NEW format for output, but check BOTH old and new for input 71 | if original_label and (original_label.startswith('#youtube_v_') or original_label.startswith('#ytc_v_')): 72 | # If scraped with a video label, use NEW video label format 73 | label = DataLabel(value=f"#ytc_v_{content.video_id}") 74 | else: 75 | # Default to NEW channel label format 76 | label = DataLabel(value=f"#ytc_c_{content.channel_id}") 77 | 78 | return DataEntity( 79 | uri=content.url, 80 | datetime=entity_timestamp, 81 | source=DataSource.YOUTUBE, 82 | label=label, 83 | content=content_bytes, 84 | content_size_bytes=len(content_bytes), 85 | ) 86 | 87 | @classmethod 88 | def from_data_entity(cls, data_entity: DataEntity) -> "YouTubeContent": 89 | """Converts a DataEntity to a YouTubeContent.""" 90 | content_str = data_entity.content.decode("utf-8") 91 | return YouTubeContent.parse_raw(content_str) -------------------------------------------------------------------------------- /scraping/youtube/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlparse, parse_qs 3 | 4 | 5 | def extract_video_id(url: str) -> str: 6 | """ 7 | Extracts the YouTube video ID from a YouTube URL. 8 | 9 | Args: 10 | url: The YouTube video URL. 11 | 12 | Returns: 13 | The YouTube video ID or an empty string if no ID could be extracted. 
14 | """ 15 | if not url: 16 | return "" 17 | 18 | # Standard YouTube URLs like https://www.youtube.com/watch?v=dQw4w9WgXcQ 19 | parsed_url = urlparse(url) 20 | if parsed_url.netloc in ('youtube.com', 'www.youtube.com'): 21 | query_params = parse_qs(parsed_url.query) 22 | if 'v' in query_params: 23 | return query_params['v'][0] 24 | 25 | # Short YouTube URLs like https://youtu.be/dQw4w9WgXcQ 26 | if parsed_url.netloc == 'youtu.be': 27 | return parsed_url.path.strip('/') 28 | 29 | # Embedded YouTube URLs like https://www.youtube.com/embed/dQw4w9WgXcQ 30 | if parsed_url.netloc in ('youtube.com', 'www.youtube.com') and '/embed/' in parsed_url.path: 31 | return parsed_url.path.split('/embed/')[1].split('/')[0].split('?')[0] 32 | 33 | # Try to find a video ID pattern in the URL 34 | video_id_pattern = r'(?:v=|v\/|embed\/|youtu\.be\/|\/v\/|\/e\/|watch\?v=|youtube.com\/v\/|youtube.com\/embed\/|youtu.be\/|v=|e=|u\/\w+\/|embed\?video_id=|\/videos\/|\/embed\/|\/v\/|watch\?.*v=|youtube.com\/embed\/)([\w-]{11})' 35 | match = re.search(video_id_pattern, url) 36 | if match: 37 | return match.group(1) 38 | 39 | return "" 40 | 41 | 42 | def normalize_youtube_url(url: str) -> str: 43 | """ 44 | Normalizes a YouTube URL to a standard form. 45 | 46 | Args: 47 | url: The YouTube URL to normalize. 48 | 49 | Returns: 50 | The normalized URL or the original if no normalization is possible. 51 | """ 52 | video_id = extract_video_id(url) 53 | if video_id: 54 | return f"https://www.youtube.com/watch?v={video_id}" 55 | return url 56 | 57 | 58 | def validate_youtube_content(actual_content, entity_to_validate, threshold=0.8): 59 | """ 60 | Validates a YouTube content entity against an actual content. 61 | 62 | Args: 63 | actual_content: The actual YouTube content from the API. 64 | entity_to_validate: The entity that needs validation. 65 | threshold: The similarity threshold for text comparison. 66 | 67 | Returns: 68 | A tuple (is_valid, reason) where is_valid is a boolean and reason is a string. 69 | """ 70 | # Check if the video IDs match 71 | if actual_content.video_id != entity_to_validate.video_id: 72 | return False, "Video IDs do not match" 73 | 74 | # Check if the upload dates are within a reasonable range 75 | # (YouTube may show slightly different timestamps depending on time zones) 76 | date_difference = abs((actual_content.upload_date - entity_to_validate.upload_date).total_seconds()) 77 | if date_difference > 86400: # More than 24 hours difference 78 | return False, "Upload dates do not match" 79 | 80 | # Check if the titles are similar enough 81 | if not texts_are_similar(actual_content.title, entity_to_validate.title, threshold): 82 | return False, "Titles do not match" 83 | 84 | # Check if the transcripts are similar enough 85 | if not transcripts_are_similar(actual_content.transcript, entity_to_validate.transcript, threshold): 86 | return False, "Transcripts do not match" 87 | 88 | return True, "Content is valid" 89 | 90 | 91 | def texts_are_similar(text1, text2, threshold=0.8): 92 | """ 93 | Check if two texts are similar enough. 94 | 95 | Args: 96 | text1: First text. 97 | text2: Second text. 98 | threshold: Similarity threshold (0-1). 99 | 100 | Returns: 101 | True if the texts are similar enough, False otherwise. 
102 | """ 103 | if not text1 or not text2: 104 | return text1 == text2 105 | 106 | # Simple approach: check if enough words from one text appear in the other 107 | words1 = set(text1.lower().split()) 108 | words2 = set(text2.lower().split()) 109 | 110 | # Calculate overlap ratio 111 | overlap = len(words1.intersection(words2)) 112 | similarity = overlap / max(len(words1), len(words2)) 113 | 114 | return similarity >= threshold 115 | 116 | 117 | def transcripts_are_similar(transcript1, transcript2, threshold=0.8): 118 | """ 119 | Check if two transcripts are similar enough. 120 | 121 | Args: 122 | transcript1: First transcript (list of dicts with 'text' keys). 123 | transcript2: Second transcript (list of dicts with 'text' keys). 124 | threshold: Similarity threshold (0-1). 125 | 126 | Returns: 127 | True if the transcripts are similar enough, False otherwise. 128 | """ 129 | if not transcript1 or not transcript2: 130 | return transcript1 == transcript2 131 | 132 | # Extract text from both transcripts 133 | text1 = " ".join([item.get('text', '') for item in transcript1]) 134 | text2 = " ".join([item.get('text', '') for item in transcript2]) 135 | 136 | return texts_are_similar(text1, text2, threshold) -------------------------------------------------------------------------------- /scripts/start_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script runs a validator process and automatically updates it when a new version is released. 3 | Command-line arguments will be forwarded to validator (`neurons/validator.py`), so you can pass 4 | them like this: 5 | python3 scripts/start_validator.py --wallet.name=my-wallet 6 | Auto-updates are enabled by default and will make sure that the latest version is always running 7 | by pulling the latest version from git and upgrading python packages. This is done periodically. 8 | Local changes may prevent the update, but they will be preserved. 9 | 10 | To disable auto-updates, pass --no_autoupdate. 11 | 12 | The script will use the same virtual environment as the one used to run it. If you want to run 13 | validator within virtual environment, run this auto-update script from the virtual environment. 14 | 15 | Pm2 is required for this script. This script will start a pm2 process using the name provided by 16 | the --pm2_name argument. 17 | """ 18 | import argparse 19 | import logging 20 | import subprocess 21 | import sys 22 | import time 23 | from datetime import timedelta 24 | from pathlib import Path 25 | from shlex import split 26 | from typing import List 27 | 28 | log = logging.getLogger(__name__) 29 | UPDATES_CHECK_TIME = timedelta(minutes=15) 30 | ROOT_DIR = Path(__file__).parent.parent 31 | 32 | 33 | def get_version() -> str: 34 | """Extract the version as current git commit hash""" 35 | result = subprocess.run( 36 | split("git rev-parse HEAD"), 37 | check=True, 38 | capture_output=True, 39 | cwd=ROOT_DIR, 40 | ) 41 | commit = result.stdout.decode().strip() 42 | assert len(commit) == 40, f"Invalid commit hash: {commit}" 43 | return commit[:8] 44 | 45 | 46 | def start_validator_process(pm2_name: str, args: List[str]) -> subprocess.Popen: 47 | """ 48 | Spawn a new python process running neurons.validator. 49 | `sys.executable` ensures thet the same python interpreter is used as the one 50 | used to run this auto-updater. 
51 | """ 52 | assert sys.executable, "Failed to get python executable" 53 | 54 | log.info("Starting validator process with pm2, name: %s", pm2_name) 55 | process = subprocess.Popen( 56 | ( 57 | "pm2", 58 | "start", 59 | sys.executable, 60 | "--name", 61 | pm2_name, 62 | "--", 63 | "-m", 64 | "neurons.validator", 65 | *args, 66 | ), 67 | cwd=ROOT_DIR, 68 | ) 69 | process.pm2_name = pm2_name 70 | 71 | return process 72 | 73 | 74 | def stop_validator_process(process: subprocess.Popen) -> None: 75 | """Stop the validator process""" 76 | subprocess.run(("pm2", "delete", process.pm2_name), cwd=ROOT_DIR, check=True) 77 | 78 | 79 | def pull_latest_version() -> None: 80 | """ 81 | Pull the latest version from git. 82 | This uses `git pull --rebase`, so if any changes were made to the local repository, 83 | this will try to apply them on top of origin's changes. This is intentional, as we 84 | don't want to overwrite any local changes. However, if there are any conflicts, 85 | this will abort the rebase and return to the original state. 86 | The conflicts are expected to happen rarely since validator is expected 87 | to be used as-is. 88 | """ 89 | try: 90 | subprocess.run(split("git pull --rebase --autostash"), check=True, cwd=ROOT_DIR) 91 | except subprocess.CalledProcessError as exc: 92 | log.error("Failed to pull, reverting: %s", exc) 93 | subprocess.run(split("git rebase --abort"), check=True, cwd=ROOT_DIR) 94 | 95 | 96 | def upgrade_packages() -> None: 97 | """ 98 | Upgrade python packages by running `pip install --upgrade -r requirements.txt`. 99 | Notice: this won't work if some package in `requirements.txt` is downgraded. 100 | Ignored as this is unlikely to happen. 101 | """ 102 | 103 | log.info("Upgrading packages") 104 | try: 105 | subprocess.run( 106 | split(f"{sys.executable} -m pip install -e ."), 107 | check=True, 108 | cwd=ROOT_DIR, 109 | ) 110 | except subprocess.CalledProcessError as exc: 111 | log.error("Failed to upgrade packages, proceeding anyway. %s", exc) 112 | 113 | 114 | def main(pm2_name: str, args: List[str]) -> None: 115 | """ 116 | Run the validator process and automatically update it when a new version is released. 117 | This will check for updates every `UPDATES_CHECK_TIME` and update the validator 118 | if a new version is available. Update is performed as simple `git pull --rebase`. 
119 | """ 120 | 121 | validator = start_validator_process(pm2_name, args) 122 | current_version = latest_version = get_version() 123 | log.info("Current version: %s", current_version) 124 | 125 | try: 126 | while True: 127 | pull_latest_version() 128 | latest_version = get_version() 129 | log.info("Latest version: %s", latest_version) 130 | 131 | if latest_version != current_version: 132 | log.info( 133 | "Upgraded to latest version: %s -> %s", 134 | current_version, 135 | latest_version, 136 | ) 137 | upgrade_packages() 138 | 139 | stop_validator_process(validator) 140 | validator = start_validator_process(pm2_name, args) 141 | current_version = latest_version 142 | 143 | time.sleep(UPDATES_CHECK_TIME.total_seconds()) 144 | 145 | finally: 146 | stop_validator_process(validator) 147 | 148 | 149 | if __name__ == "__main__": 150 | logging.basicConfig( 151 | level=logging.INFO, 152 | format="%(asctime)s %(levelname)s %(message)s", 153 | handlers=[logging.StreamHandler(sys.stdout)], 154 | ) 155 | 156 | parser = argparse.ArgumentParser( 157 | description="Automatically update and restart the validator process when a new version is released.", 158 | epilog="Example usage: python start_validator.py --pm2_name 'net13vali' --wallet_name 'wallet1' --wallet_hotkey 'key123' [--no-autoupdate]", 159 | ) 160 | 161 | parser.add_argument( 162 | "--pm2_name", default="net13vali", help="Name of the PM2 process." 163 | ) 164 | 165 | flags, extra_args = parser.parse_known_args() 166 | 167 | main(flags.pm2_name, extra_args) 168 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2023 Data Universe 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 
17 | 18 | import re 19 | import os 20 | import codecs 21 | import pathlib 22 | from os import path 23 | from io import open 24 | from setuptools import setup, find_packages 25 | from pkg_resources import parse_requirements 26 | 27 | 28 | def read_requirements(path): 29 | with open(path, "r") as f: 30 | requirements = f.read().splitlines() 31 | processed_requirements = [] 32 | 33 | for req in requirements: 34 | # For git or other VCS links 35 | if req.startswith("git+") or "@" in req: 36 | pkg_name = re.search(r"(#egg=)([\w\-_]+)", req) 37 | if pkg_name: 38 | processed_requirements.append(pkg_name.group(2)) 39 | else: 40 | # You may decide to raise an exception here, 41 | # if you want to ensure every VCS link has an #egg= at the end 42 | continue 43 | else: 44 | processed_requirements.append(req) 45 | return processed_requirements 46 | 47 | 48 | requirements = read_requirements("requirements.txt") 49 | here = path.abspath(path.dirname(__file__)) 50 | 51 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 52 | long_description = f.read() 53 | 54 | # loading version from setup.py 55 | with codecs.open( 56 | os.path.join(here, "neurons/__init__.py"), 57 | encoding="utf-8", 58 | ) as init_file: 59 | version_match = re.search( 60 | r"^__version__ = ['\"]([^'\"]*)['\"]", init_file.read(), re.M 61 | ) 62 | version_string = version_match.group(1) 63 | 64 | setup( 65 | name="bittensor_data_universe", 66 | version=version_string, 67 | description="Data Universe is a Bittensor subnet for collecting and storing large amounts of data from across a wide-range of sources, for use by other Subnets.", 68 | long_description=long_description, 69 | long_description_content_type="text/markdown", 70 | url="https://github.com/RusticLuftig/data-universe", 71 | author="Data Universe Team", 72 | packages=find_packages(), 73 | include_package_data=True, 74 | author_email="sid.data.universe@gmail.com", 75 | license="MIT", 76 | python_requires=">=3.10", 77 | install_requires=requirements, 78 | classifiers=[ 79 | "Development Status :: 3 - Alpha", 80 | "Intended Audience :: Developers", 81 | "Topic :: Software Development :: Build Tools", 82 | "License :: OSI Approved :: MIT License", 83 | "Programming Language :: Python :: 3 :: Only", 84 | "Programming Language :: Python :: 3.10", 85 | "Topic :: Scientific/Engineering", 86 | "Topic :: Scientific/Engineering :: Mathematics", 87 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 88 | "Topic :: Software Development", 89 | "Topic :: Software Development :: Libraries", 90 | "Topic :: Software Development :: Libraries :: Python Modules", 91 | ], 92 | ) 93 | -------------------------------------------------------------------------------- /storage/miner/miner_storage.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from common.data import ( 3 | CompressedMinerIndex, 4 | DataEntity, 5 | DataEntityBucketId, 6 | ) 7 | from typing import Dict, List 8 | import datetime as dt 9 | 10 | 11 | class MinerStorage(ABC): 12 | """An abstract class which defines the contract that all implementations of MinerStorage must fulfill.""" 13 | 14 | @abstractmethod 15 | def store_data_entities(self, data_entities: List[DataEntity]): 16 | """Stores any number of DataEntities, making space if necessary.""" 17 | raise NotImplemented 18 | 19 | @abstractmethod 20 | def list_data_entities_in_data_entity_bucket( 21 | self, data_entity_bucket_id: DataEntityBucketId 22 | ) -> List[DataEntity]: 23 | 
"""Lists from storage all DataEntities matching the provided DataEntityBucket.""" 24 | raise NotImplemented 25 | 26 | @abstractmethod 27 | def get_compressed_index(self) -> CompressedMinerIndex: 28 | """Gets the compressed MinedIndex, which is a summary of all of the DataEntities that this MinerStorage is currently serving.""" 29 | raise NotImplemented 30 | 31 | @abstractmethod 32 | def refresh_compressed_index(self, date_time: dt.timedelta): 33 | """Refreshes the compressed MinerIndex.""" 34 | raise NotImplemented 35 | 36 | @abstractmethod 37 | def list_contents_in_data_entity_buckets( 38 | self, data_entity_bucket_ids: List[DataEntityBucketId] 39 | ) -> Dict[DataEntityBucketId, List[bytes]]: 40 | """Lists contents for each requested DataEntityBucketId. 41 | Args: 42 | data_entity_bucket_ids (List[DataEntityBucketId]): Which buckets to get contents for. 43 | Returns: 44 | Dict[DataEntityBucketId, List[bytes]]: Map of each bucket id to contained contents. 45 | """ 46 | raise NotImplemented 47 | -------------------------------------------------------------------------------- /storage/validator/hf_validator_storage.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | 6 | 7 | class HFValidationStorage: 8 | def __init__(self, storage_path): 9 | self.file_path = storage_path 10 | self._ensure_file_exists() 11 | 12 | def _ensure_file_exists(self): 13 | if not os.path.exists(self.file_path): 14 | self._create_empty_dataframe() 15 | 16 | def _create_empty_dataframe(self): 17 | df = pd.DataFrame(columns=['hotkey', 'repo_name', 'block']) 18 | self._safe_write_parquet(df) 19 | 20 | def _safe_write_parquet(self, df): 21 | temp_file = f"{self.file_path}.temp" 22 | try: 23 | table = pa.Table.from_pandas(df) 24 | pq.write_table(table, temp_file) 25 | os.replace(temp_file, self.file_path) 26 | except Exception as e: 27 | if os.path.exists(temp_file): 28 | os.remove(temp_file) 29 | raise e 30 | 31 | def _safe_read_parquet(self): 32 | try: 33 | return pd.read_parquet(self.file_path) 34 | except Exception as e: 35 | print(f"Error reading Parquet file: {e}") 36 | print("Attempting to recover data...") 37 | return self._recover_data() 38 | 39 | def _recover_data(self): 40 | try: 41 | table = pq.read_table(self.file_path) 42 | return table.to_pandas() 43 | except Exception as e: 44 | print(f"Recovery failed: {e}") 45 | print("Creating a new empty dataframe.") 46 | return pd.DataFrame(columns=['hotkey', 'repo_name', 'block']) 47 | 48 | def get_validation_info(self, hotkey): 49 | df = self._safe_read_parquet() 50 | matching_rows = df[df['hotkey'] == hotkey] 51 | return matching_rows.to_dict('records')[0] if not matching_rows.empty else None 52 | 53 | def update_validation_info(self, hotkey, repo_name, block): 54 | df = self._safe_read_parquet() 55 | new_row = pd.DataFrame({'hotkey': [hotkey], 'repo_name': [repo_name], 'block': [block]}) 56 | df = pd.concat([df[df['hotkey'] != hotkey], new_row], ignore_index=True) 57 | self._safe_write_parquet(df) 58 | 59 | def get_all_validations(self): 60 | return self._safe_read_parquet() -------------------------------------------------------------------------------- /storage/validator/validator_storage.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from common.data import CompressedMinerIndex 3 | from typing import Optional 4 | import datetime as dt 5 | 6 | from 
common.data_v2 import ScorableMinerIndex 7 | 8 | 9 | class ValidatorStorage(ABC): 10 | """An abstract class which defines the contract that all implementations of ValidatorStorage must fulfill.""" 11 | 12 | @abstractmethod 13 | def upsert_compressed_miner_index( 14 | self, index: CompressedMinerIndex, hotkey: str, credibility: float = 0 15 | ): 16 | """Stores the index for all of the data that a specific miner promises to provide.""" 17 | raise NotImplemented 18 | 19 | @abstractmethod 20 | def read_miner_index(self, miner_hotkey: str) -> Optional[ScorableMinerIndex]: 21 | """Gets a scored index for all of the data that a specific miner promises to provide.""" 22 | raise NotImplemented 23 | 24 | @abstractmethod 25 | def delete_miner(self, miner_hotkey: str): 26 | """Removes the index and miner information for the specified miner.""" 27 | raise NotImplemented 28 | 29 | @abstractmethod 30 | def read_miner_last_updated(self, miner_hotkey: str) -> Optional[dt.datetime]: 31 | """Gets when a specific miner was last updated.""" 32 | raise NotImplemented 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/__init__.py -------------------------------------------------------------------------------- /tests/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/common/__init__.py -------------------------------------------------------------------------------- /tests/common/test_data.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import random 3 | import string 4 | import time 5 | 6 | from common import constants, utils 7 | 8 | from common.data import ( 9 | CompressedEntityBucket, 10 | CompressedMinerIndex, 11 | DataLabel, 12 | DataSource, 13 | TimeBucket, 14 | ) 15 | import unittest 16 | 17 | from common.protocol import GetMinerIndex 18 | from pydantic import ValidationError 19 | 20 | 21 | class TestData(unittest.TestCase): 22 | def test_time_bucket_to_date_range(self): 23 | """Tests a Timebucket's date range function""" 24 | 25 | # Create a datetime that should align with the start of a time bucket. 26 | datetime = dt.datetime.fromtimestamp(36000, tz=dt.timezone.utc) 27 | time_bucket = TimeBucket.from_datetime(datetime) 28 | 29 | date_range = TimeBucket.to_date_range(time_bucket) 30 | 31 | for i in range(0, 60): 32 | self.assertTrue(date_range.contains(datetime + dt.timedelta(minutes=i))) 33 | 34 | self.assertFalse(date_range.contains(datetime + dt.timedelta(minutes=60))) 35 | 36 | def test_data_source_init(self): 37 | """Tests that the data source enum can be initialized""" 38 | source = 1 39 | self.assertEqual(DataSource.REDDIT, DataSource(source)) 40 | 41 | def test_compressed_index_bucket_count(self): 42 | """Tests that the compressed version of Miner index can get bucket count.""" 43 | # Make 5 compressed buckets per source, each containing 5 unique time bucket ids of size 10. 
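# (2 sources x 5 labels x 5 time bucket ids = 50 buckets in total, matching the assertEqual below.)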
44 | sources = {} 45 | for source in [DataSource.REDDIT, DataSource.X]: 46 | compressed_buckets = [None] * 5 47 | for label_i in range(0, 5): 48 | label = "label" + str(label_i) 49 | compressed_buckets[label_i] = CompressedEntityBucket( 50 | label=label, 51 | time_bucket_ids=[i for i in range(1, 6)], 52 | sizes_bytes=[10 for i in range(1, 6)], 53 | ) 54 | sources[int(source)] = compressed_buckets 55 | 56 | index = CompressedMinerIndex(sources=sources) 57 | 58 | self.assertEqual(CompressedMinerIndex.bucket_count(index), 50) 59 | 60 | def test_compressed_index_size_bytes(self): 61 | """Tests that the compressed version of Miner index can get size in bytes.""" 62 | # Make 5 compressed buckets per source, each containing 5 unique time bucket ids of size 10. 63 | sources = {} 64 | for source in [DataSource.REDDIT, DataSource.X]: 65 | compressed_buckets = [None] * 5 66 | for label_i in range(0, 5): 67 | label = "label" + str(label_i) 68 | compressed_buckets[label_i] = CompressedEntityBucket( 69 | label=label, 70 | time_bucket_ids=[i for i in range(1, 6)], 71 | sizes_bytes=[10 for i in range(1, 6)], 72 | ) 73 | sources[int(source)] = compressed_buckets 74 | 75 | index = CompressedMinerIndex(sources=sources) 76 | 77 | self.assertEqual(CompressedMinerIndex.bucket_count(index), 50) 78 | 79 | def test_compressed_index_supports_max_index(self): 80 | """Tests that the compressed version of the maximal Miner index is under our response size limit.""" 81 | 82 | target_buckets = ( 83 | constants.DATA_ENTITY_BUCKET_COUNT_LIMIT_PER_MINER_INDEX_PROTOCOL_4 84 | ) 85 | 86 | # Figure out how many time buckets and labels we need to fill the index. 87 | buckets_per_source = target_buckets // 2 # Twitter/Reddit 88 | num_time_buckets = constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS * 7 * 24 89 | num_labels = buckets_per_source // num_time_buckets 90 | 91 | # Double check the math 92 | total_buckets = 2 * num_time_buckets * num_labels 93 | self.assertAlmostEqual( 94 | target_buckets, 95 | total_buckets, 96 | delta=target_buckets * 0.05, 97 | ) 98 | 99 | start = time.time() 100 | sources = {} 101 | 102 | def generate_random_string(length): 103 | # Combine letters and digits for the random string 104 | characters = string.ascii_letters + string.digits 105 | return "".join(random.choice(characters) for _ in range(length)) 106 | 107 | for source in [DataSource.REDDIT, DataSource.X]: 108 | compressed_buckets = [None] * num_labels 109 | for label_i in range(0, num_labels): 110 | label = generate_random_string(random.randint(4, 32)) 111 | compressed_buckets[label_i] = CompressedEntityBucket( 112 | label=label, 113 | time_bucket_ids=[i for i in range(1, num_time_buckets + 1)], 114 | sizes_bytes=[ 115 | random.randint(1, 112345678) 116 | for i in range(1, num_time_buckets + 1) 117 | ], 118 | ) 119 | sources[int(source)] = compressed_buckets 120 | 121 | print(f"Time to create index: {time.time() - start}") 122 | maximal_index = CompressedMinerIndex( 123 | sources=sources, 124 | ) 125 | 126 | start = time.time() 127 | serialized_compressed_index = maximal_index.json() 128 | print(f"Time to serialize index: {time.time() - start}") 129 | 130 | start = time.time() 131 | get_miner_index = GetMinerIndex( 132 | compressed_index_serialized=serialized_compressed_index 133 | ) 134 | print(f"Time to create synapse: {time.time() - start}") 135 | 136 | start = time.time() 137 | compressed_json = get_miner_index.json() 138 | print(f"Time to serialize synapse: {time.time() - start}") 139 | print(f"Compressed index size: 
{len(compressed_json)}") 140 | self.assertLess(len(compressed_json), utils.mb_to_bytes(mb=128)) 141 | 142 | start = time.time() 143 | deserialized_index = GetMinerIndex.parse_raw(compressed_json) 144 | deserialized_compressed_index = CompressedMinerIndex.parse_raw( 145 | deserialized_index.compressed_index_serialized 146 | ) 147 | print(f"Time to deserialize synapse: {time.time() - start}") 148 | 149 | # Verify the deserialized form is as expected. 150 | self.assertEqual(deserialized_compressed_index, maximal_index) 151 | 152 | def test_data_label_lower_validation(self): 153 | """Tests that the data label value is checked to be <32 characters even after .lower().""" 154 | with self.assertRaises(ValidationError): 155 | bad_label = DataLabel(value="#İsrailleTicaretFilistineİhanet") 156 | 157 | 158 | if __name__ == "__main__": 159 | unittest.main() 160 | -------------------------------------------------------------------------------- /tests/common/test_data_v2.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from common.data import DataEntityBucketId, DataLabel, DataSource, TimeBucket 3 | from common.data_v2 import ScorableDataEntityBucket, DataEntityBucket 4 | 5 | 6 | class TestDataV2(unittest.TestCase): 7 | def test_scorable_data_entity_bucket_to_data_entity(self): 8 | # Create a ScorableDataEntityBucket instance 9 | time_bucket_id = 123 10 | source = DataSource.REDDIT.value 11 | label = "EXAMPLE_label" 12 | size_bytes = 1000 13 | scorable_bytes = 500 14 | scorable_data_entity_bucket = ScorableDataEntityBucket( 15 | time_bucket_id=time_bucket_id, 16 | source=source, 17 | label=label, 18 | size_bytes=size_bytes, 19 | scorable_bytes=scorable_bytes, 20 | ) 21 | 22 | # Call the to_data_entity_bucket method 23 | data_entity_bucket = scorable_data_entity_bucket.to_data_entity_bucket() 24 | 25 | # Verify that the returned value is an instance of DataEntityBucket 26 | expected = DataEntityBucket( 27 | id=DataEntityBucketId( 28 | time_bucket=TimeBucket(id=time_bucket_id), 29 | source=source, 30 | label=DataLabel(value=label.casefold()), 31 | ), 32 | size_bytes=size_bytes, 33 | ) 34 | self.assertEqual(data_entity_bucket, expected) 35 | 36 | def test_scorable_data_entity_bucket_to_data_entity_none_label(self): 37 | # Create a ScorableDataEntityBucket instance 38 | time_bucket_id = 123 39 | source = DataSource.REDDIT.value 40 | size_bytes = 1000 41 | scorable_bytes = 500 42 | scorable_data_entity_bucket = ScorableDataEntityBucket( 43 | time_bucket_id=time_bucket_id, 44 | source=source, 45 | label=None, 46 | size_bytes=size_bytes, 47 | scorable_bytes=scorable_bytes, 48 | ) 49 | 50 | # Call the to_data_entity_bucket method 51 | data_entity_bucket = scorable_data_entity_bucket.to_data_entity_bucket() 52 | 53 | # Verify that the returned value is an instance of DataEntityBucket 54 | expected = DataEntityBucket( 55 | id=DataEntityBucketId( 56 | time_bucket=TimeBucket(id=time_bucket_id), 57 | source=source, 58 | label=None, 59 | ), 60 | size_bytes=size_bytes, 61 | ) 62 | self.assertEqual(data_entity_bucket, expected) 63 | 64 | def test_scorable_data_entity_bucket_equality(self): 65 | # Create two ScorableDataEntityBucket instances 66 | time_bucket_id = 123 67 | source = DataSource.REDDIT.value 68 | label = "EXAMPLE_label" 69 | size_bytes = 1000 70 | scorable_bytes = 500 71 | scorable_data_entity_bucket_1 = ScorableDataEntityBucket( 72 | time_bucket_id=time_bucket_id, 73 | source=source, 74 | label=label, 75 | size_bytes=size_bytes, 76 | 
scorable_bytes=scorable_bytes, 77 | ) 78 | scorable_data_entity_bucket_2 = ScorableDataEntityBucket( 79 | time_bucket_id=time_bucket_id, 80 | source=source, 81 | label=label, 82 | size_bytes=size_bytes, 83 | scorable_bytes=scorable_bytes, 84 | ) 85 | 86 | # Verify that the two instances are equal 87 | self.assertEqual(scorable_data_entity_bucket_1, scorable_data_entity_bucket_2) 88 | 89 | 90 | if __name__ == "__main__": 91 | unittest.main() 92 | -------------------------------------------------------------------------------- /tests/common/test_metagraph_syncer.py: -------------------------------------------------------------------------------- 1 | from curses import meta 2 | import threading 3 | from unittest import mock 4 | import unittest 5 | import bittensor as bt 6 | from common.metagraph_syncer import MetagraphSyncer 7 | 8 | 9 | class TestMetagraphSyncer(unittest.TestCase): 10 | def test_do_initial_sync(self): 11 | # Mock subtensor.metagraph() function 12 | metagraph1 = bt.metagraph(netuid=1, sync=False) 13 | metagraph2 = bt.metagraph(netuid=2, sync=False) 14 | 15 | def get_metagraph(netuid) -> bt.metagraph: 16 | if netuid == 1: 17 | return metagraph1 18 | elif netuid == 2: 19 | return metagraph2 20 | else: 21 | raise Exception("Invalid netuid") 22 | 23 | mock_subtensor = mock.MagicMock(spec=bt.subtensor) 24 | metagraph_mock = mock.MagicMock(side_effect=get_metagraph) 25 | mock_subtensor.metagraph = metagraph_mock 26 | 27 | # Create MetagraphSyncer instance with mock subtensor 28 | metagraph_syncer = MetagraphSyncer(mock_subtensor, {1: 1, 2: 1}) 29 | 30 | # Call do_initial_sync method 31 | metagraph_syncer.do_initial_sync() 32 | 33 | # Verify get_metagraph() returns the expected metagraph. 34 | # We can't check object equality because of how equality is done on bt.metagraph 35 | # so just check the netuid. 36 | self.assertEqual(metagraph_syncer.get_metagraph(1).netuid, metagraph1.netuid) 37 | self.assertEqual(metagraph_syncer.get_metagraph(2).netuid, metagraph2.netuid) 38 | 39 | def test_listener_called(self): 40 | # Mock subtensor.metagraph() function 41 | metagraph1 = bt.metagraph(netuid=1, sync=False) 42 | metagraph2 = bt.metagraph(netuid=2, sync=False) 43 | 44 | def get_metagraph(netuid) -> bt.metagraph: 45 | if netuid == 1: 46 | return metagraph1 47 | elif netuid == 2: 48 | return metagraph2 49 | else: 50 | raise Exception("Invalid netuid") 51 | 52 | mock_subtensor = mock.MagicMock(spec=bt.subtensor) 53 | metagraph_mock = mock.MagicMock(side_effect=get_metagraph) 54 | mock_subtensor.metagraph = metagraph_mock 55 | 56 | # Create MetagraphSyncer instance with mock subtensor 57 | metagraph_syncer = MetagraphSyncer(mock_subtensor, {1: 1, 2: 1}) 58 | 59 | # Call do_initial_sync method 60 | metagraph_syncer.do_initial_sync() 61 | 62 | # Register a listener for netuid 1. 63 | event = threading.Event() 64 | 65 | def listener(metagraph, netuid): 66 | self.assertEqual(metagraph.netuid, 1) 67 | self.assertEqual(netuid, 1) 68 | event.set() 69 | 70 | metagraph_syncer.register_listener(listener, [1]) 71 | 72 | # Since we sync every 1 second, verify the listener is called within 5 seconds. 
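# Note: event.wait(5) returns True only if the listener set the event before the timeout; a stricter test could assert that return value (e.g. self.assertTrue(event.wait(5))).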
73 | event.wait(5) 74 | 75 | 76 | if __name__ == "__main__": 77 | unittest.main() 78 | -------------------------------------------------------------------------------- /tests/common/test_protocol.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Type 3 | import unittest 4 | import datetime as dt 5 | import bittensor as bt 6 | from common.data import ( 7 | CompressedEntityBucket, 8 | CompressedMinerIndex, 9 | DataEntity, 10 | DataEntityBucket, 11 | DataEntityBucketId, 12 | DataLabel, 13 | DataSource, 14 | TimeBucket, 15 | ) 16 | from common import old_protocol 17 | 18 | from common.protocol import GetDataEntityBucket, GetMinerIndex 19 | 20 | 21 | def serialize_like_dendrite(synapse: bt.Synapse) -> str: 22 | """Serializes a synapse like a Dendrite would.""" 23 | d = synapse.dict() 24 | return json.dumps(d) 25 | 26 | 27 | def serialize_like_axon(synapse: bt.Synapse) -> str: 28 | """Serializes a synapse like an Axon would.""" 29 | return serialize_like_dendrite(synapse) 30 | 31 | 32 | def deserialize(json_str: str, cls: Type) -> bt.Synapse: 33 | """Deserializes the same way a dendrite/axon does.""" 34 | d = json.loads(json_str) 35 | return cls(**d) 36 | 37 | 38 | class TestGetMinerIndex(unittest.TestCase): 39 | def test_get_miner_index_old_format_round_trip(self): 40 | """Tests that the old miner index format can be serialized/deserialized for transport.""" 41 | request = GetMinerIndex() 42 | json = request.json() 43 | print(json) 44 | deserialized = GetMinerIndex.parse_raw(json) 45 | self.assertEqual(request, deserialized) 46 | 47 | # Also check that the headers can be constructed. 48 | request.to_headers() 49 | 50 | # Now construct a response and check it. 51 | response = GetMinerIndex( 52 | data_entity_buckets=[ 53 | DataEntityBucket( 54 | id=DataEntityBucketId( 55 | time_bucket=TimeBucket(id=5), 56 | label=DataLabel(value="r/bittensor_"), 57 | source=DataSource.REDDIT, 58 | ), 59 | size_bytes=100, 60 | ), 61 | DataEntityBucket( 62 | id=DataEntityBucketId( 63 | time_bucket=TimeBucket(id=6), 64 | source=DataSource.X, 65 | ), 66 | size_bytes=200, 67 | ), 68 | ] 69 | ) 70 | 71 | serialized = serialize_like_axon(response) 72 | deserialized = deserialize(serialized, GetMinerIndex) 73 | self.assertEqual(response, deserialized) 74 | 75 | def test_get_miner_index_new_format_round_trip(self): 76 | """Tests that the compressed miner index can be serialized/deserialized for transport.""" 77 | 78 | request = GetMinerIndex() 79 | 80 | serialized = serialize_like_dendrite(request) 81 | deserialized = deserialize(serialized, GetMinerIndex) 82 | self.assertEqual(request, deserialized) 83 | 84 | # Also check that the headers can be constructed. 85 | request.to_headers() 86 | 87 | # Now construct a response and check it. 
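# The compressed form groups CompressedEntityBuckets per DataSource and is carried as a JSON string in compressed_index_serialized.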
88 | response = GetMinerIndex( 89 | compressed_index_serialized=CompressedMinerIndex( 90 | sources={ 91 | DataSource.REDDIT.value: [ 92 | CompressedEntityBucket( 93 | label="r/bittensor_", 94 | time_bucket_ids=[5, 6], 95 | sizes_bytes=[100, 200], 96 | ) 97 | ], 98 | DataSource.X.value: [ 99 | CompressedEntityBucket( 100 | time_bucket_ids=[10, 11, 12], sizes_bytes=[300, 400, 500] 101 | ), 102 | CompressedEntityBucket( 103 | label="#bittensor", time_bucket_ids=[5], sizes_bytes=[100] 104 | ), 105 | ], 106 | } 107 | ).json() 108 | ) 109 | 110 | serialized = serialize_like_axon(response) 111 | deserialized = deserialize(serialized, GetMinerIndex) 112 | self.assertEqual(response, deserialized) 113 | 114 | 115 | class TestGetDataEntityBucket(unittest.TestCase): 116 | def test_synapse_serialization(self): 117 | """Tests that the protocol messages can be serialized/deserialized for transport.""" 118 | request = GetDataEntityBucket( 119 | data_entity_bucket_id=DataEntityBucketId( 120 | time_bucket=TimeBucket.from_datetime(dt.datetime.utcnow()), 121 | label=DataLabel(value="r/bittensor_"), 122 | source=DataSource.REDDIT, 123 | ) 124 | ) 125 | json = request.json() 126 | print(json) 127 | deserialized = GetDataEntityBucket.parse_raw(json) 128 | self.assertEqual(request, deserialized) 129 | 130 | # Check that the enum is deserialized correctly 131 | self.assertEqual(deserialized.data_entity_bucket_id.source, DataSource.REDDIT) 132 | 133 | # Also check that the headers can be constructed. 134 | request.to_headers() 135 | 136 | # TODO: Add a test for the response. 137 | 138 | 139 | if __name__ == "__main__": 140 | unittest.main() 141 | -------------------------------------------------------------------------------- /tests/common/test_utils.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import time 3 | import unittest 4 | 5 | from common.utils import run_in_thread 6 | 7 | 8 | class TestUtils(unittest.TestCase): 9 | def test_run_in_thread(self): 10 | def test_func(a: int, b: int): 11 | return a + b 12 | 13 | partial = functools.partial(test_func, 1, 2) 14 | 15 | result = run_in_thread(func=partial, ttl=5) 16 | self.assertEqual(3, result) 17 | 18 | def test_run_in_thread_timeout(self): 19 | def test_func(a: int, b: int): 20 | time.sleep(3) 21 | return a + b 22 | 23 | partial = functools.partial(test_func, 1, 2) 24 | 25 | with self.assertRaises(TimeoutError): 26 | result = run_in_thread(func=partial, ttl=1) 27 | 28 | def test_run_in_thread_no_return(self): 29 | def test_func(a: int, b: int): 30 | pass 31 | 32 | partial = functools.partial(test_func, 1, 2) 33 | 34 | result = run_in_thread(func=partial, ttl=5) 35 | self.assertIsNone(result) 36 | 37 | def test_run_in_thread_tuple_return(self): 38 | def test_func(a: int, b: int): 39 | return a, b 40 | 41 | partial = functools.partial(test_func, 1, 2) 42 | 43 | result = run_in_thread(func=partial, ttl=5) 44 | self.assertEqual((1, 2), result) 45 | 46 | def test_run_in_thread_exception(self): 47 | def test_func(a: int, b: int): 48 | raise ValueError() 49 | 50 | partial = functools.partial(test_func, 1, 2) 51 | 52 | with self.assertRaises(ValueError): 53 | result = run_in_thread(func=partial, ttl=5) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /tests/hf_validation/test_encoding_key.json: -------------------------------------------------------------------------------- 1 | {"sym_key": 
"XdcRI9sPT2e43a8fda53H13HpGqpQLTZHHeIoHPtIMI="} -------------------------------------------------------------------------------- /tests/hf_validation/test_reddit_dataset_validation.py: -------------------------------------------------------------------------------- 1 | """Module for selecting and processing random rows from Hugging Face datasets.""" 2 | 3 | import random 4 | from typing import List, Dict, Any 5 | import asyncio 6 | 7 | import bittensor as bt 8 | import requests 9 | import pandas as pd 10 | from datasets import load_dataset 11 | import itertools 12 | from huggingface_utils.encoding_system import EncodingKeyManager, decode_url 13 | from scraping.reddit.reddit_custom_scraper import RedditCustomScraper 14 | 15 | 16 | def get_parquet_files(repo_id: str) -> List[str]: 17 | """ 18 | Fetch a list of parquet files from a Hugging Face dataset repository. 19 | 20 | Args: 21 | repo_id (str): The Hugging Face dataset repository ID. 22 | 23 | Returns: 24 | List[str]: A list of parquet file paths. 25 | 26 | Raises: 27 | requests.RequestException: If the API request fails. 28 | """ 29 | api_url = f"https://huggingface.co/api/datasets/{repo_id}/tree/main/data" 30 | try: 31 | response = requests.get(api_url) 32 | response.raise_for_status() 33 | files = [item['path'] for item in response.json() if item['path'].endswith('.parquet')] 34 | return files 35 | except requests.RequestException as e: 36 | raise requests.RequestException(f"Failed to fetch file list: {e}") 37 | 38 | def select_random_rows_from_parquet(repo_id: str, num_rows: int = 10, buffer_size: int = 10_000) -> pd.DataFrame: 39 | """ 40 | Efficiently select random rows from a randomly chosen parquet file in a Hugging Face dataset 41 | using a streaming approach with shuffling. 42 | 43 | Args: 44 | repo_id (str): The Hugging Face dataset repository ID. 45 | num_rows (int, optional): Number of random rows to select. Defaults to 10. 46 | buffer_size (int, optional): Size of the buffer for shuffling. Defaults to 10,000. 47 | 48 | Returns: 49 | pd.DataFrame: A DataFrame containing the randomly selected rows. 50 | 51 | Raises: 52 | ValueError: If no parquet files are found in the dataset. 
53 | """ 54 | parquet_files = get_parquet_files(repo_id) 55 | 56 | if not parquet_files: 57 | raise ValueError("No parquet files found in the dataset.") 58 | 59 | selected_file = random.choice(parquet_files) 60 | bt.logging.trace(f"Selected file: {selected_file}") 61 | 62 | # Load the dataset in streaming mode 63 | dataset = load_dataset( 64 | repo_id, 65 | data_files={'train': selected_file}, 66 | split='train', 67 | streaming=True 68 | ) 69 | 70 | # Generate random seed 71 | random_seed = random.randint(0, 2 ** 32 - 1) 72 | # Shuffle the dataset 73 | shuffled_dataset = dataset.shuffle(buffer_size=buffer_size, seed=random_seed) 74 | 75 | # Select the specified number of rows 76 | selected_rows = list(itertools.islice(shuffled_dataset, num_rows)) 77 | 78 | # Convert to DataFrame 79 | df = pd.DataFrame(selected_rows) 80 | 81 | # Decode encrypted columns 82 | key_manager = EncodingKeyManager(key_path='/Users/volodymyrtruba/data-universe/tests/hf_validation/test_encoding_key.json') 83 | fernet = key_manager.get_fernet() 84 | 85 | for column in ['url_encoded', 'username_encoded']: 86 | if column in df.columns: 87 | df[column.replace('_encoded', '')] = df[column].apply(lambda x: decode_url(x, fernet)) 88 | df = df.drop(columns=[column]) 89 | 90 | bt.logging.trace(df) 91 | 92 | return df 93 | 94 | 95 | async def main(): 96 | """Main function to demonstrate the usage of the script.""" 97 | repo_id = "arrmlet/reddit_dataset_123456" 98 | bt.logging.set_trace(True) 99 | try: 100 | selected_rows = select_random_rows_from_parquet(repo_id) 101 | print(selected_rows) 102 | s = selected_rows.to_dict(orient='records') 103 | scrapper = RedditCustomScraper() 104 | valid = await scrapper.validate_hf(entities=s) 105 | bt.logging.info(f"Number of rows: {len(selected_rows)}") 106 | bt.logging.info(valid) 107 | 108 | except (requests.RequestException, ValueError) as e: 109 | bt.logging.trace(f"An error occurred: {e}") 110 | 111 | if __name__ == "__main__": 112 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/hf_validation/test_x_dataset_validation.py: -------------------------------------------------------------------------------- 1 | """Module for selecting and processing random rows from Hugging Face datasets.""" 2 | 3 | import random 4 | from typing import List, Dict, Any 5 | import asyncio 6 | 7 | import bittensor as bt 8 | import requests 9 | import pandas as pd 10 | from datasets import load_dataset 11 | import itertools 12 | from huggingface_utils.encoding_system import EncodingKeyManager, decode_url 13 | from scraping.x.apidojo_scraper import ApiDojoTwitterScraper 14 | 15 | def get_parquet_files(repo_id: str) -> List[str]: 16 | """ 17 | Fetch a list of parquet files from a Hugging Face dataset repository. 18 | 19 | Args: 20 | repo_id (str): The Hugging Face dataset repository ID. 21 | 22 | Returns: 23 | List[str]: A list of parquet file paths. 24 | 25 | Raises: 26 | requests.RequestException: If the API request fails. 
27 | """ 28 | api_url = f"https://huggingface.co/api/datasets/{repo_id}/tree/main/data" 29 | try: 30 | response = requests.get(api_url) 31 | response.raise_for_status() 32 | files = [item['path'] for item in response.json() if item['path'].endswith('.parquet')] 33 | return files 34 | except requests.RequestException as e: 35 | raise requests.RequestException(f"Failed to fetch file list: {e}") 36 | 37 | def select_random_rows_from_parquet(repo_id: str, num_rows: int = 10, buffer_size: int = 10_000) -> pd.DataFrame: 38 | """ 39 | Efficiently select random rows from a randomly chosen parquet file in a Hugging Face dataset 40 | using a streaming approach with shuffling. 41 | 42 | Args: 43 | repo_id (str): The Hugging Face dataset repository ID. 44 | num_rows (int, optional): Number of random rows to select. Defaults to 10. 45 | buffer_size (int, optional): Size of the buffer for shuffling. Defaults to 10,000. 46 | 47 | Returns: 48 | pd.DataFrame: A DataFrame containing the randomly selected rows. 49 | 50 | Raises: 51 | ValueError: If no parquet files are found in the dataset. 52 | """ 53 | parquet_files = get_parquet_files(repo_id) 54 | 55 | if not parquet_files: 56 | raise ValueError("No parquet files found in the dataset.") 57 | 58 | selected_file = random.choice(parquet_files) 59 | bt.logging.trace(f"Selected file: {selected_file}") 60 | 61 | # Load the dataset in streaming mode 62 | dataset = load_dataset( 63 | repo_id, 64 | data_files={'train': selected_file}, 65 | split='train', 66 | streaming=True 67 | ) 68 | 69 | # Generate random seed 70 | random_seed = random.randint(0, 2 ** 32 - 1) 71 | # Shuffle the dataset 72 | shuffled_dataset = dataset.shuffle(buffer_size=buffer_size, seed=random_seed) 73 | 74 | # Select the specified number of rows 75 | selected_rows = list(itertools.islice(shuffled_dataset, num_rows)) 76 | 77 | # Convert to DataFrame 78 | df = pd.DataFrame(selected_rows) 79 | 80 | # Decode encrypted columns 81 | key_manager = EncodingKeyManager(key_path='/Users/volodymyrtruba/data-universe/tests/hf_validation/test_encoding_key.json') 82 | fernet = key_manager.get_fernet() 83 | 84 | for column in ['url_encoded', 'username_encoded']: 85 | if column in df.columns: 86 | df[column.replace('_encoded', '')] = df[column].apply(lambda x: decode_url(x, fernet)) 87 | df = df.drop(columns=[column]) 88 | 89 | bt.logging.trace(df) 90 | 91 | return df 92 | 93 | 94 | async def main(): 95 | """Main function to demonstrate the usage of the script.""" 96 | repo_id = "arrmlet/x_dataset_123456" 97 | 98 | try: 99 | selected_rows = select_random_rows_from_parquet(repo_id) 100 | s = selected_rows.to_dict(orient='records') 101 | scrapper = ApiDojoTwitterScraper() 102 | valid = await scrapper.validate_hf(entities=s) 103 | bt.logging.info(f"Number of rows: {len(selected_rows)}") 104 | bt.logging.info(valid) 105 | 106 | except (requests.RequestException, ValueError) as e: 107 | bt.logging.trace(f"An error occurred: {e}") 108 | 109 | if __name__ == "__main__": 110 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/integration/__init__.py -------------------------------------------------------------------------------- /tests/integration/test_on_demand.py: -------------------------------------------------------------------------------- 1 | 
import unittest 2 | import asyncio 3 | import bittensor as bt 4 | import datetime as dt 5 | import random 6 | from common.data import DataLabel, DataSource, DataEntity 7 | from common.protocol import OnDemandRequest 8 | from common.date_range import DateRange 9 | from scraping.scraper import ScrapeConfig 10 | from scraping.x.apidojo_scraper import ApiDojoTwitterScraper 11 | 12 | 13 | class TestOnDemandProtocol(unittest.TestCase): 14 | def test_on_demand_flow(self): 15 | """Test the complete on-demand data flow""" 16 | 17 | async def run_test(): 18 | # Create OnDemand request 19 | test_request = OnDemandRequest( 20 | source=DataSource.X, 21 | keywords=["#TAO"], 22 | start_date=(dt.datetime.now(dt.timezone.utc) - dt.timedelta(days=1)).isoformat(), 23 | end_date=dt.datetime.now(dt.timezone.utc).isoformat(), 24 | limit=5 25 | ) 26 | 27 | # Set up scraper 28 | scraper = ApiDojoTwitterScraper() 29 | 30 | # Create scrape config from request 31 | scrape_config = ScrapeConfig( 32 | entity_limit=test_request.limit, 33 | date_range=DateRange( 34 | start=dt.datetime.fromisoformat(test_request.start_date), 35 | end=dt.datetime.fromisoformat(test_request.end_date) 36 | ), 37 | labels=[DataLabel(value=k) for k in test_request.keywords] 38 | ) 39 | 40 | # Get data using scraper 41 | data = await scraper.scrape(scrape_config) 42 | 43 | # Verify data was retrieved 44 | self.assertTrue(len(data) > 0, "No data returned from scraper") 45 | 46 | # Select 1 random samples for validation 47 | if data: 48 | samples = random.sample(data, min(1, len(data))) 49 | validation_results = await scraper.validate(samples) 50 | print(data) 51 | # Check if any validation passed 52 | self.assertTrue( 53 | any(result.is_valid for result in validation_results), 54 | "All validation failed for sample data" 55 | ) 56 | 57 | # Run the async test 58 | asyncio.run(run_test()) 59 | 60 | 61 | if __name__ == "__main__": 62 | unittest.main() -------------------------------------------------------------------------------- /tests/neurons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/neurons/__init__.py -------------------------------------------------------------------------------- /tests/neurons/test_miner_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from unittest.mock import patch 4 | 5 | from neurons.miner import Miner 6 | 7 | 8 | class TestMinerConfig(unittest.TestCase): 9 | def test_miner_config(self): 10 | with patch.object( 11 | sys, 12 | "argv", 13 | [ 14 | "miner.py", 15 | "--neuron.database_name", 16 | "mydb", 17 | "--subtensor.network", 18 | "test", 19 | ], 20 | ): 21 | miner = Miner() 22 | config = miner.get_config_for_test() 23 | 24 | self.assertEqual(config.neuron.database_name, "mydb") 25 | # Check the default values are still there. 
26 | self.assertEqual(config.neuron.max_database_size_gb_hint, 250) 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /tests/neurons/test_validator_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import unittest 3 | from unittest.mock import patch 4 | from neurons.config import NeuronType, create_config 5 | 6 | 7 | class TestValidatorConfig(unittest.TestCase): 8 | def test_validator_config(self): 9 | with patch.object( 10 | sys, 11 | "argv", 12 | [ 13 | "validator.py", 14 | "--subtensor.network", 15 | "test", 16 | ], 17 | ): 18 | config = create_config(NeuronType.VALIDATOR) 19 | 20 | # Check the default values are still there. 21 | self.assertEqual(config.neuron.axon_off, False) 22 | self.assertEqual(config.subtensor.network, "test") 23 | 24 | 25 | if __name__ == "__main__": 26 | unittest.main() 27 | -------------------------------------------------------------------------------- /tests/rewards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/rewards/__init__.py -------------------------------------------------------------------------------- /tests/scraping/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/__init__.py -------------------------------------------------------------------------------- /tests/scraping/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/config/__init__.py -------------------------------------------------------------------------------- /tests/scraping/config/invalid_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "scraper_configs": [ 3 | { 4 | "scraper_id": "bogus", 5 | "cadence_seconds": 300, 6 | "labels_to_scrape": [ 7 | { 8 | "label_choices": [ 9 | "#bittensor", 10 | "#tao" 11 | ], 12 | "max_age_hint_minutes": 1440, 13 | "max_data_entities": 100 14 | } 15 | ] 16 | }, 17 | { 18 | "scraper_id": "Reddit.lite", 19 | "cadence_seconds": 900, 20 | "labels_to_scrape": [ 21 | { 22 | "label_choices": [ 23 | "r/bittensor_", 24 | "r/bitcoin" 25 | ], 26 | "max_data_entities": 50 27 | } 28 | ] 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /tests/scraping/config/test_config_reader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from unittest.mock import patch 4 | from common import constants 5 | from common.data import DataLabel, DataSource 6 | from scraping.config.config_reader import ConfigReader 7 | from scraping.coordinator import ( 8 | CoordinatorConfig, 9 | ScraperConfig, 10 | LabelScrapingConfig, 11 | ) 12 | from scraping.scraper import ScraperId 13 | 14 | 15 | class TestConfigReader(unittest.TestCase): 16 | def test_load_config_valid(self): 17 | """Tests a valid config is loaded correctly.""" 18 | expected_config = CoordinatorConfig( 19 | scraper_configs={ 20 | ScraperId.X_MICROWORLDS: ScraperConfig( 21 | cadence_seconds=300, 22 | labels_to_scrape=[ 23 | 
LabelScrapingConfig( 24 | label_choices=[ 25 | DataLabel(value="#bittensor"), 26 | DataLabel(value="#TAO"), 27 | ], 28 | max_age_hint_minutes=1440, 29 | max_data_entities=100, 30 | ), 31 | LabelScrapingConfig( 32 | max_age_hint_minutes=60 33 | * 24 34 | * constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS, 35 | max_data_entities=500, 36 | ), 37 | ], 38 | ), 39 | ScraperId.REDDIT_LITE: ScraperConfig( 40 | cadence_seconds=900, 41 | labels_to_scrape=[ 42 | LabelScrapingConfig( 43 | label_choices=[ 44 | DataLabel(value="r/bittensor_"), 45 | DataLabel(value="r/bitcoin"), 46 | ], 47 | max_age_hint_minutes=60 48 | * 24 49 | * constants.DATA_ENTITY_BUCKET_AGE_LIMIT_DAYS, 50 | max_data_entities=50, 51 | ), 52 | ], 53 | ), 54 | } 55 | ) 56 | 57 | this_dir = os.path.abspath(os.path.dirname(__file__)) 58 | filepath = os.path.join(this_dir, "valid_config.json") 59 | loaded_config = ConfigReader.load_config(filepath) 60 | 61 | self.assertEqual(loaded_config, expected_config) 62 | 63 | def test_load_config_invalid(self): 64 | """Tests that loading an invalid config raises an exception.""" 65 | this_dir = os.path.abspath(os.path.dirname(__file__)) 66 | filepath = os.path.join(this_dir, "invalid_config.json") 67 | 68 | with self.assertRaises(Exception) as e: 69 | ConfigReader.load_config(filepath) 70 | self.assertIn( 71 | "scraper_id\n value is not a valid enumeration member", str(e.exception) 72 | ) 73 | 74 | def test_load_real_config_valid(self): 75 | this_dir = os.path.abspath(os.path.dirname(__file__)) 76 | filepath = os.path.join( 77 | this_dir, "../../../scraping/config/scraping_config.json" 78 | ) 79 | loaded_config = ConfigReader.load_config(filepath) 80 | 81 | 82 | if __name__ == "__main__": 83 | unittest.main() 84 | -------------------------------------------------------------------------------- /tests/scraping/config/test_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from scraping.config.model import ( 4 | ScraperConfig, 5 | LabelScrapingConfig, 6 | ScrapingConfig, 7 | ) 8 | from scraping.scraper import ScraperId 9 | 10 | 11 | class TestScrapingConfig(unittest.TestCase): 12 | def test_serialization_deserialization(self): 13 | """Verifies a round-trip serialization/deserialization of the ScrapingConfig""" 14 | 15 | config = ScrapingConfig( 16 | scraper_configs=[ 17 | ScraperConfig( 18 | scraper_id=ScraperId.X_MICROWORLDS, 19 | cadence_seconds=300, 20 | labels_to_scrape=[ 21 | LabelScrapingConfig( 22 | label_choices=["#bittensor", "#TAO"], 23 | max_age_hint_minutes=1440, 24 | max_data_entities=100, 25 | ), 26 | LabelScrapingConfig( 27 | max_age_hint_minutes=10080, 28 | max_data_entities=500, 29 | ), 30 | ], 31 | ), 32 | ScraperConfig( 33 | scraper_id=ScraperId.REDDIT_LITE, 34 | cadence_seconds=900, 35 | labels_to_scrape=[ 36 | LabelScrapingConfig( 37 | label_choices=["r/bittensor_"], 38 | max_data_entities=50, 39 | ), 40 | ], 41 | ), 42 | ] 43 | ) 44 | 45 | # Serialize the object to JSON 46 | json_data = config.json() 47 | print(json_data) 48 | 49 | # Deserialize the JSON back to an object 50 | deserialized_config = ScrapingConfig.parse_raw(json_data) 51 | 52 | # Verify the deserialized object is equal to the starting object 53 | self.assertEqual(config, deserialized_config) 54 | 55 | 56 | if __name__ == "__main__": 57 | unittest.main() 58 | -------------------------------------------------------------------------------- /tests/scraping/config/valid_config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "scraper_configs": [ 3 | { 4 | "scraper_id": "X.microworlds", 5 | "cadence_seconds": 300, 6 | "labels_to_scrape": [ 7 | { 8 | "label_choices": [ 9 | "#bittensor", 10 | "#tao" 11 | ], 12 | "max_age_hint_minutes": 1440, 13 | "max_data_entities": 100 14 | }, 15 | { 16 | "max_data_entities": 500 17 | } 18 | ] 19 | }, 20 | { 21 | "scraper_id": "Reddit.lite", 22 | "cadence_seconds": 900, 23 | "labels_to_scrape": [ 24 | { 25 | "label_choices": [ 26 | "r/bittensor_", 27 | "r/bitcoin" 28 | ], 29 | "max_data_entities": 50 30 | } 31 | ] 32 | } 33 | ] 34 | } -------------------------------------------------------------------------------- /tests/scraping/reddit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/reddit/__init__.py -------------------------------------------------------------------------------- /tests/scraping/reddit/test_model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import unittest 3 | 4 | from common import constants 5 | from scraping.reddit.model import RedditContent, RedditDataType 6 | 7 | 8 | class TestModel(unittest.TestCase): 9 | def test_label_truncation(self): 10 | """Tests that RedditContents correctly truncate labels to 32 characters when converting to DataEntities""" 11 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 12 | content = RedditContent( 13 | id="postId", 14 | url="https://reddit.com/123", 15 | username="user1", 16 | communityName="r/looooooooooooooooooooooooongSubreddit", 17 | body="Hello world", 18 | createdAt=timestamp, 19 | dataType=RedditDataType.POST, 20 | title="Title text", 21 | ) 22 | entity = RedditContent.to_data_entity(content=content) 23 | 24 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 25 | self.assertEqual(entity.label.value, "r/looooooooooooooooooooooooongsu") 26 | 27 | def test_label_truncation_lower(self): 28 | """Tests truncation of characters that become longer when .lower() is used on them.""" 29 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 30 | content = RedditContent( 31 | id="postId", 32 | url="https://reddit.com/123", 33 | username="user1", 34 | communityName="r/İsrailleTicaretFilistineİhanet", 35 | body="Hello world", 36 | createdAt=timestamp, 37 | dataType=RedditDataType.POST, 38 | title="Title text", 39 | ) 40 | entity = RedditContent.to_data_entity(content=content) 41 | 42 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 43 | self.assertEqual(entity.label.value, "r/i̇srailleticaretfilistinei̇han") 44 | 45 | def test_to_data_entity_obfuscated(self): 46 | timestamp = dt.datetime( 47 | year=2024, 48 | month=3, 49 | day=1, 50 | hour=1, 51 | minute=1, 52 | second=1, 53 | microsecond=1, 54 | tzinfo=dt.timezone.utc, 55 | ) 56 | content = RedditContent( 57 | id="postId", 58 | url="https://reddit.com/123", 59 | username="user1", 60 | communityName="r/bitcoin", 61 | body="Hello world", 62 | createdAt=timestamp, 63 | dataType=RedditDataType.POST, 64 | title="Title text", 65 | ) 66 | 67 | # Convert to entity and back to check granularity of the content timestamp. 
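# to_data_entity obfuscates createdAt to minute granularity, so the roundtripped content is expected to lose seconds and microseconds while the entity keeps the full timestamp.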
68 | entity = RedditContent.to_data_entity(content=content) 69 | content_roundtrip = RedditContent.from_data_entity(entity) 70 | 71 | # The entity datetime should have full granularity but the roundtripped content should not. 72 | self.assertEqual(entity.datetime, timestamp) 73 | self.assertEqual( 74 | content_roundtrip.created_at, 75 | dt.datetime( 76 | year=2024, 77 | month=3, 78 | day=1, 79 | hour=1, 80 | minute=1, 81 | second=0, 82 | microsecond=0, 83 | tzinfo=dt.timezone.utc, 84 | ), 85 | ) 86 | 87 | def test_to_data_entity_content_serialization(self): 88 | """Verifies that the content is serialized correctly when converting to a DataEntity.""" 89 | content = RedditContent( 90 | id="postId", 91 | url="https://reddit.com/123", 92 | username="user1", 93 | communityName="r/bitcoin", 94 | body="Hello world", 95 | createdAt=dt.datetime(2024, 3, 30, 1, 2, 3, tzinfo=dt.timezone.utc), 96 | dataType=RedditDataType.POST, 97 | title="Title text", 98 | ) 99 | 100 | # Convert to entity and back to check granularity of the content timestamp. 101 | entity = RedditContent.to_data_entity(content=content) 102 | 103 | self.assertEqual( 104 | entity.content, 105 | b'{"id": "postId", "url": "https://reddit.com/123", "username": "user1", "communityName": "r/bitcoin", "body": "Hello world", "createdAt": "2024-03-30T01:02:00+00:00", "dataType": "post", "title": "Title text", "parentId": null}', 106 | ) 107 | 108 | 109 | if __name__ == "__main__": 110 | unittest.main() 111 | -------------------------------------------------------------------------------- /tests/scraping/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import datetime as dt 3 | 4 | from scraping import utils 5 | 6 | 7 | class TestUtils(unittest.TestCase): 8 | def test_obfuscate_datetime_to_minute(self): 9 | test_date = dt.datetime( 10 | year=2024, 11 | month=1, 12 | day=2, 13 | hour=3, 14 | minute=4, 15 | second=5, 16 | microsecond=6, 17 | tzinfo=dt.timezone.utc, 18 | ) 19 | 20 | obfuscated_date = utils.obfuscate_datetime_to_minute(test_date) 21 | 22 | self.assertEqual( 23 | obfuscated_date, 24 | dt.datetime( 25 | year=2024, 26 | month=1, 27 | day=2, 28 | hour=3, 29 | minute=4, 30 | second=0, 31 | microsecond=0, 32 | tzinfo=dt.timezone.utc, 33 | ), 34 | ) 35 | -------------------------------------------------------------------------------- /tests/scraping/x/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/scraping/x/__init__.py -------------------------------------------------------------------------------- /tests/scraping/x/test_model.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import unittest 3 | from common import constants 4 | 5 | from sympy import timed 6 | from scraping.x.model import XContent 7 | 8 | 9 | class TestModel(unittest.TestCase): 10 | def test_equality(self): 11 | """Tests validation of equivalent XContent instances.""" 12 | timestamp = dt.datetime.now() 13 | # Create two XContent instances with the same values 14 | xcontent1 = XContent( 15 | username="user1", 16 | text="Hello world", 17 | url="https://twitter.com/123", 18 | timestamp=timestamp, 19 | tweet_hashtags=["#bittensor", "$TAO"], 20 | ) 21 | xcontent2 = XContent( 22 | username="user1", 23 | text="Hello world", 24 | url="https://twitter.com/123", 25 | timestamp=timestamp, 26 | 
tweet_hashtags=["#bittensor", "$TAO"], 27 | ) 28 | 29 | # Check if the two instances are equivalent 30 | self.assertTrue(xcontent1 == xcontent2) 31 | self.assertTrue(xcontent2 == xcontent1) 32 | 33 | def test_equality_not_equivalent(self): 34 | """Tests validation of non-equivalent XContent instances.""" 35 | timestamp = dt.datetime.now() 36 | content = XContent( 37 | username="user1", 38 | text="Hello world", 39 | url="https://twitter.com/123", 40 | timestamp=timestamp, 41 | tweet_hashtags=["#bittensor", "$TAO"], 42 | ) 43 | 44 | non_matching_content = [ 45 | content.copy(update={"username": "user2"}), 46 | content.copy(update={"text": "Hello world!"}), 47 | content.copy(update={"url": "https://twitter.com/456"}), 48 | content.copy(update={"timestamp": timestamp + dt.timedelta(seconds=1)}), 49 | # Hashtag ordering needs to be deterministic. Verify changing the order of the hashtags makes the content non-equivalent. 50 | content.copy(update={"tweet_hashtags": ["#TAO", "#bittensor"]}), 51 | ] 52 | 53 | for c in non_matching_content: 54 | self.assertFalse(content == c) 55 | self.assertFalse(c == content) 56 | 57 | def test_label_truncation(self): 58 | """Tests that XContents correctly truncate labels to 32 characters when converting to DataEntities""" 59 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 60 | content = XContent( 61 | username="user1", 62 | text="Hello world", 63 | url="https://twitter.com/123", 64 | timestamp=timestamp, 65 | tweet_hashtags=["#loooooooooooooooooooooooonghashtag", "$TAO"], 66 | ) 67 | entity = XContent.to_data_entity(content=content) 68 | 69 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 70 | self.assertEqual(entity.label.value, "#loooooooooooooooooooooooonghash") 71 | 72 | def test_label_truncation_lower(self): 73 | """Tests truncation of characters that become longer when .lower() is used on them.""" 74 | timestamp = dt.datetime.now(tz=dt.timezone.utc) 75 | content = XContent( 76 | username="user1", 77 | text="Hello world", 78 | url="https://twitter.com/123", 79 | timestamp=timestamp, 80 | tweet_hashtags=["#İsrailleTicaretFilistineİhanet", "$TAO"], 81 | ) 82 | entity = XContent.to_data_entity(content=content) 83 | 84 | self.assertEqual(len(entity.label.value), constants.MAX_LABEL_LENGTH) 85 | self.assertEqual(entity.label.value, "#i̇srailleticaretfilistinei̇hane") 86 | 87 | def test_to_data_entity_obfuscated(self): 88 | timestamp = dt.datetime( 89 | year=2024, 90 | month=3, 91 | day=1, 92 | hour=1, 93 | minute=1, 94 | second=1, 95 | microsecond=1, 96 | tzinfo=dt.timezone.utc, 97 | ) 98 | content = XContent( 99 | username="user1", 100 | text="Hello world", 101 | url="https://twitter.com/123", 102 | timestamp=timestamp, 103 | tweet_hashtags=["#bittensor", "$TAO"], 104 | ) 105 | 106 | # Convert to entity and back to check granularity of the content timestamp. 107 | entity = XContent.to_data_entity(content=content) 108 | content_roundtrip = XContent.from_data_entity(entity) 109 | 110 | # The entity datetime should have full granularity but the roundtripped content should not. 
111 | self.assertEqual(entity.datetime, timestamp) 112 | self.assertEqual( 113 | content_roundtrip.timestamp, 114 | dt.datetime( 115 | year=2024, 116 | month=3, 117 | day=1, 118 | hour=1, 119 | minute=1, 120 | second=0, 121 | microsecond=0, 122 | tzinfo=dt.timezone.utc, 123 | ), 124 | ) 125 | 126 | def test_to_data_entity_content_serialization(self): 127 | """Verifies that the content is serialized correctly when converting to a DataEntity.""" 128 | content = XContent( 129 | username="user1", 130 | text="Hello world", 131 | url="https://twitter.com/123", 132 | timestamp=dt.datetime(2024, 3, 30, 1, 2, 3, tzinfo=dt.timezone.utc), 133 | tweet_hashtags=["#bittensor", "$TAO"], 134 | ) 135 | 136 | # Convert to entity and back to check granularity of the content timestamp. 137 | entity = XContent.to_data_entity(content=content) 138 | 139 | # The content should not contain the model_config field. 140 | self.assertEqual( 141 | entity.content, 142 | b'{"username": "user1", "text": "Hello world", "url": "https://twitter.com/123", "timestamp": "2024-03-30T01:02:00+00:00", "tweet_hashtags": ["#bittensor", "$TAO"]}', 143 | ) 144 | 145 | 146 | if __name__ == "__main__": 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /tests/storage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/storage/__init__.py -------------------------------------------------------------------------------- /tests/storage/miner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/storage/miner/__init__.py -------------------------------------------------------------------------------- /tests/storage/validator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/storage/validator/__init__.py -------------------------------------------------------------------------------- /tests/test_all.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | def create_test_suite(): 5 | test_suite = unittest.TestSuite() 6 | 7 | # Find all tests in the current directory and subdirectories 8 | loader = unittest.TestLoader() 9 | suite = loader.discover("./tests", pattern="test_*.py") 10 | print(loader.errors) 11 | 12 | # Add the discovered tests to the test suite 13 | test_suite.addTest(suite) 14 | 15 | return test_suite 16 | 17 | 18 | # TODO: Fix this. 19 | # The tests fail because of a ModuleNotFoundError. 
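# Possible workaround (editor's assumption, untested): the ModuleNotFoundError typically means the
# repository root is not on sys.path, so packages such as `common` and `scraping` cannot be imported.
# Running discovery from the repo root, e.g.
#   python -m unittest discover ./tests -p "test_*.py"
# or exporting PYTHONPATH to the repo root before invoking this script, may resolve it.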
20 | if __name__ == "__main__": 21 | suite = create_test_suite() 22 | 23 | # Run the tests 24 | runner = unittest.TextTestRunner() 25 | result = runner.run(suite) 26 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Any, Callable, Iterable, Tuple 3 | import time 4 | import datetime as dt 5 | 6 | from common.data import ( 7 | CompressedMinerIndex, 8 | DataSource, 9 | ) 10 | from common.data_v2 import ScorableDataEntityBucket, ScorableMinerIndex 11 | 12 | 13 | def get_only_element_matching_filter( 14 | iterable: Iterable[Any], filter: Callable[[Any], bool] 15 | ) -> Any: 16 | """Returns the only element in the iterable that matches the filter, or raises an exception if there are zero or more than one elements.""" 17 | results = [x for x in iterable if filter(x)] 18 | if len(results) != 1: 19 | raise Exception( 20 | f"Expected exactly one element matching filter, but found {len(results)}" 21 | ) 22 | return results[0] 23 | 24 | 25 | def wait_for_condition(condition: Callable[[], bool], timeout: float = 10.0): 26 | """Waits until the provided condition is true, or until the timeout is reached.""" 27 | start_time = time.time() 28 | while not condition(): 29 | if time.time() - start_time > timeout: 30 | raise Exception("Timed out waiting for condition to be true.") 31 | time.sleep(0.1) 32 | 33 | 34 | def convert_compressed_index_to_scorable_miner_index( 35 | index: CompressedMinerIndex, last_updated: dt.datetime 36 | ) -> ScorableMinerIndex: 37 | """Converts a CompressedMinerIndex to a ScorableMinerIndex, assuming size_bytes are fully scorable.""" 38 | 39 | return ScorableMinerIndex( 40 | scorable_data_entity_buckets=[ 41 | ScorableDataEntityBucket( 42 | time_bucket_id=time_bucket_id, 43 | source=source, 44 | label=bucket.label, 45 | size_bytes=size_bytes, 46 | scorable_bytes=size_bytes, 47 | ) 48 | for source in index.sources 49 | for bucket in index.sources[source] 50 | for time_bucket_id, size_bytes in zip( 51 | bucket.time_bucket_ids, bucket.sizes_bytes 52 | ) 53 | ], 54 | last_updated=last_updated, 55 | ) 56 | 57 | 58 | def are_scorable_indexes_equal( 59 | index1: ScorableMinerIndex, index2: ScorableMinerIndex 60 | ) -> Tuple[bool, str]: 61 | """Compares two ScorableMinerIndex instances for equality.""" 62 | 63 | # Compare the last_updated fields. 64 | if index1.last_updated != index2.last_updated: 65 | return ( 66 | False, 67 | f"last_updated fields do not match. {index1.last_updated} != {index2.last_updated}", 68 | ) 69 | 70 | def sort_key(bucket: ScorableDataEntityBucket): 71 | return ( 72 | bucket.time_bucket_id, 73 | bucket.source, 74 | bucket.label if bucket.label else "NULL", 75 | ) 76 | 77 | index1_sorted = sorted(index1.scorable_data_entity_buckets, key=sort_key) 78 | index2_sorted = sorted(index2.scorable_data_entity_buckets, key=sort_key) 79 | for bucket1, bucket2 in zip(index1_sorted, index2_sorted): 80 | if bucket1 != bucket2: 81 | return ( 82 | False, 83 | f"Buckets do not match. {bucket1} != {bucket2}", 84 | ) 85 | 86 | return True, None 87 | 88 | 89 | def are_compressed_indexes_equal( 90 | index1: CompressedMinerIndex, index2: CompressedMinerIndex 91 | ) -> bool: 92 | """Compares two CompressedMinerIndex instances for equality.""" 93 | 94 | # Iterate both indexes, in order of sources. 
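# Editor's note: zip() stops at the shorter of the two source lists, so an index that contains extra
# sources would still compare as equal here. A stricter variant (suggestion, not part of the original
# helper) could first check: if set(index1.sources) != set(index2.sources): return False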
95 | for source1, source2 in zip(sorted(index1.sources), sorted(index2.sources)): 96 | if source1 != source2: 97 | print(f"Sources do not match. {source1} != {source2}") 98 | return False 99 | 100 | # For a given source, compare the buckets. 101 | buckets1 = sorted( 102 | index1.sources[source1], key=lambda b: b.label if b.label else "NULL" 103 | ) 104 | buckets2 = sorted( 105 | index2.sources[source2], key=lambda b: b.label if b.label else "NULL" 106 | ) 107 | if buckets1 != buckets2: 108 | print(f"Buckets do not match. {buckets1} != {buckets2}") 109 | return False 110 | 111 | return True 112 | 113 | 114 | def create_scorable_index(num_buckets: int) -> ScorableMinerIndex: 115 | """Creates a ScorableMinerIndex with ~ the specified number of buckets.""" 116 | assert num_buckets > 1000 117 | 118 | labels = [f"label{i}" for i in range(num_buckets // 2 // 500)] 119 | time_buckets = [i for i in range(1, (num_buckets // 2 // len(labels)) + 1)] 120 | 121 | # Split the buckets equally between the Reddit and X sources, reusing the same labels and time buckets for each. 122 | buckets = [] 123 | for source in [DataSource.REDDIT.value, DataSource.X.value]: 124 | for time_bucket in time_buckets: 125 | for label in labels: 126 | size = random.randint(50, 1000) 127 | scorable_bytes = int(random.random() * size) 128 | buckets.append( 129 | ScorableDataEntityBucket( 130 | time_bucket_id=time_bucket, 131 | source=source, 132 | label=label, 133 | size_bytes=size, 134 | scorable_bytes=scorable_bytes, 135 | ) 136 | ) 137 | return ScorableMinerIndex( 138 | scorable_data_entity_buckets=buckets, last_updated=dt.datetime.now() 139 | ) 140 | -------------------------------------------------------------------------------- /tests/vali_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/tests/vali_utils/__init__.py -------------------------------------------------------------------------------- /tests/vali_utils/test_miner_iterator.py: -------------------------------------------------------------------------------- 1 | from vali_utils.miner_iterator import MinerIterator 2 | import unittest 3 | 4 | 5 | class TestMinerIterator(unittest.TestCase): 6 | def test_miner_uids_are_sorted(self): 7 | """Creates a MinerIterator with unsorted miner UIDs and verifies that the miner UIDs are sorted.""" 8 | uids = [2, 5, 1, 0] 9 | iterator = MinerIterator(uids) 10 | 11 | # The iterator starts at a random position. Move it until we're pointing to 0. 12 | while iterator.peek() != 0: 13 | next(iterator) 14 | 15 | # Now verify the UIDs are iterated in sorted order.
16 | iterated_uids = [next(iterator) for _ in range(len(uids))] 17 | self.assertEqual(iterated_uids, sorted(uids)) 18 | 19 | def test_iterator_is_infinite(self): 20 | """Creates a MinerIterator and verifies calling it more times than the number of miner UIDs cycles the UIDs.""" 21 | uids = [3, 2, 1] 22 | expected = [1, 2, 3] * 10 23 | iterator = MinerIterator(uids) 24 | iterated_uids = [next(iterator) for _ in range(30)] 25 | self.assertEqual(sorted(iterated_uids), sorted(expected)) 26 | 27 | def test_peek(self): 28 | """Creates a MinerIterator and verifies that peek returns the next UID without advancing the iterator.""" 29 | uids = [1, 2, 3] 30 | iterator = MinerIterator(uids) 31 | 32 | peeked = iterator.peek() 33 | self.assertEqual(peeked, iterator.peek()) 34 | self.assertEqual(peeked, next(iterator)) 35 | self.assertNotEqual(peeked, iterator.peek()) 36 | 37 | def test_set_miner_uids(self): 38 | """Verifies the iterator position is maintained when the miner UIDs are updated.""" 39 | initial_miner_uids = [1, 2, 3, 4, 5] 40 | iterator = MinerIterator(initial_miner_uids) 41 | 42 | # Advance the iterator so it should now point to 3 43 | # The iterator starts at a random position. Advance it until it returns 2. 44 | while next(iterator) != 2: 45 | pass 46 | 47 | iterator.set_miner_uids([1, 4, 6]) 48 | 49 | # Verify the iterator picks up from the next UID greater than or equal to 3. 50 | self.assertEqual(next(iterator), 4) 51 | self.assertEqual(next(iterator), 6) 52 | self.assertEqual(next(iterator), 1) 53 | 54 | def test_set_miner_uids_edge_case(self): 55 | """Verifies the iterator position is reset when the miner UIDs are updated and the current position is no longer valid.""" 56 | # Create a MinerIterator with initial miner UIDs 57 | initial_miner_uids = [1, 2, 3, 4, 5] 58 | iterator = MinerIterator(initial_miner_uids) 59 | 60 | # Advance the iterator so it should now point to 5 61 | while iterator.peek() != 5: 62 | next(iterator) 63 | 64 | iterator.set_miner_uids([1, 2, 3, 4]) 65 | 66 | self.assertEqual(next(iterator), 1) 67 | self.assertEqual(next(iterator), 2) 68 | self.assertEqual(next(iterator), 3) 69 | self.assertEqual(next(iterator), 4) 70 | 71 | 72 | if __name__ == "__main__": 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /tests/vali_utils/test_validator_s3_access.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import sys 4 | import bittensor as bt 5 | from pathlib import Path 6 | import json 7 | from vali_utils.validator_s3_access import ValidatorS3Access 8 | 9 | 10 | def main(): 11 | # Parse arguments 12 | parser = argparse.ArgumentParser(description="Test S3 access for validators") 13 | parser.add_argument("--wallet", type=str, required=True, help="Wallet name") 14 | parser.add_argument("--hotkey", type=str, required=True, help="Hotkey name") 15 | parser.add_argument("--s3_auth_url", type=str, default="https://sn13-data.api.macrocosmos.ai", 16 | help="S3 authentication URL") 17 | parser.add_argument("--netuid", type=int, default=13, help="Network UID") 18 | parser.add_argument("--network", type=str, default="finney", help="Network name") 19 | parser.add_argument("--action", type=str, choices=['auth', 'list_sources', 'list_miners', 'list_files'], 20 | default='auth', help="Action to perform") 21 | parser.add_argument("--source", type=str, help="Data source (x or reddit)") 22 | parser.add_argument("--miner", type=str, help="Miner ID 
(coldkey)") 23 | 24 | args = parser.parse_args() 25 | 26 | # Create config 27 | config = bt.config() 28 | config.netuid = args.netuid 29 | config.s3_auth_url = args.s3_auth_url 30 | 31 | # Create wallet and S3 access 32 | wallet = bt.wallet(name=args.wallet, hotkey=args.hotkey) 33 | s3_access = ValidatorS3Access( 34 | wallet=wallet, 35 | s3_auth_url=args.s3_auth_url 36 | ) 37 | 38 | # Perform requested action 39 | if args.action == 'auth': 40 | # Test authentication 41 | if s3_access.ensure_access(): 42 | print("✅ Authentication successful") 43 | print(f"Access data received:") 44 | 45 | # Print readable summary of access data 46 | access_data = s3_access.access_data 47 | print(f" Bucket: {access_data.get('bucket')}") 48 | print(f" Region: {access_data.get('region')}") 49 | print(f" Expiry: {access_data.get('expiry')}") 50 | 51 | # Print URLs structure 52 | urls = access_data.get('urls', {}) 53 | sources = urls.get('sources', {}) 54 | print(f" Available sources: {list(sources.keys())}") 55 | 56 | return 0 57 | else: 58 | print("❌ Authentication failed") 59 | return 1 60 | 61 | elif args.action == 'list_sources': 62 | # List available sources 63 | sources = s3_access.list_sources() 64 | if sources: 65 | print(f"✅ Available sources: {sources}") 66 | return 0 67 | else: 68 | print("❌ Failed to list sources or none available") 69 | return 1 70 | 71 | elif args.action == 'list_miners': 72 | # List miners for a source 73 | if not args.source: 74 | print("❌ --source is required for list_miners action") 75 | return 1 76 | 77 | miners = s3_access.list_miners(args.source) 78 | if miners: 79 | print(f"✅ Found {len(miners)} miners for source {args.source}:") 80 | for m in miners[:20]: # Show first 20 81 | print(f" - {m}") 82 | if len(miners) > 20: 83 | print(f" ... and {len(miners) - 20} more") 84 | return 0 85 | else: 86 | print(f"❌ No miners found for source {args.source} or listing failed") 87 | return 1 88 | 89 | elif args.action == 'list_files': 90 | # List files for a miner 91 | if not args.source or not args.miner: 92 | print("❌ --source and --miner are required for list_files action") 93 | return 1 94 | 95 | files = s3_access.list_files(args.source, args.miner) 96 | if files: 97 | print(f"✅ Found {len(files)} files for miner {args.miner} in source {args.source}:") 98 | for i, f in enumerate(files[:10]): # Show first 10 99 | print(f" {i + 1}. {f['filename']} ({f['size']} bytes, modified: {f['last_modified']})") 100 | if len(files) > 10: 101 | print(f" ... 
and {len(files) - 10} more") 102 | return 0 103 | else: 104 | print(f"❌ No files found for miner {args.miner} or listing failed") 105 | return 1 106 | 107 | return 0 108 | 109 | 110 | if __name__ == "__main__": 111 | sys.exit(main()) -------------------------------------------------------------------------------- /vali_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/macrocosm-os/data-universe/231d5177dcd070b058d2ad467f0cffc9b0259647/vali_utils/__init__.py -------------------------------------------------------------------------------- /vali_utils/api/auth/key_routes.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends 2 | from .auth import require_master_key, key_manager 3 | from pydantic import BaseModel 4 | from typing import List 5 | from vali_utils.api.utils import endpoint_error_handler 6 | 7 | class APIKeyCreate(BaseModel): 8 | name: str 9 | 10 | 11 | class APIKeyResponse(BaseModel): 12 | key: str 13 | name: str 14 | 15 | 16 | router = APIRouter(tags=["key management"]) 17 | 18 | 19 | @router.post("", response_model=APIKeyResponse) 20 | @endpoint_error_handler 21 | async def create_api_key( 22 | request: APIKeyCreate, 23 | _: bool = Depends(require_master_key) 24 | ): 25 | """Create new API key (requires master key)""" 26 | key = key_manager.create_api_key(request.name) 27 | return {"key": key, "name": request.name} 28 | 29 | 30 | @router.get("") 31 | @endpoint_error_handler 32 | async def list_api_keys(_: bool = Depends(require_master_key)): 33 | """List all API keys (requires master key)""" 34 | return {"keys": key_manager.list_api_keys()} 35 | 36 | 37 | @router.post("/{key}/deactivate") 38 | @endpoint_error_handler 39 | async def deactivate_api_key( 40 | key: str, 41 | _: bool = Depends(require_master_key) 42 | ): 43 | """Deactivate an API key (requires master key)""" 44 | key_manager.deactivate_api_key(key) 45 | return {"status": "success"} -------------------------------------------------------------------------------- /vali_utils/api/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field, field_validator 2 | from typing import List, Optional, Dict, Any 3 | import datetime as dt 4 | from common.data import DataSource, StrictBaseModel 5 | 6 | 7 | class DesirabilityRequest(BaseModel): 8 | desirabilities: List[Dict[str, Any]] = Field( 9 | description="List of source items with label weights" 10 | ) 11 | 12 | 13 | class QueryRequest(StrictBaseModel): 14 | """Request model for data queries""" 15 | source: str = Field( 16 | ..., # Required field 17 | description="Data source (x or reddit)" 18 | ) 19 | usernames: List[str] = Field( 20 | default_factory=list, 21 | description="List of usernames to fetch data from", 22 | max_length=10 23 | ) 24 | keywords: List[str] = Field( 25 | default_factory=list, 26 | description="List of keywords to search for", 27 | max_length=5 28 | ) 29 | # Change to optional strings for ISO format 30 | start_date: Optional[str] = Field( 31 | default=None, 32 | description="Start date (ISO format)" 33 | ) 34 | end_date: Optional[str] = Field( 35 | default=None, 36 | description="End date (ISO format)" 37 | ) 38 | limit: int = Field( 39 | default=100, 40 | ge=1, 41 | le=1000, 42 | description="Maximum number of items to return" 43 | ) 44 | 45 | @field_validator('source') 46 | @classmethod 47 | def validate_source(cls, v: str) 
-> str: 48 | try: 49 | source = DataSource[v.upper()] 50 | if source.weight == 0: # Check if it's an active source 51 | raise ValueError(f"Source {v} is not currently active") 52 | return v.upper() # Return uppercase to match enum 53 | except KeyError: 54 | valid_sources = [s.name.lower() for s in DataSource if s.weight > 0] 55 | raise ValueError(f"Invalid source. Must be one of: {valid_sources}") 56 | 57 | 58 | class QueryResponse(StrictBaseModel): 59 | """Response model for data queries""" 60 | status: str = Field(description="Request status (success/error)") 61 | data: List[Dict[str, Any]] = Field(default_factory=list) 62 | meta: Dict[str, Any] = Field( 63 | default_factory=dict, 64 | description="Additional metadata about the request" 65 | ) 66 | 67 | 68 | class DataItem(StrictBaseModel): 69 | """Single data item in response""" 70 | content: bytes 71 | datetime: dt.datetime 72 | uri: str 73 | source: DataSource 74 | label: Optional[str] = None 75 | 76 | 77 | class HfReposResponse(BaseModel): 78 | count: int 79 | repo_names: List[str] 80 | 81 | 82 | class HealthResponse(StrictBaseModel): 83 | """Response model for health check""" 84 | status: str = Field(description="Service status") 85 | timestamp: dt.datetime = Field(description="Current UTC timestamp") 86 | miners_available: int = Field(description="Number of available miners") 87 | version: str = Field(default="1.0.0", description="API version") 88 | netuid: int = Field(description="Network UID") 89 | hotkey: str = Field(description="Validator hotkey address") 90 | 91 | 92 | class MinerInfo(BaseModel): 93 | """Information about a miner's current data""" 94 | hotkey: str 95 | credibility: float 96 | bucket_count: int 97 | content_size_bytes_reddit: int 98 | content_size_bytes_twitter: int 99 | last_updated: dt.datetime 100 | 101 | 102 | class LabelSize(BaseModel): 103 | """Content size information for a specific label""" 104 | label_value: str 105 | content_size_bytes: int 106 | adj_content_size_bytes: int 107 | 108 | 109 | class AgeSize(BaseModel): 110 | """Content size information for a specific time bucket""" 111 | time_bucket_id: int 112 | content_size_bytes: int 113 | adj_content_size_bytes: int 114 | 115 | 116 | class LabelBytes(BaseModel): 117 | """Byte size information for a particular label""" 118 | label: str 119 | total_bytes: int 120 | adj_total_bytes: float 121 | -------------------------------------------------------------------------------- /vali_utils/api/server.py: -------------------------------------------------------------------------------- 1 | import time 2 | from fastapi import FastAPI, Depends, HTTPException, Request 3 | from fastapi.middleware.cors import CORSMiddleware 4 | from fastapi.openapi.docs import get_swagger_ui_html, get_redoc_html 5 | import uvicorn 6 | from threading import Thread 7 | import bittensor as bt 8 | from typing import Optional 9 | from .routes import router, get_validator 10 | from vali_utils.api.auth.key_routes import router as key_router 11 | from vali_utils.api.auth.auth import APIKeyManager, key_manager, require_master_key 12 | from vali_utils.api.utils import endpoint_error_handler 13 | 14 | 15 | class ValidatorAPI: 16 | """API server for validator on-demand queries""" 17 | 18 | def __init__(self, validator, port: int = 8000): 19 | """ 20 | Initialize API server 21 | 22 | Args: 23 | validator: Validator instance 24 | port: Port number to run API on 25 | """ 26 | self.validator = validator 27 | self.port = port 28 | self.key_manager = key_manager 29 | self.app = self._create_app() 
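# Note: key_manager backs both API-key authentication and the rate-limit headers added by the
# middleware in _create_app(); the uvicorn server itself is only launched by start(), on a daemon
# thread bound to 0.0.0.0 at the configured port.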
30 | self.server_thread: Optional[Thread] = None 31 | 32 | def _create_app(self) -> FastAPI: 33 | """Create and configure FastAPI application""" 34 | app = FastAPI( 35 | title="Data Universe Validator API", 36 | description="API for on-demand data queries from the Data Universe network", 37 | version="1.0.0", 38 | docs_url=None, # Disable default docs routes 39 | ) 40 | 41 | # Add CORS middleware 42 | app.add_middleware( 43 | CORSMiddleware, 44 | allow_origins=["*"], 45 | allow_credentials=True, 46 | allow_methods=["*"], 47 | allow_headers=["*"], 48 | ) 49 | 50 | # Protected Swagger UI docs endpoint 51 | @app.get("/docs", include_in_schema=False) 52 | async def get_docs(_: bool = Depends(require_master_key)): 53 | return get_swagger_ui_html( 54 | openapi_url="/openapi.json", 55 | title="API Documentation" 56 | ) 57 | 58 | # Protected ReDoc docs endpoint using default styling 59 | @app.get("/redoc", include_in_schema=False) 60 | async def get_redoc(_: bool = Depends(require_master_key)): 61 | return get_redoc_html( 62 | openapi_url="/openapi.json", 63 | title="API Documentation" 64 | ) 65 | 66 | # Protected OpenAPI JSON schema endpoint 67 | @app.get("/openapi.json", include_in_schema=False) 68 | @endpoint_error_handler 69 | async def openapi_schema(_: bool = Depends(require_master_key)): 70 | try: 71 | if not app.openapi_schema: 72 | from fastapi.openapi.utils import get_openapi 73 | app.openapi_schema = get_openapi( 74 | title=app.title, 75 | version=app.version, 76 | description=app.description, 77 | routes=app.routes, 78 | ) 79 | # Remove sensitive security information if needed 80 | for path in app.openapi_schema.get("paths", {}).values(): 81 | for operation in path.values(): 82 | if "security" in operation: 83 | del operation["security"] 84 | return app.openapi_schema 85 | except Exception as e: 86 | bt.logging.error(f"Failed to generate OpenAPI schema: {str(e)}") 87 | raise HTTPException(status_code=500, detail="Could not generate API documentation") 88 | 89 | # Rate limit headers middleware 90 | @app.middleware("http") 91 | async def add_rate_limit_headers(request: Request, call_next): 92 | response = await call_next(request) 93 | api_key = request.headers.get("X-API-Key") 94 | if api_key and self.key_manager.is_valid_key(api_key): 95 | _, headers = self.key_manager.check_rate_limit(api_key) 96 | for header_name, header_value in headers.items(): 97 | response.headers[header_name] = header_value 98 | return response 99 | 100 | # Set validator instance for dependency injection 101 | get_validator.api = self 102 | 103 | # Include API routes 104 | app.include_router(router, prefix="/api/v1") 105 | app.include_router(key_router, prefix="/api/v1/keys") 106 | 107 | return app 108 | 109 | def start(self): 110 | """Start API server with better error handling""" 111 | if self.server_thread and self.server_thread.is_alive(): 112 | bt.logging.warning("API server already running") 113 | return 114 | 115 | def run_server(): 116 | try: 117 | bt.logging.info(f"Starting API server on port {self.port}") 118 | uvicorn.run( 119 | self.app, 120 | host="0.0.0.0", 121 | port=self.port, 122 | log_level="info" 123 | ) 124 | except Exception as e: 125 | bt.logging.error(f"API server error: {str(e)}") 126 | 127 | self.server_thread = Thread(target=run_server, daemon=True) 128 | self.server_thread.start() 129 | bt.logging.success(f"API server started on port {self.port}") 130 | 131 | def stop(self): 132 | """Stop API server""" 133 | bt.logging.info("Stopping API server") 134 | # The uvicorn server will stop 
when the thread is terminated 135 | self.server_thread = None # Allow for garbage collection 136 | 137 | def restart(self): 138 | """Restart API server""" 139 | bt.logging.info("Restarting API server") 140 | self.stop() 141 | time.sleep(2) # Give it a moment to fully stop 142 | self.start() 143 | def stop(self): 144 | """Stop API server""" 145 | if self.server_thread and self.server_thread.is_alive(): 146 | self.server_thread.join(timeout=5) 147 | bt.logging.info("API server stopped") -------------------------------------------------------------------------------- /vali_utils/api/utils.py: -------------------------------------------------------------------------------- 1 | import bittensor as bt 2 | import random 3 | from fastapi import HTTPException 4 | from functools import wraps 5 | from common.organic_protocol import OrganicRequest 6 | 7 | def select_validation_samples(data, sample_size: int = 1): 8 | """Select random samples from the data for validation""" 9 | if not data: 10 | return [] 11 | 12 | # Select up to sample_size random items, or all items if less than sample_size 13 | sample_count = min(sample_size, len(data)) 14 | return random.sample(data, sample_count) 15 | 16 | 17 | def endpoint_error_handler(func): 18 | """Return 500 status code if endpoint failed""" 19 | @wraps(func) 20 | async def wrapper(*args, **kwargs): 21 | try: 22 | return await func(*args, **kwargs) 23 | except HTTPException: 24 | # Re-raise FastAPI HTTP exceptions 25 | raise 26 | except Exception as e: 27 | bt.logging.error(f"API endpoint error: {str(e)}") 28 | raise HTTPException( 29 | status_code=500, 30 | detail="Internal server error" 31 | ) 32 | return wrapper 33 | 34 | async def query_validator( 35 | wallet: bt.wallet, 36 | validator_host: str, 37 | validator_port: int, 38 | validator_hotkey: str, 39 | source: str, 40 | keywords: list = [], 41 | usernames: list = [], 42 | start_date: str = None, 43 | end_date: str = None, 44 | limit: int = 1000 45 | ): 46 | """ 47 | Query a validator using the OrganicRequest protocol 48 | 49 | Args: 50 | wallet: Bittensor wallet for signing the request 51 | validator_host: Validator IP address or hostname 52 | validator_port: Validator port number 53 | validator_hotkey: Validator hotkey (str) 54 | source: Data source (X or REDDIT) 55 | keywords: List of keywords to search for 56 | usernames: List of usernames to search for 57 | start_date: ISO-formatted start date 58 | end_date: ISO-formatted end date 59 | limit: Maximum number of results to return 60 | 61 | Returns: 62 | OrganicRequest response with data or error information 63 | """ 64 | bt.logging.info(f"Querying validator at {validator_host}:{validator_port} for {source} data") 65 | 66 | # Create an AxonInfo with required fields 67 | axon_info = bt.AxonInfo( 68 | ip=validator_host, 69 | port=validator_port, 70 | ip_type=0, # v4 71 | hotkey=validator_hotkey, 72 | coldkey="", # Not needed 73 | protocol=0, 74 | version=1 75 | ) 76 | 77 | # Prepare the OrganicRequest synapse 78 | synapse = OrganicRequest( 79 | source=source.upper(), 80 | usernames=usernames, 81 | keywords=keywords, 82 | start_date=start_date, 83 | end_date=end_date, 84 | limit=limit 85 | ) 86 | 87 | # Send the request to the validator 88 | try: 89 | async with bt.dendrite(wallet=wallet) as dendrite: 90 | response = await dendrite.forward( 91 | axons=[axon_info], 92 | synapse=synapse, 93 | timeout=180 # 3 minute timeout 94 | ) 95 | 96 | if not response or len(response) == 0: 97 | bt.logging.error("No response received from validator") 98 | return None 99 
| 100 | return response[0] 101 | except Exception as e: 102 | bt.logging.error(f"Error querying validator at {validator_host}:{validator_port}: {str(e)}") 103 | raise 104 | -------------------------------------------------------------------------------- /vali_utils/load_balancer/validator_registry.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | from collections import deque 4 | from typing import List, Tuple, Optional, ClassVar 5 | import bittensor as bt 6 | import numpy as np 7 | from pydantic import BaseModel, Field, model_validator 8 | from common.data import DataSource 9 | from common.organic_protocol import OrganicRequest 10 | 11 | class Validator(BaseModel): 12 | uid: int 13 | stake: float 14 | axon: str 15 | hotkey: str 16 | timeout: int = 1 # starting cooldown in seconds; doubles on failure (capped at 86400) 17 | available_at: float = 0.0 # Unix timestamp indicating when the validator is next available 18 | 19 | def update_failure(self, status: str) -> int: 20 | """ 21 | Update the validator's timeout based on failure status. 22 | """ 23 | current_time = time.time() 24 | if status != "error": 25 | self.timeout = 1 26 | self.available_at = current_time 27 | else: 28 | self.timeout = min(self.timeout * 4, 86400) 29 | self.available_at = current_time + self.timeout 30 | 31 | def is_available(self): 32 | """ 33 | Check if the validator is available based on its cooldown. 34 | """ 35 | return time.time() >= self.available_at 36 | 37 | 38 | class ValidatorRegistry(BaseModel): 39 | """ 40 | Class to store the success of forwards to validator axons. 41 | Validators that routinely fail to respond to requests are timed out. 42 | """ 43 | 44 | # Using a default factory ensures validators is always a dict. 45 | validators: dict[int, Validator] = Field(default_factory=dict) 46 | current_index: int = Field(default=0) 47 | 48 | def __init__(self, metagraph: bt.metagraph = None, organic_whitelist: List[str] = None, **data): 49 | super().__init__(**data) 50 | # Initialize with empty dict first 51 | self.validators = {} 52 | 53 | # If metagraph is provided, create validator list immediately 54 | if metagraph is not None: 55 | organic_whitelist = organic_whitelist or [] 56 | validator_uids = np.where(metagraph.stake >= 50_000)[0].tolist() 57 | validator_axons = [metagraph.axons[uid].ip_str().split("/")[2] for uid in validator_uids] 58 | validator_stakes = [metagraph.stake[uid] for uid in validator_uids] 59 | validator_hotkeys = [metagraph.hotkeys[uid] for uid in validator_uids] 60 | self.validators = { 61 | uid: Validator(uid=uid, stake=stake, axon=axon, hotkey=hotkey) 62 | for uid, stake, axon, hotkey in zip(validator_uids, validator_stakes, validator_axons, validator_hotkeys) 63 | if hotkey in organic_whitelist 64 | } 65 | bt.logging.info(f"Validator registry for organics: {self.validators}") 66 | 67 | def get_available_validators(self) -> List[int]: 68 | """ 69 | Get a list of available validators, starting from the current index for cycling. 
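        Validators still on cooldown (see Validator.update_failure) are excluded, and the result is
        rotated so that repeated calls walk through the available validators round-robin rather than
        always starting from the lowest UID.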
70 | """ 71 | available = [uid for uid, validator in self.validators.items() if validator.is_available()] 72 | 73 | if not available: 74 | return [] 75 | available.sort() 76 | 77 | # Reorder the list to start from current_index for cycling 78 | if self.current_index >= len(available): 79 | self.current_index = 0 80 | 81 | # If current_index points to a validator that's no longer available, 82 | # just start from the beginning 83 | if self.current_index >= len(available): 84 | ordered_validators = available 85 | else: 86 | # Start the list from current_index 87 | ordered_validators = available[self.current_index:] + available[:self.current_index] 88 | self.current_index = (self.current_index + 1) % max(1, len(available)) 89 | 90 | return ordered_validators 91 | 92 | def update_validators(self, uid: int, response_code: int) -> None: 93 | """ 94 | Update a specific validator's failure count based on the response code. 95 | If the validator's failure count exceeds the maximum allowed failures, 96 | the validator is removed from the registry. 97 | """ 98 | if uid in self.validators: 99 | self.validators[uid].update_failure(response_code) -------------------------------------------------------------------------------- /vali_utils/miner_iterator.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import copy 3 | import threading 4 | from typing import List 5 | 6 | import random 7 | 8 | 9 | class MinerIterator: 10 | """A thread safe infinite iterator to cyclically enumerate the current set of miner UIDs. 11 | 12 | Why? To perform miner evaluations, the validator will enumerate through the miners in order to help ensure 13 | each miner is evaluated at least once per epoch. 14 | """ 15 | 16 | def __init__(self, miner_uids: List[int]): 17 | self.miner_uids = sorted(copy.deepcopy(miner_uids)) 18 | # Start the index at a random position. This helps ensure that miners with high UIDs aren't penalized if 19 | # the validator restarts frequently. 20 | self.index = random.randint(0, len(self.miner_uids) - 1) 21 | self.lock = threading.Lock() 22 | 23 | def __iter__(self): 24 | return self 25 | 26 | def __next__(self) -> int: 27 | with self.lock: 28 | if len(self.miner_uids) == 0: 29 | # This iterator should be infinite. If there are no miner UIDs, raise an error. 30 | raise IndexError("No miner UIDs.") 31 | 32 | uid = self.miner_uids[self.index] 33 | self.index += 1 34 | if self.index >= len(self.miner_uids): 35 | self.index = 0 36 | return uid 37 | 38 | def peek(self) -> int: 39 | """Returns the next miner UID without advancing the iterator.""" 40 | with self.lock: 41 | if len(self.miner_uids) == 0: 42 | # This iterator should be infinite. If there are no miner UIDs, raise an error. 43 | raise IndexError("No miner UIDs.") 44 | 45 | return self.miner_uids[self.index] 46 | 47 | def set_miner_uids(self, miner_uids: List[int]): 48 | """Updates the miner UIDs to iterate. 49 | 50 | The iterator will be updated to the first miner uid that is greater than or equal to UID that would be next 51 | returned by the iterator. This helps ensure that frequent updates to the miner_uids does not cause too much 52 | churn in the sequence of UIDs returned by the iterator. 
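        For example (mirroring test_set_miner_uids): if the iterator would next return 3 and the UIDs
        are updated to [1, 4, 6], the subsequent values yielded are 4, 6, 1, ...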
53 | """ 54 | sorted_uids = sorted(copy.deepcopy(miner_uids)) 55 | with self.lock: 56 | next_uid = self.miner_uids[self.index] 57 | new_index = bisect.bisect_left(sorted_uids, next_uid) 58 | if new_index >= len(sorted_uids): 59 | new_index = 0 60 | self.index = new_index 61 | self.miner_uids = sorted_uids 62 | --------------------------------------------------------------------------------