├── .circleci └── config.yml ├── .dependencies_installed ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── cc_net ├── .gitignore ├── Makefile ├── README.md ├── cc_net │ ├── __init__.py │ ├── __main__.py │ ├── data │ │ └── cutoff.csv │ ├── dedup.py │ ├── execution.py │ ├── flat_hash_set.py │ ├── get_wiki_cirrus.py │ ├── jsonql.py │ ├── mine.py │ ├── minify.py │ ├── perplexity.py │ ├── process_wet_file.py │ ├── regroup.py │ ├── split_by_lang.py │ ├── stream_cc.py │ ├── text_normalizer.py │ ├── tokenizer.py │ └── tools │ │ ├── __init__.py │ │ ├── dl_cc_100.py │ │ ├── expand_corpus.py │ │ └── make_dmoz_corpus.py ├── collinfo.json ├── pyproject.toml └── setup.py ├── contrib ├── CODE_REVIEW_DOCS.md ├── CONTRIBUTING.md ├── DEVELOPMENT_WORKFLOW.md └── STYLE.md ├── core ├── detection ├── __init__.py ├── attacks │ ├── __init__.py │ ├── data_augmentation.py │ ├── delete.py │ ├── resources │ │ └── .gitkeep │ ├── spelling.py │ ├── synonym.py │ └── zero_width_space.py ├── base │ ├── __init__.py │ ├── miner.py │ ├── neuron.py │ └── validator.py ├── protocol.py ├── utils │ ├── __init__.py │ ├── config.py │ ├── misc.py │ ├── uids.py │ └── weight_version.py └── validator │ ├── __init__.py │ ├── cc_dataset.py │ ├── data_generator.py │ ├── forward.py │ ├── generate_version.py │ ├── models.py │ ├── my_datasets.py │ ├── reward.py │ ├── segmentation_processer.py │ ├── text_completion.py │ └── text_postprocessing.py ├── docs ├── FAQ.md ├── faq_1.png ├── incentive.md ├── logo.png ├── meet_its_ai.png ├── miner_solution.md ├── mining.md ├── raid_leaderboard.png ├── validating.md ├── vision_and_roadmap.md └── what_are_subnets.md ├── min_compute.yml ├── models └── ppl_model.pk ├── neurons ├── __init__.py ├── miner.py ├── miners │ ├── __init__.py │ ├── deberta_classifier.py │ └── ppl_model.py └── validator.py ├── prompting ├── __init__.py ├── agent.py ├── cleaners │ ├── __init__.py │ ├── all_cleaners.py │ └── cleaner.py ├── conversation.py ├── llm.py ├── mock.py ├── persona.py ├── tasks │ ├── __init__.py │ ├── date_qa.py │ ├── debugging.py │ ├── generic_instruction.py │ ├── math.py │ ├── qa.py │ ├── summarization.py │ └── task.py ├── tools │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── base.py │ │ ├── code.py │ │ ├── context.py │ │ ├── math.py │ │ ├── mock.py │ │ └── wiki.py │ └── selector.py └── utils │ ├── __init__.py │ └── exceptions.py ├── requirements.txt ├── run.sh ├── scripts ├── check_compatibility.sh ├── check_requirements_changes.sh ├── install_staging.sh └── start_validator.py ├── setup.py └── tests ├── __init__.py ├── helpers.py └── test_template_validator.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@2.1.1 5 | python-lib: dialogue/python-lib@0.1.55 6 | # coveralls: coveralls/coveralls@1.0.6 7 | 8 | jobs: 9 | black: 10 | resource_class: small 11 | parameters: 12 | python-version: 13 | type: string 14 | docker: 15 | - image: cimg/python:<< parameters.python-version >> 16 | 17 | steps: 18 | - checkout 19 | 20 | - restore_cache: 21 | name: Restore cached black venv 22 | keys: 23 | - v1-pypi-py-black-<< parameters.python-version >> 24 | 25 | - run: 26 | name: Update & Activate black venv 27 | command: | 28 | python -m venv env/ 29 | . 
env/bin/activate 30 | python -m pip install --upgrade pip 31 | pip install black 32 | 33 | - save_cache: 34 | name: Save cached black venv 35 | paths: 36 | - "env/" 37 | key: v1-pypi-py-black-<< parameters.python-version >> 38 | 39 | - run: 40 | name: Black format check 41 | command: | 42 | . env/bin/activate 43 | black --line-length 79 --exclude '(env|venv|.eggs)' --check . 44 | 45 | pylint: 46 | resource_class: small 47 | parameters: 48 | python-version: 49 | type: string 50 | docker: 51 | - image: cimg/python:<< parameters.python-version >> 52 | 53 | steps: 54 | - checkout 55 | 56 | - run: 57 | name: Install Pylint 58 | command: | 59 | python -m venv env/ 60 | . env/bin/activate 61 | pip install pylint 62 | 63 | - run: 64 | name: Pylint check 65 | command: | 66 | . env/bin/activate 67 | pylint --fail-on=W,E,F --exit-zero ./ 68 | 69 | check_compatibility: 70 | parameters: 71 | python_version: 72 | type: string 73 | docker: 74 | - image: cimg/python:3.10 75 | steps: 76 | - checkout 77 | - run: 78 | name: Check if requirements files have changed 79 | command: ./scripts/check_requirements_changes.sh 80 | - run: 81 | name: Install dependencies and Check compatibility 82 | command: | 83 | if [ "$REQUIREMENTS_CHANGED" == "true" ]; then 84 | sudo apt-get update 85 | sudo apt-get install -y jq curl 86 | ./scripts/check_compatibility.sh << parameters.python_version >> 87 | else 88 | echo "Skipping compatibility checks..." 89 | fi 90 | 91 | build: 92 | resource_class: medium 93 | parallelism: 2 94 | parameters: 95 | python-version: 96 | type: string 97 | docker: 98 | - image: cimg/python:<< parameters.python-version >> 99 | 100 | steps: 101 | - checkout 102 | 103 | - restore_cache: 104 | name: Restore cached venv 105 | keys: 106 | - v1-pypi-py<< parameters.python-version >>-{{ checksum "requirements.txt" }} 107 | - v1-pypi-py<< parameters.python-version >> 108 | 109 | - run: 110 | name: Update & Activate venv 111 | command: | 112 | python -m venv env/ 113 | . env/bin/activate 114 | python -m pip install --upgrade pip 115 | 116 | - save_cache: 117 | name: Save cached venv 118 | paths: 119 | - "env/" 120 | key: v1-pypi-py<< parameters.python-version >>-{{ checksum "requirements.txt" }} 121 | 122 | - run: 123 | name: Install Bittensor Subnet Template 124 | command: | 125 | . env/bin/activate 126 | pip install -e . 
127 | 128 | - store_test_results: 129 | path: test-results 130 | - store_artifacts: 131 | path: test-results 132 | 133 | coveralls: 134 | docker: 135 | - image: cimg/python:3.10 136 | steps: 137 | - run: 138 | name: Combine Coverage 139 | command: | 140 | pip3 install --upgrade coveralls 141 | coveralls --finish --rcfile .coveragerc || echo "Failed to upload coverage" 142 | 143 | workflows: 144 | compatibility_checks: 145 | jobs: 146 | - check_compatibility: 147 | python_version: "3.8" 148 | name: check-compatibility-3.8 149 | - check_compatibility: 150 | python_version: "3.9" 151 | name: check-compatibility-3.9 152 | - check_compatibility: 153 | python_version: "3.10" 154 | name: check-compatibility-3.10 155 | - check_compatibility: 156 | python_version: "3.11" 157 | name: check-compatibility-3.11 158 | 159 | pr-requirements: 160 | jobs: 161 | - black: 162 | python-version: "3.8.12" 163 | - pylint: 164 | python-version: "3.8.12" 165 | - build: 166 | matrix: 167 | parameters: 168 | python-version: ["3.9.13", "3.10.6", "3.11.4"] 169 | -------------------------------------------------------------------------------- /.dependencies_installed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/.dependencies_installed -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | app.config.js 2 | 3 | commands.txt 4 | commands_testnet.txt 5 | check_vpermit.py 6 | setup_runpod.sh 7 | 8 | wandb/ 9 | wandb/* 10 | 11 | models/ 12 | models/* 13 | 14 | nohup.out 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | cover/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | .pybuilder/ 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | # For a library or package, you might want to ignore these files since the code is 102 | # intended to run in multiple environments; otherwise, check them in: 103 | # .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # poetry 113 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 114 | # This is especially recommended for binary packages to ensure reproducibility, and is more 115 | # commonly ignored for libraries. 116 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 117 | #poetry.lock 118 | 119 | # pdm 120 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 121 | #pdm.lock 122 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 123 | # in version control. 124 | # https://pdm.fming.dev/#use-with-ide 125 | .pdm.toml 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | # PyCharm 171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 173 | # and can be added to the global gitignore or merged into this file. For a more nuclear 174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 175 | #.idea/ 176 | 177 | testing/ -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.14 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Opentensor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cc_net/.gitignore: -------------------------------------------------------------------------------- 1 | # Dataset 2 | /data 3 | /test_data/ 4 | /test_data2/ 5 | /output/ 6 | 7 | # Binary files 8 | /bin/ 9 | 10 | # Third party code 11 | /third_party/ 12 | 13 | # Generic to python 14 | __pycache__/ 15 | *.pyc 16 | .mypy_cache/ 17 | 18 | /scratch/ 19 | /notebooks/ 20 | 21 | /build/ 22 | /cc_net.egg-info/ 23 | /config/ 24 | /dist/ 25 | /pip-wheel-metadata/ 26 | 27 | /.DS_Store 28 | -------------------------------------------------------------------------------- /cc_net/cc_net/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | -------------------------------------------------------------------------------- /cc_net/cc_net/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | 8 | import func_argparse 9 | 10 | import cc_net.mine 11 | 12 | 13 | def main(): 14 | func_argparse.parse_and_call(cc_net.mine.get_main_parser()) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() 19 | -------------------------------------------------------------------------------- /cc_net/cc_net/get_wiki_cirrus.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | """ 8 | Creates mono-lingual corpus from Wikipedia. 9 | """ 10 | 11 | import functools 12 | import re 13 | import subprocess 14 | import urllib.request 15 | from pathlib import Path 16 | from typing import Dict 17 | 18 | import func_argparse 19 | from bs4 import BeautifulSoup # type: ignore 20 | 21 | from cc_net import jsonql, text_normalizer 22 | 23 | CIRRUS_URL = "https://dumps.wikimedia.org/other/cirrussearch" 24 | CIRRUS_DUMP_RE = re.compile(r"^(.*)wiki-\d+-cirrussearch-content\.json\.gz") 25 | 26 | 27 | def tmp(file: Path) -> Path: 28 | return file.parent / ("tmp." + file.name) 29 | 30 | 31 | def opening(file: Path, output: Path = None, n_docs: int = 1_000_000): 32 | """Will dump the tokenized opening text of the given Wikipedia. 33 | 34 | Args: 35 | - file: File containing the Wikipedia dump. 36 | - output: Output file. 
37 | - n_docs: How many docs to parse 38 | - tokenize: whether to tokenize the text 39 | - lang: Language code used to chose the tokenizer 40 | """ 41 | assert file.exists() 42 | return jsonql.run_pipes( 43 | functools.partial(extract_opening_text, n_docs=n_docs), 44 | file=file, 45 | output=tmp(output) if output else None, 46 | ) 47 | if output: 48 | tmp(output).replace(output) 49 | 50 | 51 | def extract_opening_text(source, n_docs: int = 10_000): 52 | i = 0 53 | for doc in jsonql.read_jsons(source): 54 | if not doc: 55 | continue 56 | 57 | text = doc.get("opening_text") 58 | if not text: 59 | continue 60 | 61 | yield text_normalizer.normalize(text) 62 | i += 1 63 | if i >= n_docs: 64 | break 65 | 66 | 67 | def dl(lang: str, output_dir: Path, date: str = None): 68 | """Download the cirrus extract for the given lang. 69 | 70 | See https://dumps.wikimedia.org/other/cirrussearch for the full list of files. 71 | 72 | Args: 73 | - lang: The Wikipedia code for the language. 74 | - output_dir: Output directory. File will be `{lang}.json.gz` 75 | - date: Date of a specific Cirrus dump. 76 | """ 77 | 78 | urls = get_cirrus_urls(date) 79 | assert ( 80 | lang in urls 81 | ), f"--lang {lang} not found. Available languages are: {urls.keys()}" 82 | 83 | assert output_dir, "--output_dir folder needed." 84 | output_dir.mkdir(exist_ok=True) 85 | output = output_dir / (lang + ".json.gz") 86 | print(f"Downloading {lang} wiki from {urls[lang]} to {output}") 87 | wget(urls[lang], output) 88 | 89 | 90 | def get_cirrus_urls(date: str = None) -> Dict[str, str]: 91 | if date is None: 92 | cirrus_page = BeautifulSoup( 93 | urllib.request.urlopen(CIRRUS_URL), features="html.parser" 94 | ) 95 | dumps = [a.get("href").strip("/") for a in cirrus_page.findAll("a")] 96 | dumps.remove("..") 97 | dumps.remove("current") 98 | # We take the oldest dump since the most recent might be incomplete. 99 | # The page only link to the N latest dumps so the dump won't be too old. 100 | date = min(dumps) 101 | 102 | cirrus_url = "/".join((CIRRUS_URL, date)) 103 | print("Will use the Wikipedia dump from:", date, cirrus_url) 104 | cirrus_page = BeautifulSoup( 105 | urllib.request.urlopen(cirrus_url), features="html.parser" 106 | ) 107 | urls = {} 108 | for link in cirrus_page.findAll("a"): 109 | match = CIRRUS_DUMP_RE.match(link.get("href")) 110 | if not match: 111 | continue 112 | 113 | urls[match.group(1)] = "/".join([cirrus_url, link.get("href")]) 114 | assert urls, f"No valid download urls found at {cirrus_url}" 115 | return urls 116 | 117 | 118 | def wget(url: str, output: Path): 119 | subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True) 120 | tmp(output).replace(output) 121 | assert ( 122 | output.stat().st_size > 10_000 123 | ), f"File {output} downloaded from {url} looks too small" 124 | 125 | 126 | if __name__ == "__main__": 127 | func_argparse.main(dl, opening) 128 | -------------------------------------------------------------------------------- /cc_net/cc_net/regroup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | 7 | import logging 8 | import subprocess 9 | from pathlib import Path 10 | from typing import List 11 | 12 | import func_argparse 13 | import numpy as np 14 | 15 | from cc_net import jsonql 16 | 17 | 18 | def get_index(file: Path) -> Path: 19 | return file.parent / (file.name + ".index") 20 | 21 | 22 | def _get_tmp(output: Path) -> Path: 23 | return output.parent / (output.stem + ".tmp" + output.suffix) 24 | 25 | 26 | def reshard( 27 | inputs: List[Path], 28 | output: Path, 29 | tmp: Path = None, 30 | free_original: bool = False, 31 | rm_original: bool = False, 32 | ) -> Path: 33 | """Read the given files and concatenate them to the output file. 34 | 35 | Can remove original files on completion, or just write dummy content into them to free disk. 36 | """ 37 | if tmp is None: 38 | tmp = _get_tmp(output) 39 | logging.info(f"Resharding {inputs} to {tmp}, will move later to {output}") 40 | jsonql.run_pipes(file=inputs, output=tmp) 41 | tmp.replace(output) 42 | tmp_index = get_index(tmp) 43 | if tmp_index.exists(): 44 | tmp_index.replace(get_index(output)) 45 | 46 | if not (free_original or rm_original): 47 | return output 48 | 49 | for _input in inputs: 50 | if rm_original: 51 | _input.unlink() 52 | elif free_original: 53 | # Overwrite the previous file. 54 | # This frees up disk space and allows doit to properly track the success. 55 | _input.write_text(f"Resharded into {output}") 56 | if get_index(_input).is_file(): 57 | get_index(_input).unlink() 58 | 59 | return output 60 | 61 | 62 | def fast_reshard( 63 | inputs: List[Path], 64 | output: Path, 65 | tmp: Path = None, 66 | free_original: bool = False, 67 | rm_original: bool = False, 68 | ) -> Path: 69 | """Same as reshard but don't re-compress the output. 70 | 71 | This will lead to a bigger output file, especially if the shards are very small. 72 | """ 73 | if tmp is None: 74 | tmp = _get_tmp(output) 75 | with open(tmp, "wb") as o: 76 | subprocess.run(["cat"] + [str(f) for f in inputs], stdout=o) 77 | 78 | tmp.replace(output) 79 | indexes_files = [get_index(i) for i in inputs] 80 | existing_indexes = sum(i.exists() for i in indexes_files) 81 | assert ( 82 | existing_indexes == len(indexes_files) or existing_indexes == 0 83 | ), "some indexes don't exist." 84 | if existing_indexes > 0: 85 | indexes = [np.load(idx) for idx in indexes_files] 86 | for i in range(len(indexes) - 1): 87 | indexes[i + 1] += indexes[i][-1] 88 | with open(str(output) + ".index", "wb") as o: 89 | np.save(o, np.concatenate(indexes)) 90 | 91 | if not (free_original or rm_original): 92 | return output 93 | 94 | for _input in inputs: 95 | if rm_original: 96 | _input.unlink() 97 | elif free_original: 98 | # Overwrite the previous file. 99 | # This frees up disk space and allows doit to properly track the success. 
100 | _input.write_text(f"Resharded into {output}") 101 | if get_index(_input).is_file(): 102 | get_index(_input).unlink() 103 | 104 | return output 105 | 106 | 107 | def determine_groups( 108 | inputs: List[Path], target_size: int = 4 * 1024 ** 3 109 | ) -> List[List[Path]]: 110 | if len(inputs) == 0: 111 | return [] 112 | 113 | sample = inputs[:10] 114 | typical_size = sum(s.stat().st_size for s in sample) / len(sample) 115 | group_size = min(target_size // typical_size, len(inputs)) 116 | group_size = max(group_size, 1) 117 | 118 | return jsonql.grouper(inputs, group_size) 119 | 120 | 121 | if __name__ == "__main__": 122 | func_argparse.single_main(reshard) 123 | -------------------------------------------------------------------------------- /cc_net/cc_net/split_by_lang.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import argparse 8 | import collections 9 | from pathlib import Path 10 | from typing import Dict, Optional 11 | 12 | import fasttext # type: ignore 13 | 14 | from cc_net import jsonql 15 | 16 | 17 | def get_args(): 18 | parser = argparse.ArgumentParser( 19 | description="Read a list of json files and split them ", 20 | parents=[jsonql.io_parser()], 21 | ) 22 | parser.add_argument("--pattern", type=str) 23 | parser.add_argument("--field", type=str, default="raw_content") 24 | parser.add_argument("--threshold", type=float, default=0) 25 | parser.add_argument("--model", type=str, required=True) 26 | parser.add_argument("--out_field", type=str, default="language") 27 | parser.add_argument("--top", type=int, default=1) 28 | return vars(parser.parse_args()) 29 | 30 | 31 | def predict(model, text: str, k: int = 1): 32 | labels, scores = model.predict(text, k=k) 33 | labels = [l.replace("__label__", "") for l in labels] 34 | return labels, scores 35 | 36 | 37 | def avg_predict(model, text): 38 | # Overall gives the same results than predict(model, text.replace("\n", "")) 39 | text = text.split("\n") 40 | text_len = sum(len(line) for line in text) 41 | if text_len == 0: 42 | return None, 0 43 | scores = [predict(model, line) for line in text] 44 | scores_by_label: Dict[str, float] = collections.defaultdict(float) 45 | for (label, score), line in zip(scores, text): 46 | scores_by_label[label] += score * len(line) 47 | 48 | label, score = max(scores_by_label.items(), key=lambda kv: kv[1]) 49 | return label, score / text_len 50 | 51 | 52 | class Classifier(jsonql.Transformer): 53 | def __init__( 54 | self, 55 | model: Path, 56 | field: str, 57 | out_field: str, 58 | threshold: float = 0, 59 | top: int = 1, 60 | language: str = None, 61 | rounding: int = 2, 62 | ): 63 | super().__init__() 64 | self.model = model 65 | assert model.exists(), f"Model {model} doesn't exist." 
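        # Note: only lightweight configuration is stored in this constructor; the
        # fastText model itself is loaded lazily in _prepare() (see fasttext_model below).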
66 | self.field = field 67 | self.out_field = out_field 68 | self.threshold = threshold 69 | self.top = top 70 | self.language = language 71 | self.rounding = rounding 72 | # Fasttext model is a C object and can't be pickled 73 | self.fasttext_model: fasttext._FastText = None 74 | self.n_doc, self.n_accepted, self.n_ignored, self.n_disagreement = 0, 0, 0, 0 75 | self.cnt: Dict[str, int] = {} 76 | 77 | def _prepare(self): 78 | self.log(f"Loading {self.model}") 79 | self.fasttext_model = fasttext.load_model(str(self.model)) 80 | 81 | def predict(self, text): 82 | return predict(self.fasttext_model, text.replace("\n", ""), k=self.top) 83 | 84 | def do(self, doc: dict) -> Optional[dict]: 85 | text = doc.get(self.field, None) 86 | if not text: 87 | return None 88 | 89 | if self.language and doc.get("language") != self.language: 90 | self.n_ignored += 1 91 | return doc 92 | 93 | self.n_doc += 1 94 | labels, scores = self.predict(text) 95 | scores.round(self.rounding, out=scores) 96 | for l in labels: 97 | self.cnt[l] = self.cnt.get(l, 0) + 1 98 | 99 | if self.top == 1: 100 | existing_label = doc.get(self.out_field, None) 101 | if existing_label and labels[0] != existing_label: 102 | self.n_disagreement += 1 103 | 104 | if all(s < self.threshold for s in scores): 105 | return None 106 | 107 | self.n_accepted += 1 108 | if self.top == 1: 109 | doc[self.out_field] = labels[0] 110 | doc[self.out_field + "_score"] = scores[0] 111 | else: 112 | doc[self.out_field] = {l: s for l, s in zip(labels, scores)} 113 | return doc 114 | 115 | def summary(self): 116 | n_doc, n_accepted, n_disagreement, cnt, out_field = ( 117 | self.n_doc, 118 | self.n_accepted, 119 | self.n_disagreement, 120 | self.cnt, 121 | self.out_field, 122 | ) 123 | summ = super().summary() 124 | if self.threshold > 0: 125 | ratio = n_accepted / n_doc if n_doc else 0 126 | summ.append(f"Kept {n_accepted} docs over {n_doc} ({ratio :.1%})") 127 | summ.append(f"Found {len(cnt)} {out_field} labels: {cnt}") 128 | 129 | disagreement = n_disagreement / n_doc if n_doc else 0 130 | if disagreement: 131 | summ.append(f"{out_field} disagreement is at {disagreement:.1%}.") 132 | return summ 133 | 134 | def __repr__(self): 135 | return f"Classifier({self.model})" 136 | 137 | 138 | def classify_and_split(file, output, pattern, **kwargs): 139 | classifier = Classifier(**kwargs) 140 | splitter = jsonql.split(pattern) 141 | jsonql.run_pipes(classifier, splitter, file=file, output=output) 142 | 143 | 144 | if __name__ == "__main__": 145 | args = get_args() 146 | pattern = args.get("pattern") 147 | if pattern: 148 | classify_and_split(**args) 149 | else: 150 | args.pop("pattern") 151 | jsonql.run_pipe(Classifier, args) 152 | -------------------------------------------------------------------------------- /cc_net/cc_net/stream_cc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from typing import Iterable, Optional, List, Dict, Any 4 | 5 | from cc_net import jsonql, process_wet_file, split_by_lang, perplexity, minify 6 | 7 | FILE_DIR = Path(__file__).parent 8 | CUTOFF_CSV = FILE_DIR / "data" / "cutoff.csv" 9 | 10 | class StreamMinifier(minify.Minifier): 11 | def __init__(self, remove: Optional[List[str]] = None, keep: Optional[List[str]] = None): 12 | super().__init__() 13 | self.remove = remove or [] 14 | self.keep = keep or [] 15 | 16 | def do(self, doc: Dict[str, Any]) -> Dict[str, Any]: 17 | # Remove fields that are not needed 18 | for f in self.remove: 19 | 
doc.pop(f, None) 20 | 21 | # Keep only the specified fields 22 | if self.keep: 23 | doc = {k: v for k, v in doc.items() if k in self.keep} 24 | 25 | return doc 26 | 27 | def stream_cc_segment( 28 | segment_url: str, 29 | output_dir: Path, 30 | lang_model: Path, 31 | lm_dir: Path, 32 | lang_whitelist: Optional[List[str]] = None, 33 | lang_threshold: float = 0.5, 34 | min_len: int = 300, 35 | ): 36 | # Set up the pipeline steps 37 | steps = [] 38 | 39 | # Language identification 40 | steps.append(split_by_lang.Classifier( 41 | model=lang_model, 42 | field="raw_content", 43 | out_field="language", 44 | top=1, 45 | threshold=lang_threshold, 46 | )) 47 | 48 | # Language filtering 49 | if lang_whitelist: 50 | steps.append(jsonql.where( 51 | [lambda doc: doc.get("language") in set(lang_whitelist)] 52 | )) 53 | 54 | # SentencePiece tokenization 55 | steps.append(perplexity.MultiSentencePiece( 56 | {l: lm_dir / f"{l}.sp.model" for l in (lang_whitelist or ["en", "fr", "de"])}, 57 | field="raw_content", 58 | output_field="tokenized", 59 | normalize=True, 60 | )) 61 | 62 | # Language model scoring 63 | steps.append(perplexity.DocLM( 64 | {l: lm_dir / f"{l}.arpa.bin" for l in (lang_whitelist or ["en", "fr", "de"])}, 65 | field="tokenized", 66 | output_field="perplexity", 67 | normalize=False, 68 | )) 69 | 70 | # Perplexity bucketing 71 | steps.append(perplexity.PerplexityBucket(CUTOFF_CSV)) 72 | 73 | # Minification (remove unnecessary fields) 74 | steps.append(StreamMinifier(remove=["tokenized"], keep=["url", "raw_content", "language", "perplexity", "bucket"])) 75 | 76 | # Set up the CC segment reader 77 | cc_reader = process_wet_file.CCSegmentsReader( 78 | [segment_url], 79 | min_len=min_len, 80 | ) 81 | 82 | # Set up the output 83 | output_pattern = str(output_dir / "{language}_{bucket}.json.gz") 84 | steps.append(jsonql.split(pattern=output_pattern, mkdir=True)) 85 | 86 | # Run the pipeline 87 | jsonql.run_pipes( 88 | *steps, 89 | inputs=cc_reader, 90 | processes=1, # Increase this if you want to use multiple processes 91 | chunksize=100, 92 | ) 93 | 94 | def main(): 95 | parser = argparse.ArgumentParser(description="Stream and process a CC segment") 96 | parser.add_argument("segment_url", type=str, help="URL of the CC segment to process") 97 | parser.add_argument("output_dir", type=Path, help="Directory to save processed files") 98 | parser.add_argument("--lang_model", type=Path, default=Path("bin/lid.bin"), help="Path to language identification model") 99 | parser.add_argument("--lm_dir", type=Path, required=True, help="Directory containing language models") 100 | parser.add_argument("--lang_whitelist", type=str, nargs="+", help="List of languages to process") 101 | parser.add_argument("--lang_threshold", type=float, default=0.5, help="Language identification threshold") 102 | parser.add_argument("--min_len", type=int, default=300, help="Minimum document length") 103 | 104 | args = parser.parse_args() 105 | 106 | stream_cc_segment( 107 | args.segment_url, 108 | args.output_dir, 109 | args.lang_model, 110 | args.lm_dir, 111 | args.lang_whitelist, 112 | args.lang_threshold, 113 | args.min_len, 114 | ) 115 | 116 | if __name__ == "__main__": 117 | main() -------------------------------------------------------------------------------- /cc_net/cc_net/text_normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | import re 8 | import unicodedata 9 | 10 | UNICODE_PUNCT = { 11 | ",": ",", 12 | "。": ".", 13 | "、": ",", 14 | "„": '"', 15 | "”": '"', 16 | "“": '"', 17 | "«": '"', 18 | "»": '"', 19 | "1": '"', 20 | "」": '"', 21 | "「": '"', 22 | "《": '"', 23 | "》": '"', 24 | "´": "'", 25 | "∶": ":", 26 | ":": ":", 27 | "?": "?", 28 | "!": "!", 29 | "(": "(", 30 | ")": ")", 31 | ";": ";", 32 | "–": "-", 33 | "—": " - ", 34 | ".": ". ", 35 | "~": "~", 36 | "’": "'", 37 | "…": "...", 38 | "━": "-", 39 | "〈": "<", 40 | "〉": ">", 41 | "【": "[", 42 | "】": "]", 43 | "%": "%", 44 | "►": "-", 45 | } 46 | 47 | UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]") 48 | 49 | 50 | def replace_unicode_punct(text: str) -> str: 51 | return "".join((UNICODE_PUNCT.get(c, c) for c in text)) 52 | 53 | 54 | def remove_unicode_punct(text: str) -> str: 55 | """More aggressive version of replace_unicode_punct but also faster.""" 56 | return UNICODE_PUNCT_RE.sub("", text) 57 | 58 | 59 | def strip_accents(line: str) -> str: 60 | """Strips accents from a piece of text.""" 61 | nfd = unicodedata.normalize("NFD", line) 62 | output = [c for c in nfd if unicodedata.category(c) != "Mn"] 63 | if len(output) == line: 64 | return line 65 | return "".join(output) 66 | 67 | 68 | # Build a regex matching all control characters. 69 | NON_PRINTING_CHARS_RE = re.compile( 70 | f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" 71 | ) 72 | DIGIT_RE = re.compile(r"\d") 73 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile( 74 | (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", "") 75 | ) 76 | 77 | 78 | def remove_non_printing_char(text: str) -> str: 79 | return NON_PRINTING_CHARS_RE.sub("", text) 80 | 81 | 82 | def normalize_spacing_for_tok(text: str, language: str = "en") -> str: 83 | res = ( 84 | text.replace("\r", "") 85 | # remove extra spaces 86 | .replace("(", " (") 87 | .replace(")", ") ") 88 | .replace(" +", " ") 89 | ) 90 | res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res) 91 | res = res.replace("( ", "(").replace(" )", ")") 92 | res = re.sub(r"(\d) \%", r"\1\%", res) 93 | res = res.replace(" :", ":").replace(" ;", ";") 94 | res = res.replace("`", "'").replace("''", ' " ') 95 | 96 | res = ( 97 | res.replace("„", '"') 98 | .replace("“", '"') 99 | .replace("”", '"') 100 | .replace("–", "-") 101 | .replace("—", " - ") 102 | .replace(" +", " ") 103 | .replace("´", "'") 104 | .replace("([a-z])‘([a-z])", r"\1'\2/") 105 | .replace("([a-z])’([a-z])", r"\1'\2/") 106 | .replace("‘", '"') 107 | .replace("‚", '"') 108 | .replace("’", '"') 109 | .replace("''", '"') 110 | .replace("´´", '"') 111 | .replace("…", "...") 112 | # French quotes 113 | .replace(" « ", ' "') 114 | .replace("« ", '"') 115 | .replace("«", '"') 116 | .replace(" » ", '" ') 117 | .replace(" »", '"') 118 | .replace("»", '"') 119 | # handle pseudo-spaces 120 | .replace(" %", "%") 121 | .replace("nº ", "nº ") 122 | .replace(" :", ":") 123 | .replace(" ºC", " ºC") 124 | .replace(" cm", " cm") 125 | .replace(" ?", "?") 126 | .replace(" !", "!") 127 | .replace(" ;", ";") 128 | .replace(", ", ", ") 129 | .replace(" +", " ") 130 | .replace(".", ". 
") 131 | ) 132 | # English "quotation," followed by comma, style 133 | if language == "en": 134 | res = re.sub(r"\"([,\.]+)", r"\1\"", res) 135 | # Czech is confused 136 | elif language == "cs" or language == "cz": 137 | pass 138 | # German/Spanish/French "quotation", followed by comma, style 139 | else: 140 | res = res.replace(',"', '",') 141 | res = re.sub( 142 | r"(\.+)\"(\s*[^<])", r"\"\1\2", res 143 | ) # don't fix period at end of sentence 144 | 145 | if ( 146 | language == "de" 147 | or language == "es" 148 | or language == "cz" 149 | or language == "cs" 150 | or language == "fr" 151 | ): 152 | res = re.sub(r"(\d) (\d)", r"\1,\2", res) 153 | else: 154 | res = re.sub(r"(\d) (\d)", r"\1.\2", res) 155 | return res 156 | 157 | 158 | def normalize(line: str, accent=True, case=True, numbers=True, punct=1) -> str: 159 | line = line.strip() 160 | if not line: 161 | return line 162 | if case: 163 | line = line.lower() 164 | if accent: 165 | line = strip_accents(line) 166 | if numbers: 167 | line = DIGIT_RE.sub("0", line) 168 | if punct == 1: 169 | line = replace_unicode_punct(line) 170 | elif punct == 2: 171 | line = remove_unicode_punct(line) 172 | line = remove_non_printing_char(line) 173 | return line 174 | 175 | 176 | def slow_normalize_for_dedup(line: str) -> str: 177 | return normalize(line, accent=False, case=True, numbers=True, punct=2) 178 | 179 | 180 | def normalize_for_dedup(line: str) -> str: 181 | line = line.strip() 182 | if not line: 183 | return line 184 | # case 185 | line = line.lower() 186 | # numbers 187 | line = DIGIT_RE.sub("0", line) 188 | line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) 189 | return line 190 | -------------------------------------------------------------------------------- /cc_net/cc_net/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | 7 | import time 8 | from typing import Dict, Optional 9 | 10 | import sacremoses # type: ignore 11 | 12 | from cc_net import jsonql, text_normalizer 13 | 14 | 15 | class RobustTokenizer(jsonql.Transformer): 16 | """Moses tokenizer with the expected preprocessing.""" 17 | 18 | LANG_WITHOUT_ACCENT = {"en", "my"} 19 | 20 | def __init__(self, lang: str): 21 | super().__init__() 22 | self.lang = lang 23 | self.moses = sacremoses.MosesTokenizer(lang) 24 | self.rm_accent = lang in self.LANG_WITHOUT_ACCENT 25 | self.ready = True 26 | 27 | def do(self, text: str): 28 | text = text_normalizer.normalize( 29 | text, accent=self.rm_accent, case=False, numbers=False, punct=True 30 | ) 31 | text = text_normalizer.normalize_spacing_for_tok(text, language=self.lang) 32 | return self.moses.tokenize(text, return_str=True, escape=False) 33 | 34 | 35 | class DocTokenizer(jsonql.Transformer): 36 | """Tokenize the text found in `output_field and store the result in `output_field`.""" 37 | 38 | def __init__( 39 | self, 40 | field: str, 41 | output_field: str = "tokenized", 42 | language_field: str = "language", 43 | ): 44 | super().__init__() 45 | self.field = field 46 | self.output_field = output_field 47 | self.language_field = language_field 48 | self.n_docs = 0 49 | self.tokenizers: Dict[str, RobustTokenizer] = {} 50 | 51 | def get_tokenizer(self, lang: str) -> Optional[RobustTokenizer]: 52 | cache = self.tokenizers 53 | if lang in cache: 54 | return cache[lang] 55 | if lang in ("th", "zh", "ja"): 56 | # TODO find a tokenizer for those languages 57 | return None 58 | 59 | cache[lang] = RobustTokenizer(lang) 60 | return cache[lang] 61 | 62 | def do(self, document): 63 | lang = document[self.language_field] 64 | tok = self.get_tokenizer(lang) 65 | if not tok: 66 | return document 67 | 68 | self.n_docs += 1 69 | lines = document[self.field].split("\n") 70 | tokenized = "\n".join(tok(l) for l in lines) 71 | document[self.output_field] = tokenized 72 | return document 73 | 74 | def summary(self): 75 | delay = (time.time() - self.start_time) / 3600 76 | speed = self.n_docs / delay 77 | return [ 78 | f"Tokenized {self.n_docs:_} documents in {delay:.2}h ({speed:.1} doc/s)." 79 | ] 80 | -------------------------------------------------------------------------------- /cc_net/cc_net/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/cc_net/cc_net/tools/__init__.py -------------------------------------------------------------------------------- /cc_net/cc_net/tools/dl_cc_100.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | import contextlib 8 | import functools 9 | import gzip 10 | import logging 11 | import multiprocessing 12 | from collections import defaultdict 13 | from pathlib import Path 14 | from typing import Callable, Dict, Iterator, List, NamedTuple, Optional, Tuple 15 | 16 | import cc_net 17 | from cc_net import jsonql 18 | from cc_net.process_wet_file import CCSegmentsReader 19 | 20 | # Set this to a directory to use as cache for intermediary files. 21 | # This helps for debugging. 
22 | WET_CACHE = None 23 | # WET_CACHE = Path("wet_cache") 24 | 25 | S3_BUCKET = "https://dl.fbaipublicfiles.com/cc100" 26 | VERSION = "1.0.0" 27 | 28 | CC_100_SNAPSHOTS = [ 29 | "2018-05", 30 | "2018-09", 31 | "2018-13", 32 | "2018-17", 33 | "2018-22", 34 | "2018-26", 35 | "2018-30", 36 | "2018-34", 37 | "2018-39", 38 | "2018-43", 39 | "2018-47", 40 | "2018-51", 41 | ] 42 | 43 | BIG_LANGUAGES = { 44 | "es_XX", 45 | "fr_XX", 46 | "de_DE", 47 | "ja_XX", 48 | "ru_RU", 49 | "zh_CN", 50 | "en_XX", 51 | "it_IT", 52 | "ar_AR", 53 | "nl_XX", 54 | "pl_PL", 55 | "pt_XX", 56 | "tr_TR", 57 | "zh_TW", 58 | } 59 | 60 | 61 | class Paragraph(NamedTuple): 62 | lang: str 63 | text: str 64 | lm_score: float 65 | 66 | 67 | def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]: 68 | """ 69 | Download metadata from a shards. 70 | 71 | Sample metadata: 72 | 73 | { 74 | "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz", 75 | "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ", 76 | "url": "http://personals.gearplay.com/ads/DRJONES.htm", 77 | "line_ids": [10], 78 | "languages": ["en_XX"], 79 | "lm_scores": [-2.658], 80 | } 81 | """ 82 | snapshot = snapshot.replace("-", "_") 83 | name = f"snap_{snapshot}_batch_{shard}.json.gz" 84 | url = "/".join([S3_BUCKET, VERSION, name]) 85 | shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict) 86 | try: 87 | cache_file: Optional[Path] = None 88 | if WET_CACHE is not None: 89 | cache_file = WET_CACHE / name 90 | metadata_file = jsonql.open_remote_file(url, cache_file) 91 | except: 92 | logging.warning(f"Couldn't open {url}") 93 | return 94 | 95 | for meta in jsonql.read_jsons(metadata_file): 96 | shard_metadata[meta["cc_segment"]][meta["digest"]] = meta 97 | 98 | found_pars, missed_pars = 0, 0 99 | for seg, segment_metadata in shard_metadata.items(): 100 | for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE): 101 | if doc["digest"] not in segment_metadata: 102 | continue 103 | 104 | meta = segment_metadata[doc["digest"]] 105 | full_pars = [doc["title"]] + doc["raw_content"].split("\n") 106 | 107 | assert len(meta["line_ids"]) == len(meta["languages"]) 108 | assert len(meta["line_ids"]) == len(meta["lm_scores"]) 109 | for i, lang, score in zip( 110 | meta["line_ids"], meta["languages"], meta["lm_scores"] 111 | ): 112 | if snapshot != "2018-51" and lang in BIG_LANGUAGES: 113 | # Big languages only come from "2018-51" snapshot 114 | continue 115 | if i >= len(full_pars): 116 | # This is because CC100 was created by saving only urls. 117 | # Some urls appears in different snapshot with slightly different 118 | # versions, but we don't know which one is correct. 119 | # Here we read both versions, but some index may end up 120 | # being incorrect. 121 | # This impact ~3% documents. 122 | missed_pars += 1 123 | continue 124 | 125 | yield Paragraph(lang, full_pars[i], score) 126 | found_pars += 1 127 | if missed_pars > 0: 128 | logging.warning( 129 | f"Missed {missed_pars} ({missed_pars / found_pars:%}) paragraphes." 130 | ) 131 | 132 | 133 | def _split_by_par( 134 | paragraphes: Iterator[Paragraph], snapshot: str, shard: int, outdir: Path 135 | ) -> int: 136 | outdir.mkdir(exist_ok=True) 137 | outfiles = {} 138 | num_pars = 0 139 | try: 140 | for par in paragraphes: 141 | # MODIFY ME: filter paragraph if needed (languages, score, ...) 
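            # Example filter (hypothetical, not part of the original script) —
            # e.g. keep only English paragraphs:
            #   if par.lang != "en_XX":
            #       continue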
142 | if par.lang not in outfiles: 143 | (outdir / par.lang).mkdir(exist_ok=True) 144 | outfile = outdir / par.lang / f"snap_{snapshot}_batch_{shard}.gz" 145 | outfiles[par.lang] = gzip.open(outfile, "wt") 146 | 147 | print(par.text, file=outfiles[par.lang]) 148 | num_pars += 1 149 | finally: 150 | for o in outfiles.values(): 151 | o.close() 152 | 153 | logging.info(f"Extracted {num_pars:_d} paragraphs from shard {snapshot}_{shard}") 154 | return num_pars 155 | 156 | 157 | def dl_shard(snapshot: str, shard: int, outdir: Path) -> int: 158 | return _split_by_par(_dl_shard(snapshot, shard), snapshot, shard, outdir) 159 | 160 | 161 | @contextlib.contextmanager 162 | def unordered_map(processes: int): 163 | if processes == 0: 164 | yield map 165 | return 166 | 167 | with multiprocessing.Pool(processes) as pool: 168 | yield pool.imap_unordered 169 | 170 | 171 | def dl_snapshot(snapshot: str, outdir: Path, processes: int = 1) -> None: 172 | _dl_shard = functools.partial(dl_shard, snapshot, outdir=outdir) 173 | 174 | with unordered_map(processes) as umap: 175 | num_pars = sum(umap(_dl_shard, range(500))) 176 | 177 | logging.info(f"Extracted {num_pars:_d} paragraphs from snapshot {snapshot}.") 178 | 179 | 180 | def dl( 181 | snapshot: str = None, outdir: Path = Path("data_cc100"), processes: int = 1 182 | ) -> None: 183 | """ 184 | Download CC100 corpus. 185 | Will create one text file per language and CC snapshot. 186 | 187 | - snapshot: restrict to one snapshot. Useful for parallelization. 188 | - outdir: output directory 189 | - processes: number of processes to use 190 | """ 191 | if snapshot is None: 192 | snapshots = CC_100_SNAPSHOTS 193 | else: 194 | snapshots = snapshot.split(",") 195 | 196 | invalids = [s for s in snapshots if s not in CC_100_SNAPSHOTS] 197 | assert not invalids, f"Invalid snapshots {invalids}, chose from {CC_100_SNAPSHOTS}" 198 | 199 | for snapshot in snapshots: 200 | dl_snapshot(snapshot, outdir, processes) 201 | 202 | 203 | if __name__ == "__main__": 204 | import func_argparse 205 | 206 | func_argparse.single_main(dl) 207 | -------------------------------------------------------------------------------- /cc_net/cc_net/tools/make_dmoz_corpus.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | """ 8 | This code is used to train a fastText classifier to label document with DMOZ categories. 9 | 10 | The data, distributed under the cc-by 3.0 license 11 | (https://web.archive.org/web/20140605215533/http://www.dmoz.org/license.html), 12 | can be downloaded from 13 | https://web.archive.org/web/20140617145301/http://rdf.dmoz.org/rdf/content.rdf.u8.gz. 
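
Example invocation (hypothetical paths; `make_corpus` is the function exposed via func_argparse):

    python -m cc_net.tools.make_dmoz_corpus --file shard.json.gz --tags_file content.rdf.u8.gz --output dmoz.train.txt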
14 | """ 15 | 16 | import urllib.request 17 | from io import StringIO 18 | from pathlib import Path 19 | from typing import Dict, Set 20 | from urllib.parse import urlparse 21 | 22 | import func_argparse 23 | from lxml import etree # type: ignore 24 | 25 | from cc_net import jsonql 26 | 27 | TaggedUrls = Dict[str, Set[str]] 28 | DMOZ_TAGS_URL = "https://web.archive.org/web/20140617145301/http://rdf.dmoz.org/rdf/content.rdf.u8.gz" 29 | 30 | 31 | def add_tags(url: str, tags: Set[str], url2tags: TaggedUrls): 32 | if url in url2tags: 33 | url2tags[url] &= tags 34 | else: 35 | url2tags[url] = tags 36 | 37 | 38 | def load_tags(filename: Path = None) -> TaggedUrls: 39 | if filename is None: 40 | with StringIO("".join(jsonql.open_remote_file(DMOZ_TAGS_URL))) as dmoz: 41 | tree = etree.parse(dmoz) 42 | else: 43 | tree = etree.parse(str(filename)) 44 | 45 | root = tree.getroot() 46 | url2tags: Dict[str, Set[str]] = {} 47 | for external_page in root.iterfind("{http://dmoz.org/rdf/}ExternalPage"): 48 | url = external_page.get("about") 49 | domain = urlparse(url).netloc 50 | for topic in external_page.iterfind("{http://dmoz.org/rdf/}topic"): 51 | # print(url, topic.text) 52 | # Tags looks like Top/Arts/Animation/Anime/Collectibles 53 | tags = set(topic.text.split("/")[1:]) 54 | add_tags(url, tags, url2tags) 55 | add_tags(domain, tags, url2tags) 56 | return url2tags 57 | 58 | 59 | def dl(output: Path) -> None: 60 | urllib.request.urlretrieve(DMOZ_TAGS_URL, output) 61 | 62 | 63 | def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None: 64 | """ 65 | Loads a tags file and create a training dataset using the given webpages. 66 | 67 | Arguments: 68 | - file: CC shard file 69 | - tags_file: dmoz tagging file, (like the one produced by `dl`) 70 | - output: "" 71 | """ 72 | url2tags = load_tags(tags_file) 73 | with jsonql.open_write(output) as o: 74 | for document in jsonql.read_jsons(file): 75 | if not document: 76 | continue 77 | url = document["url"] 78 | domain = document["source_domain"] 79 | 80 | if url in url2tags: 81 | tags = url2tags[url] 82 | elif domain in url2tags: 83 | tags = url2tags[domain] 84 | else: 85 | continue 86 | 87 | if len(tags) == 0: 88 | continue 89 | 90 | fasttext_tags = ["__label__" + tag for tag in tags] 91 | content = document["tokenized"].replace("\n", " ").lower() 92 | if len(content) > 200: 93 | print(" ".join(fasttext_tags), content, file=o) # type: ignore 94 | 95 | 96 | if __name__ == "__main__": 97 | func_argparse.single_main(make_corpus) 98 | -------------------------------------------------------------------------------- /cc_net/pyproject.toml: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = "tests" 3 | 4 | [tool.black] 5 | line-length = 88 6 | target_version = ["py37"] 7 | 8 | [tool.isort] 9 | multi_line_output = 3 10 | include_trailing_comma = true 11 | force_grid_wrap = 0 12 | use_parentheses = true 13 | line_length = 88 14 | known_third_party = ["func_argparse"] 15 | skip = ["third_party", "data"] 16 | 17 | [mypy] 18 | python_version = 3.7 19 | check_untyped_defs = true 20 | 21 | [mypy-numpy] 22 | ignore_missing_imports = true 23 | [mypy-pytest] 24 | ignore_missing_imports = true 25 | -------------------------------------------------------------------------------- /cc_net/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from pathlib import Path 7 | 8 | from setuptools import setup # type: ignore 9 | 10 | setup( 11 | name="cc_net", 12 | version="1.0.0", 13 | packages=["cc_net"], 14 | # metadata to display on PyPI 15 | author="Guillaume Wenzek", 16 | author_email="guw@fb.com", 17 | description="Tools to download and clean Common Crawl", 18 | keywords="common crawl dataset", 19 | url="https://github.com/facebookresearch/cc_net", 20 | license="CC-BY-NC-4.0", 21 | long_description=Path("README.md").read_text(), 22 | long_description_content_type="text/markdown", 23 | project_urls={ 24 | "Bug Tracker": "https://github.com/facebookresearch/cc_net/issues", 25 | "Source Code": "https://github.com/facebookresearch/cc_net", 26 | }, 27 | classifiers=[ 28 | "Development Status :: 4 - Beta", 29 | "Programming Language :: Python :: 3.7", 30 | ], 31 | python_requires=">=3.7", 32 | install_requires=[ 33 | "beautifulsoup4>=4.7.1", 34 | "pandas>=0.23.4", 35 | "requests>=2.22.0", 36 | "fasttext>=0.9.1", 37 | "sentencepiece>=0.1.82", 38 | "kenlm @ git+https://github.com/kpu/kenlm.git@master", 39 | "func_argparse>=1.1.1", 40 | "psutil>=5.6.3", 41 | "sacremoses", 42 | "submitit>=1.0.0", 43 | "typing_extensions", 44 | ], 45 | extras_require={ 46 | "dev": ["mypy==0.790", "pytest", "black==19.3b0", "isort==5.6.4"], 47 | # To use scripts inside cc_net/tools 48 | "tools": ["lxml", "sentence_splitter"], 49 | # Memory-efficient hashset. 50 | # This fork only compiles the kind of dict used by cc_net. 51 | # Full version is at https://github.com/atom-moyer/getpy 52 | "getpy": ["getpy @ git+https://github.com/gwenzek/getpy.git@v0.9.10-subset"], 53 | }, 54 | package_data={"cc_net": ["data/*"]}, 55 | ) 56 | -------------------------------------------------------------------------------- /contrib/CODE_REVIEW_DOCS.md: -------------------------------------------------------------------------------- 1 | # Code Review 2 | ### Conceptual Review 3 | 4 | A review can be a conceptual review, where the reviewer leaves a comment 5 | * `Concept (N)ACK`, meaning "I do (not) agree with the general goal of this pull 6 | request", 7 | * `Approach (N)ACK`, meaning `Concept ACK`, but "I do (not) agree with the 8 | approach of this change". 9 | 10 | A `NACK` needs to include a rationale why the change is not worthwhile. 11 | NACKs without accompanying reasoning may be disregarded. 12 | After conceptual agreement on the change, code review can be provided. A review 13 | begins with `ACK BRANCH_COMMIT`, where `BRANCH_COMMIT` is the top of the PR 14 | branch, followed by a description of how the reviewer did the review. The 15 | following language is used within pull request comments: 16 | 17 | - "I have tested the code", involving change-specific manual testing in 18 | addition to running the unit, functional, or fuzz tests, and in case it is 19 | not obvious how the manual testing was done, it should be described; 20 | - "I have not tested the code, but I have reviewed it and it looks 21 | OK, I agree it can be merged"; 22 | - A "nit" refers to a trivial, often non-blocking issue. 23 | 24 | ### Code Review 25 | Project maintainers reserve the right to weigh the opinions of peer reviewers 26 | using common sense judgement and may also weigh based on merit. 
Reviewers that 27 | have demonstrated a deeper commitment and understanding of the project over time 28 | or who have clear domain expertise may naturally have more weight, as one would 29 | expect in all walks of life. 30 | 31 | Where a patch set affects consensus-critical code, the bar will be much 32 | higher in terms of discussion and peer review requirements, keeping in mind that 33 | mistakes could be very costly to the wider community. This includes refactoring 34 | of consensus-critical code. 35 | 36 | Where a patch set proposes to change the Bittensor consensus, it must have been 37 | discussed extensively on the discord server and other channels, be accompanied by a widely 38 | discussed BIP and have a generally widely perceived technical consensus of being 39 | a worthwhile change based on the judgement of the maintainers. 40 | 41 | ### Finding Reviewers 42 | 43 | As most reviewers are themselves developers with their own projects, the review 44 | process can be quite lengthy, and some amount of patience is required. If you find 45 | that you've been waiting for a pull request to be given attention for several 46 | months, there may be a number of reasons for this, some of which you can do something 47 | about: 48 | 49 | - It may be because of a feature freeze due to an upcoming release. During this time, 50 | only bug fixes are taken into consideration. If your pull request is a new feature, 51 | it will not be prioritized until after the release. Wait for the release. 52 | - It may be because the changes you are suggesting do not appeal to people. Rather than 53 | nits and critique, which require effort and means they care enough to spend time on your 54 | contribution, thundering silence is a good sign of widespread (mild) dislike of a given change 55 | (because people don't assume *others* won't actually like the proposal). Don't take 56 | that personally, though! Instead, take another critical look at what you are suggesting 57 | and see if it: changes too much, is too broad, doesn't adhere to the 58 | [developer notes](DEVELOPMENT_WORKFLOW.md), is dangerous or insecure, is messily written, etc. 59 | Identify and address any of the issues you find. Then ask e.g. on IRC if someone could give 60 | their opinion on the concept itself. 61 | - It may be because your code is too complex for all but a few people, and those people 62 | may not have realized your pull request even exists. A great way to find people who 63 | are qualified and care about the code you are touching is the 64 | [Git Blame feature](https://docs.github.com/en/github/managing-files-in-a-repository/managing-files-on-github/tracking-changes-in-a-file). Simply 65 | look up who last modified the code you are changing and see if you can find 66 | them and give them a nudge. Don't be incessant about the nudging, though. 67 | - Finally, if all else fails, ask on IRC or elsewhere for someone to give your pull request 68 | a look. If you think you've been waiting for an unreasonably long time (say, 69 | more than a month) for no particular reason (a few lines changed, etc.), 70 | this is totally fine. Try to return the favor when someone else is asking 71 | for feedback on their code, and the universe balances out. 72 | - Remember that the best thing you can do while waiting is give review to others! 
-------------------------------------------------------------------------------- /core: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/core -------------------------------------------------------------------------------- /detection/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | # Define the version of the template module. 19 | __version__ = "3.11.2" 20 | __least_acceptable_version__ = "3.8.0" 21 | version_split = __version__.split(".") 22 | __spec_version__ = ( 23 | (1000 * int(version_split[0])) 24 | + (10 * int(version_split[1])) 25 | + (1 * int(version_split[2])) 26 | ) 27 | version_url = "https://raw.githubusercontent.com/it-s-ai/llm-detection/main/detection/__init__.py" 28 | 29 | # Import all submodules. 30 | from . import protocol 31 | from . import base 32 | from . 
import validator 33 | 34 | WANDB_PROJECT = "subnet32" 35 | WANDB_ENTITY = "itsai-dev" 36 | MAX_RUN_STEPS_PER_WANDB_RUN = 1 37 | -------------------------------------------------------------------------------- /detection/attacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/detection/attacks/__init__.py -------------------------------------------------------------------------------- /detection/attacks/data_augmentation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy as np 3 | 4 | from detection.attacks.delete import DeleteAttack 5 | from detection.attacks.spelling import SpellingAttack 6 | from detection.attacks.synonym import SynonymAttack 7 | from detection.attacks.zero_width_space import ZeroWidthSpaceAttack 8 | 9 | nltk.download('punkt') 10 | nltk.download('stopwords') 11 | nltk.download('averaged_perceptron_tagger') 12 | 13 | 14 | class DataAugmentator: 15 | def __init__(self, device=0): 16 | self.attacks = [{'attacker': SynonymAttack(device=device), 'p': 0.05, 'pass_labels': True}, 17 | {'attacker': ZeroWidthSpaceAttack(), 'p': 0.05}, 18 | {'attacker': SpellingAttack(), 'p': 0.4}, 19 | {'attacker': DeleteAttack(), 'p': 0.1}, 20 | ] 21 | 22 | # {'attacker': ParaphraseAttack(), 'p': 0.2, 'apply_label': 1}, - needs too much GPU 23 | 24 | def __call__(self, text, labels): 25 | text = text.strip() 26 | 27 | applied_augs = [] 28 | for augmentation_step in self.attacks: 29 | if np.random.random() > augmentation_step['p']: 30 | continue 31 | 32 | if augmentation_step.get('pass_labels'): 33 | text = augmentation_step['attacker'].attack(text, labels) 34 | else: 35 | text = augmentation_step['attacker'].attack(text) 36 | applied_augs.append(type(augmentation_step['attacker']).__name__) 37 | 38 | n_auged = len(text.split()) 39 | 40 | if not sum(labels): 41 | labels_auged = [0] * n_auged 42 | else: 43 | first_zeros = 0 44 | for i in range(len(labels)): 45 | if labels[i] == 0: 46 | first_zeros += 1 47 | else: 48 | break 49 | last_zeros = 0 50 | for i in range(len(labels) - 1, -1, -1): 51 | if labels[i] == 0: 52 | last_zeros += 1 53 | else: 54 | break 55 | new_first_zeros = int(n_auged * first_zeros / len(labels)) 56 | new_last_zeros = int(n_auged * last_zeros / len(labels)) 57 | new_middle_ones = n_auged - new_first_zeros - new_last_zeros 58 | labels_auged = [0] * new_first_zeros + [1] * new_middle_ones + [0] * new_last_zeros 59 | 60 | return text, applied_augs, labels_auged 61 | -------------------------------------------------------------------------------- /detection/attacks/delete.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from nltk import pos_tag 4 | 5 | 6 | class DeleteAttack: 7 | def __init__(self, max_remove_words=5): 8 | self.max_remove_words = max_remove_words 9 | 10 | def remove_random_adjective(self, text): 11 | tokens = text.split() 12 | tagged_tokens = pos_tag(tokens) 13 | 14 | adjectives = [word for word, tag in tagged_tokens if tag in ('JJ', 'JJR', 'JJS')] 15 | 16 | if not adjectives: 17 | return ' '.join(tokens) 18 | 19 | adjective_to_remove = random.choice(adjectives) 20 | tokens.remove(adjective_to_remove) 21 | return ' '.join(tokens) 22 | 23 | def attack(self, text): 24 | n = random.randint(1, self.max_remove_words) 25 | for i in range(n): 26 | text = self.remove_random_adjective(text) 27 | 28 | 
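        # Note on the loop above: each pass removes at most one randomly chosen adjective; if no adjectives remain, remove_random_adjective returns the text unchanged, so the attack degrades gracefully on short inputs.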
return text 29 | -------------------------------------------------------------------------------- /detection/attacks/resources/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/detection/attacks/resources/.gitkeep -------------------------------------------------------------------------------- /detection/attacks/spelling.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import nltk 4 | import numpy as np 5 | 6 | nltk.download('punkt') 7 | nltk.download('averaged_perceptron_tagger') 8 | 9 | 10 | class SpellingAttack: 11 | def __init__(self, max_cycles=5): 12 | 13 | self.char_changes = [ 14 | {'name': 'typo_char_swap', 'p': 0.1}, 15 | {'name': 'typo_missing_char', 'p': 0.1}, 16 | {'name': 'typo_extra_char', 'p': 0.1}, 17 | {'name': 'typo_nearby_char', 'p': 0.1}, 18 | {'name': 'typo_similar_char', 'p': 0.1}, 19 | {'name': 'typo_skipped_space', 'p': 0.1}, 20 | {'name': 'typo_random_space', 'p': 0.1}, 21 | {'name': 'typo_repeated_char', 'p': 0.1}, 22 | {'name': 'typo_unichar', 'p': 0.1}, 23 | {'name': 'decapitalize_char', 'p': 0.1}, 24 | {'name': 'capitalize_char', 'p': 0.1}, 25 | ] 26 | 27 | self.max_cycles = max_cycles 28 | 29 | def decapitalize_char(self, text): 30 | capital_indices = [i for i, char in enumerate(text) if char.isupper()] 31 | if len(capital_indices) == 0: 32 | return text 33 | 34 | random_index = np.random.choice(capital_indices) 35 | 36 | modified_text = text[:random_index] + text[random_index].lower() + text[random_index + 1:] 37 | return modified_text 38 | 39 | def capitalize_char(self, text): 40 | lower_indices = [i for i, char in enumerate(text) if char.islower()] 41 | if len(lower_indices) == 0: 42 | return text 43 | 44 | random_index = np.random.choice(lower_indices) 45 | modified_text = text[:random_index] + text[random_index].upper() + text[random_index + 1:] 46 | return modified_text 47 | 48 | def attack(self, text): 49 | augs = [] 50 | n_repeated = random.randint(1, self.max_cycles) 51 | for i in range(n_repeated): 52 | augs += self.char_changes 53 | np.random.shuffle(augs) 54 | 55 | for augmentation_step in augs: 56 | if np.random.random() > augmentation_step['p']: 57 | continue 58 | 59 | if augmentation_step['name'] == 'decapitalize_char': 60 | text = self.decapitalize_char(text) 61 | elif augmentation_step['name'] == 'capitalize_char': 62 | text = self.capitalize_char(text) 63 | elif 'typo_' in augmentation_step['name']: 64 | error_type_name = augmentation_step['name'][5:] 65 | try: 66 | text = eval(f'typo.StrErrer(text).{error_type_name}().result') 67 | except: 68 | pass 69 | else: 70 | raise Exception("Unexpected augmentation name: {}".format(augmentation_step['name'])) 71 | 72 | return text 73 | -------------------------------------------------------------------------------- /detection/attacks/zero_width_space.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | class ZeroWidthSpaceAttack: 5 | def __init__(self, max_p=0.2): 6 | self.max_p = max_p 7 | 8 | def attack(self, text): 9 | cur_p = self.max_p * random.random() 10 | 11 | res = "" 12 | for word in text.split(): 13 | res += word 14 | if random.random() > cur_p: 15 | res += ' ' 16 | 17 | return res 18 | -------------------------------------------------------------------------------- /detection/base/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/detection/base/__init__.py -------------------------------------------------------------------------------- /detection/protocol.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 4 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 5 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 6 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 9 | # the Software. 10 | 11 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 12 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 13 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 14 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 15 | # DEALINGS IN THE SOFTWARE. 16 | 17 | import pydantic 18 | from typing import List, Optional 19 | import bittensor as bt 20 | 21 | from detection import __version__ 22 | 23 | 24 | class TextSynapse(bt.Synapse): 25 | """ 26 | A protocol representation which uses bt.Synapse as its base. 27 | This protocol helps in handling request and response communication between 28 | the miner and the validator. 29 | 30 | Attributes: 31 | - texts: List of texts that needs to be evaluated for AI generation 32 | - predictions: List of probabilities in response to texts 33 | 34 | """ 35 | 36 | texts: List[str] = pydantic.Field( 37 | ..., 38 | title="Texts", 39 | description="A list of texts to check. Immuatable.", 40 | allow_mutation=False, 41 | ) 42 | 43 | predictions: List[List[float]] = pydantic.Field( 44 | ..., 45 | title="Predictions", 46 | description="List of predicted probabilities. This attribute is mutable and can be updated.", 47 | ) 48 | 49 | version: str = "" 50 | 51 | def deserialize(self) -> float: 52 | """ 53 | Deserialize output. This method retrieves the response from 54 | the miner in the form of self.text, deserializes it and returns it 55 | as the output of the dendrite.query() call. 56 | 57 | Returns: 58 | - List[float]: The deserialized response, which in this case is the list of preidictions. 59 | """ 60 | return self 61 | -------------------------------------------------------------------------------- /detection/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import misc 3 | from . 
import uids 4 | -------------------------------------------------------------------------------- /detection/utils/misc.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI# Copyright © 2023 Opentensor Foundation 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import time 19 | import math 20 | import hashlib as rpccheckhealth 21 | from math import floor 22 | from typing import Callable, Any 23 | from functools import lru_cache, update_wrapper 24 | 25 | 26 | # LRU Cache with TTL 27 | def ttl_cache(maxsize: int = 128, typed: bool = False, ttl: int = -1): 28 | """ 29 | Decorator that creates a cache of the most recently used function calls with a time-to-live (TTL) feature. 30 | The cache evicts the least recently used entries if the cache exceeds the `maxsize` or if an entry has 31 | been in the cache longer than the `ttl` period. 32 | 33 | Args: 34 | maxsize (int): Maximum size of the cache. Once the cache grows to this size, subsequent entries 35 | replace the least recently used ones. Defaults to 128. 36 | typed (bool): If set to True, arguments of different types will be cached separately. For example, 37 | f(3) and f(3.0) will be treated as distinct calls with distinct results. Defaults to False. 38 | ttl (int): The time-to-live for each cache entry, measured in seconds. If set to a non-positive value, 39 | the TTL is set to a very large number, effectively making the cache entries permanent. Defaults to -1. 40 | 41 | Returns: 42 | Callable: A decorator that can be applied to functions to cache their return values. 43 | 44 | The decorator is useful for caching results of functions that are expensive to compute and are called 45 | with the same arguments frequently within short periods of time. The TTL feature helps in ensuring 46 | that the cached values are not stale. 
47 | 48 | Example: 49 | @ttl_cache(ttl=10) 50 | def get_data(param): 51 | # Expensive data retrieval operation 52 | return data 53 | """ 54 | if ttl <= 0: 55 | ttl = 65536 56 | hash_gen = _ttl_hash_gen(ttl) 57 | 58 | def wrapper(func: Callable) -> Callable: 59 | @lru_cache(maxsize, typed) 60 | def ttl_func(ttl_hash, *args, **kwargs): 61 | return func(*args, **kwargs) 62 | 63 | def wrapped(*args, **kwargs) -> Any: 64 | th = next(hash_gen) 65 | return ttl_func(th, *args, **kwargs) 66 | 67 | return update_wrapper(wrapped, func) 68 | 69 | return wrapper 70 | 71 | 72 | def _ttl_hash_gen(seconds: int): 73 | """ 74 | Internal generator function used by the `ttl_cache` decorator to generate a new hash value at regular 75 | time intervals specified by `seconds`. 76 | 77 | Args: 78 | seconds (int): The number of seconds after which a new hash value will be generated. 79 | 80 | Yields: 81 | int: A hash value that represents the current time interval. 82 | 83 | This generator is used to create time-based hash values that enable the `ttl_cache` to determine 84 | whether cached entries are still valid or if they have expired and should be recalculated. 85 | """ 86 | start_time = time.time() 87 | while True: 88 | yield floor((time.time() - start_time) / seconds) 89 | 90 | 91 | # 12 seconds updating block. 92 | @ttl_cache(maxsize=1, ttl=12) 93 | def ttl_get_block(self) -> int: 94 | """ 95 | Retrieves the current block number from the blockchain. This method is cached with a time-to-live (TTL) 96 | of 12 seconds, meaning that it will only refresh the block number from the blockchain at most every 12 seconds, 97 | reducing the number of calls to the underlying blockchain interface. 98 | 99 | Returns: 100 | int: The current block number on the blockchain. 101 | 102 | This method is useful for applications that need to access the current block number frequently and can 103 | tolerate a delay of up to 12 seconds for the latest information. By using a cache with TTL, the method 104 | efficiently reduces the workload on the blockchain interface. 105 | 106 | Example: 107 | current_block = ttl_get_block(self) 108 | 109 | Note: self here is the miner or validator instance 110 | """ 111 | return self.subtensor.get_current_block() 112 | -------------------------------------------------------------------------------- /detection/utils/uids.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import bittensor as bt 4 | from typing import List 5 | 6 | 7 | def check_uid_availability( 8 | metagraph: "bt.metagraph.Metagraph", uid: int, vpermit_tao_limit: int 9 | ) -> bool: 10 | """Check if uid is available. The UID should be available if it is serving and has less than vpermit_tao_limit stake 11 | Args: 12 | metagraph (:obj: bt.metagraph.Metagraph): Metagraph object 13 | uid (int): uid to be checked 14 | vpermit_tao_limit (int): Validator permit tao limit 15 | Returns: 16 | bool: True if uid is available, False otherwise 17 | """ 18 | 19 | # Filter non serving axons. 20 | if not metagraph.axons[uid].is_serving: 21 | return False 22 | 23 | # Filter validator permit > 1024 stake. 24 | if metagraph.validator_permit[uid]: 25 | if metagraph.S[uid] > vpermit_tao_limit: 26 | return False 27 | 28 | # Available otherwise. 29 | return True 30 | 31 | 32 | def get_random_uids( 33 | self, k: int, exclude: List[int] = None 34 | ) -> torch.LongTensor: 35 | """Returns k available random uids from the metagraph. 36 | Args: 37 | k (int): Number of uids to return. 
38 | exclude (List[int]): List of uids to exclude from the random sampling. 39 | Returns: 40 | uids (torch.LongTensor): Randomly sampled available uids. 41 | Notes: 42 | If `k` is larger than the number of available `uids`, set `k` to the number of available `uids`. 43 | """ 44 | candidate_uids = [] 45 | avail_uids = [] 46 | 47 | for uid in range(self.metagraph.n.item()): 48 | 49 | uid_is_available = check_uid_availability( 50 | self.metagraph, uid, self.config.neuron.vpermit_tao_limit 51 | ) 52 | uid_is_not_excluded = exclude is None or uid not in exclude 53 | 54 | if uid_is_available: 55 | avail_uids.append(uid) 56 | if uid_is_not_excluded: 57 | candidate_uids.append(uid) 58 | 59 | # Check if candidate_uids contain enough for querying, if not grab all avaliable uids 60 | available_uids = candidate_uids 61 | 62 | # If k is larger than the number of available uids, set k to the number of available uids. 63 | k = min(k, len(available_uids)) 64 | uids = torch.tensor(random.sample(available_uids, k)) 65 | return uids 66 | -------------------------------------------------------------------------------- /detection/utils/weight_version.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def version_to_tuple(version): 5 | return tuple(map(int, version.split('.'))) 6 | 7 | 8 | def is_valid_version_format(version): 9 | return bool(re.match(r'^\d+\.\d+\.\d+$', version)) 10 | 11 | 12 | def is_version_in_range(version, version1, version2): 13 | if not is_valid_version_format(version): 14 | return False 15 | 16 | v = version_to_tuple(version) 17 | v1 = version_to_tuple(version1) 18 | v2 = version_to_tuple(version2) 19 | 20 | if v1 > v2: 21 | v1, v2 = v2, v1 22 | 23 | return v1 <= v <= v2 -------------------------------------------------------------------------------- /detection/validator/__init__.py: -------------------------------------------------------------------------------- 1 | from .forward import forward 2 | from .reward import reward 3 | -------------------------------------------------------------------------------- /detection/validator/generate_version.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def version_to_tuple(version): 5 | return tuple(map(int, version.split('.'))) 6 | 7 | 8 | def tuple_to_version(version_tuple): 9 | return '.'.join(map(str, version_tuple)) 10 | 11 | 12 | def generate_random_version(version1, version2): 13 | v1 = version_to_tuple(version1) 14 | v2 = version_to_tuple(version2) 15 | 16 | if v1 > v2: 17 | v1, v2 = v2, v1 18 | 19 | def random_version_near(v): 20 | return tuple( 21 | max(v[i] + random.choice([-1, 1]), 0) 22 | if random.random() > 0.5 else v[i] 23 | for i in range(len(v)) 24 | ) 25 | 26 | def is_in_range(v): 27 | return v1 <= v <= v2 28 | 29 | while True: 30 | random_near_v1 = random_version_near(v1) 31 | if not is_in_range(random_near_v1): 32 | return tuple_to_version(random_near_v1) 33 | 34 | random_near_v2 = random_version_near(v2) 35 | if not is_in_range(random_near_v2): 36 | return tuple_to_version(random_near_v2) 37 | -------------------------------------------------------------------------------- /detection/validator/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ValDataRow(BaseModel): 5 | text: str 6 | text_auged: str | None = None 7 | label: bool 8 | segmentation_labels: list[bool] 9 | auged_segmentation_labels: list[bool] 
10 | prompt: str | None = None 11 | data_source: str | None = None 12 | model_name: str | None = None 13 | model_params: dict | None = None 14 | topic: str | None = None 15 | 16 | augmentations: list[str] = [] 17 | 18 | -------------------------------------------------------------------------------- /detection/validator/my_datasets.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import random 4 | import time 5 | from abc import abstractmethod 6 | from pathlib import Path 7 | 8 | import bittensor as bt 9 | import numpy as np 10 | from datasets import load_dataset 11 | from collections.abc import Iterator 12 | 13 | from detection.validator.cc_dataset import CCDataset, get_2023_dumps 14 | from neurons.miners.deberta_classifier import DebertaClassifier 15 | 16 | PILE_COUNT = 80 17 | CC_COUNT = 40 18 | PILE_PROB = PILE_COUNT / (PILE_COUNT + CC_COUNT) 19 | 20 | 21 | class TextDataset(Iterator): 22 | def __init__(self, max_prompt_len, text_field): 23 | super().__init__() 24 | self.max_prompt_len = max_prompt_len 25 | self.text_field = text_field 26 | self.name = 'CommonCrawlDataset' if text_field == 'raw_content' else 'PileDataset' 27 | self.dataset = self.init_dataset() 28 | 29 | @abstractmethod 30 | def get_iter(self): 31 | ... 32 | 33 | def filter_rules_pass(self, prompt, completion): 34 | if random.random() > 0.01: 35 | return False 36 | return True 37 | 38 | def init_dataset(self): 39 | try: 40 | dataset = self.get_iter() 41 | return dataset 42 | except Exception as e: 43 | bt.logging.error("Got exception during {} dataset initializing: {}, retrying...".format(self.name, e)) 44 | time.sleep(60) 45 | return self.init_dataset() 46 | 47 | def __next__(self): 48 | while True: 49 | try: 50 | el = next(self.dataset) 51 | el[self.text_field] = el[self.text_field].replace('\x00', '') 52 | 53 | document_text = el[self.text_field][:int(self.max_prompt_len * 1.25)] 54 | context_len = int(len(document_text) * np.random.uniform(0.25, 0.75)) 55 | prompt = document_text[:context_len] 56 | completion = el[self.text_field][context_len:] 57 | 58 | if not self.filter_rules_pass(prompt, completion): 59 | continue 60 | 61 | return {'prompt': prompt, 'real_completion': completion} 62 | except Exception as e: 63 | if type(e) == StopIteration: 64 | bt.logging.info(f'{self.name} with ended: reinitializing it') 65 | else: 66 | bt.logging.error("Got exception during loading data from {}, reinitializing it: {}".format(self.name, e)) 67 | bt.logging.exception(e) 68 | 69 | self.dataset = self.init_dataset() 70 | continue 71 | 72 | 73 | class PileDataset(TextDataset): 74 | def __init__(self, max_prompt_len): 75 | super().__init__(max_prompt_len, 'text') 76 | 77 | def get_iter(self): 78 | seed = int(time.time()) 79 | dataset = iter( 80 | load_dataset("monology/pile-uncopyrighted", streaming=True)['train'].shuffle( 81 | seed=seed, buffer_size=100000 82 | ) 83 | ) 84 | return dataset 85 | 86 | 87 | class CommonCrawlDataset(TextDataset): 88 | def __init__(self, max_prompt_len): 89 | self.dumps_2023 = get_2023_dumps() 90 | logging.info(f"Found {len(self.dumps_2023)} dumps from 2023: {self.dumps_2023}") 91 | super().__init__(max_prompt_len, 'raw_content') 92 | 93 | def get_iter(self): 94 | seed = int(time.time()) 95 | random.seed(seed) 96 | logging.info('Using seed {}'.format(seed)) 97 | dataset = CCDataset( 98 | dumps=self.dumps_2023, 99 | num_segments=10, 100 | lang_model=Path("cc_net/bin/lid.bin"), 101 | lm_dir=Path("cc_net/data/lm_sp/"), 102 | 
lang_whitelist=['en'], 103 | lang_threshold=0.5, 104 | min_len=300, 105 | cache_dir=None, 106 | tmp_dir=Path("cc_net/tmp_segments"), 107 | ) 108 | return dataset 109 | 110 | def filter_rules_pass(self, prompt, completion): 111 | if random.random() > 0.1: 112 | return False 113 | return True 114 | 115 | 116 | class HumanDataset(Iterator): 117 | def __init__(self, max_prompt_len=1500): 118 | super().__init__() 119 | self.pile_dataset = PileDataset(max_prompt_len) 120 | self.common_crawl = CommonCrawlDataset(max_prompt_len) 121 | 122 | def __next__(self) -> dict: 123 | res = {} 124 | if random.random() < PILE_PROB: 125 | el = next(self.pile_dataset) 126 | res['data_source'] = 'pile' 127 | else: 128 | el = next(self.common_crawl) 129 | res['data_source'] = 'common_crawl' 130 | 131 | res['text'] = el['real_completion'] 132 | return res 133 | 134 | 135 | class PromptDataset(Iterator): 136 | def __init__(self, max_prompt_len=1500): 137 | super().__init__() 138 | self.pile_dataset = PileDataset(max_prompt_len) 139 | self.common_crawl = CommonCrawlDataset(max_prompt_len) 140 | self.max_prompt_len = max_prompt_len 141 | 142 | def __next__(self) -> dict: 143 | while True: 144 | res = {} 145 | if random.random() < PILE_PROB: 146 | el = next(self.pile_dataset) 147 | res['data_source'] = 'pile' 148 | else: 149 | el = next(self.common_crawl) 150 | res['data_source'] = 'common_crawl' 151 | 152 | if len(el['prompt']) > self.max_prompt_len: 153 | bt.logging.info("Prompt has len {}, truncating it to {} chars".format(len(el['prompt']), self.max_prompt_len)) 154 | 155 | res['prompt'] = el["prompt"][:self.max_prompt_len] 156 | if res['prompt'].strip(): 157 | return res 158 | 159 | 160 | if __name__ == '__main__': 161 | dataset = HumanDataset() 162 | print(next(dataset)) 163 | 164 | dataset = PromptDataset() 165 | for i in range(2): 166 | print(next(dataset)) 167 | -------------------------------------------------------------------------------- /detection/validator/segmentation_processer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | HUMAN_THEN_AI_PERCENT = 40 6 | AI_PERCENT = 25 7 | 8 | class SegmentationProcesser: 9 | def __init__(self, ): 10 | pass 11 | 12 | def merge_prompt_text(self, prompt, text): 13 | now = {} 14 | el = {'prompt': prompt, 'text': text} 15 | if not prompt: 16 | raise Exception("There is should be a prompt during merging") 17 | 18 | if np.random.random() < HUMAN_THEN_AI_PERCENT / (HUMAN_THEN_AI_PERCENT + AI_PERCENT): 19 | now['text'] = el['prompt'] + el['text'] 20 | now['cnt_first_human'] = len(el['prompt'].split()) 21 | else: 22 | now['cnt_first_human'] = 0 23 | now['text'] = el['text'] 24 | 25 | return now['text'], now['cnt_first_human'] 26 | 27 | def subsample_words(self, text, labels, min_cnt=35, max_cnt=350): 28 | words = text.split() 29 | if len(words) <= min_cnt: 30 | return ' '.join(words), labels 31 | 32 | cnt = random.randint(min_cnt, min(max_cnt, len(words))) 33 | 34 | has_01 = False 35 | has_10 = False 36 | 37 | for i in range(len(labels) - 1): 38 | if labels[i] == 0 and labels[i + 1] == 1: 39 | has_01 = True 40 | if labels[i] == 1 and labels[i + 1] == 0: 41 | has_10 = True 42 | 43 | if has_01 and has_10: 44 | # if random.random() < 0.5: 45 | # currently we always take ai the first and then human 46 | ind = None 47 | for i in range(len(labels) - 1): 48 | if labels[i] == 0 and labels[i + 1] == 1: 49 | ind = i + 1 50 | break 51 | return self.subsample_words(' '.join(words[ind:]), 
labels[ind:]) 52 | # else: 53 | # ind = None 54 | # for i in range(len(labels) - 1): 55 | # if labels[i] == 1 and labels[i + 1] == 0: 56 | # ind = i + 1 57 | # break 58 | # return self.subsample_words(' '.join(words[:ind]), labels[:ind]) 59 | 60 | split_index = None 61 | for i in range(len(labels) - 1): 62 | if labels[i] != labels[i + 1]: 63 | split_index = i 64 | break 65 | 66 | if split_index is not None: # for two class case 67 | ind = random.randint(max(split_index - cnt, 0), min(len(words) - cnt, split_index)) 68 | else: # for one class case 69 | ind = random.randint(0, len(words) - cnt) 70 | 71 | res = words[ind:ind + cnt] 72 | labels = labels[ind:ind + cnt] 73 | 74 | if random.random() > 0.5 and len(res): 75 | sent_ind = random.randint(0, len(res[0]) - 1) 76 | res[0] = res[0][sent_ind:] 77 | 78 | if random.random() > 0.5: 79 | sent_ind = random.randint(0, len(res[-1]) - 1) 80 | res[-1] = res[-1][:sent_ind] 81 | 82 | return ' '.join(res), labels 83 | -------------------------------------------------------------------------------- /detection/validator/text_completion.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import time 4 | 5 | import bittensor as bt 6 | import numpy as np 7 | import requests 8 | from langchain_ollama.llms import OllamaLLM 9 | 10 | from detection.validator.text_postprocessing import TextCleaner 11 | 12 | 13 | class OllamaModel: 14 | def __init__(self, model_name, num_predict=900, base_url="http://127.0.0.1:11434", in_the_middle_generation=False): 15 | """ 16 | available models you can find on https://github.com/ollama/ollama 17 | before running model install ollama and run 'ollama pull ' 18 | """ 19 | self.model_name = model_name 20 | self.base_url = base_url 21 | self.num_predict = num_predict 22 | self.in_the_middle_generation = in_the_middle_generation 23 | 24 | bt.logging.info(f'Initializing OllamaModel {model_name}') 25 | if num_predict > 1000: 26 | raise Exception("You're trying to set num_predict to more than 1000, it can lead to context overloading and Ollama hanging") 27 | 28 | pulled_models = [el['name'] for el in self.ollama_list()['models']] if self.ollama_list() is not None else [] 29 | if model_name not in pulled_models and model_name + ':latest' not in pulled_models: 30 | bt.logging.info("Model {} cannot be found locally - downloading it...".format(model_name)) 31 | self.ollama_pull(model_name) 32 | bt.logging.info("Successfully downloaded {}".format(model_name)) 33 | else: 34 | bt.logging.info("Found model {} locally, pulling in case of updates".format(model_name)) 35 | self.ollama_pull(model_name) 36 | 37 | self.model = None 38 | self.params = {} 39 | self.init_model() 40 | 41 | self.text_cleaner = TextCleaner() 42 | 43 | def ollama_list(self): 44 | req = requests.get('{}/api/tags'.format(self.base_url)) 45 | return req.json() 46 | 47 | def ollama_pull(self, model_name): 48 | req = requests.post('{}/api/pull'.format(self.base_url), json={'model': model_name}) 49 | 50 | def init_model(self): 51 | # sapmling order in ollama: top_k, tfs_z, typical_p, top_p, min_p, temperature 52 | sampling_temperature = np.clip(np.random.normal(loc=1, scale=0.3), a_min=0, a_max=2) 53 | # Centered around 1 because that's what's hardest for downstream classification models. 
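        # Note: np.clip keeps the sampled temperature within [0, 2]; the frequency-penalty, top_k and top_p draws below further diversify generations between requests.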
54 | 55 | frequency_penalty = np.random.uniform(low=0.7, high=1.6) 56 | top_k = int(np.random.choice([-1, 20, 40, 80])) 57 | # top_k = top_k if top_k != -1 else None 58 | top_p = np.random.uniform(low=0.5, high=1) 59 | 60 | if random.random() < 0.1: 61 | # greedy strategy 62 | sampling_temperature = 0 63 | 64 | self.model = OllamaLLM(model=self.model_name, 65 | base_url=self.base_url, 66 | timeout=200, 67 | num_thread=1, 68 | num_predict=self.num_predict, 69 | temperature=sampling_temperature, 70 | repeat_penalty=frequency_penalty, 71 | top_p=top_p, 72 | top_k=top_k, 73 | ) 74 | 75 | self.params = {'top_k': top_k, 'top_p': top_p, 'temperature': sampling_temperature, 'repeat_penalty': frequency_penalty} 76 | 77 | def __call__(self, prompt: str, text_completion_mode=False) -> str | None: 78 | while True: 79 | try: 80 | if text_completion_mode: 81 | if 'text' not in self.model_name: 82 | system_message = "You're a text completion model, just complete text that user sended you" # . Return text without any supportive - we write add your result right after the user text 83 | text = self.model.invoke([{'role': 'system', 'content': system_message}, 84 | {'role': 'user', 'content': prompt}]) 85 | else: 86 | text = self.model.invoke(prompt) 87 | else: 88 | assert 'text' not in self.model_name 89 | text = self.model.invoke(prompt) 90 | 91 | return self.text_cleaner.clean_text(text) 92 | except Exception as e: 93 | bt.logging.info("Couldn't get response from Ollama, probably it's restarting now: {}".format(e)) 94 | time.sleep(1) 95 | 96 | def classic_invoke(self, messages: list[dict]) -> str | None: 97 | while True: 98 | try: 99 | return self.model.invoke(messages) 100 | except Exception as e: 101 | bt.logging.info("Couldn't get response from Ollama, probably it's restarting now: {}".format(e)) 102 | time.sleep(1) 103 | 104 | def __repr__(self) -> str: 105 | return f"{self.model_name}" 106 | 107 | 108 | if __name__ == '__main__': 109 | bt.logging.info("started") 110 | model = OllamaModel('llama2') 111 | bt.logging.info("finished") 112 | print(model.model) 113 | -------------------------------------------------------------------------------- /detection/validator/text_postprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | 5 | 6 | class TextCleaner: 7 | def __init__(self): 8 | pass 9 | 10 | def _remove_emoji(self, text: str) -> str: 11 | # remove emojies 12 | emoji_pattern = re.compile("[" 13 | u"\U0001F600-\U0001F64F" # emoticons 14 | u"\U0001F300-\U0001F5FF" # symbols & pictographs 15 | u"\U0001F680-\U0001F6FF" # transport & map symbols 16 | u"\U0001F1E0-\U0001F1FF" # flags (iOS) 17 | "]+", flags=re.UNICODE) 18 | 19 | text = emoji_pattern.sub(r'', text) 20 | return text 21 | 22 | def _remove_subtext(self, text: str) -> str: 23 | # remove words like *smiling*, *adjusts glasses*, etc 24 | last = None 25 | mask = np.ones(len(text)) 26 | for i, c in enumerate(text): 27 | if c == '*': 28 | if last is None or (i - last) > 50: 29 | last = i 30 | else: 31 | mask[last:i + 1] = 0 32 | last = None 33 | return ''.join([c for i, c in enumerate(text) if mask[i]]) 34 | 35 | def clean_text(self, text: str) -> str: 36 | text = text.strip() 37 | text = self._remove_emoji(text) 38 | text = self._remove_subtext(text) 39 | return text -------------------------------------------------------------------------------- /docs/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Miners 
4 | ### Is there a leaderboard where I can see the performance rankings of miners? 5 | Yes, we do have a leaderboard based on OTF scores: https://huggingface.co/spaces/Infin/ai-detection-leaderboard 6 | 7 | ### Do you have a wandb? 8 | Yes, https://wandb.ai/itsai-dev/subnet32 9 | 10 | ### I deployed my miner, but it hasn't received any requests 11 | It's okay to have no requests in the first several hours - validators need some time to see your miner. 12 | If you have already waited 2-3 hours and still have no queries, check that you've properly exposed your ports to the internet. 13 | 14 | ### The miner has been running for 9 hours, but taostats shows that emissions are still 0 15 | If you get queries from validators, then just wait for several hours (around 5-6) and incentive should show up. 16 | If you don't, check the question above about not receiving requests. 17 | 18 | ### Import issues and errors inside installed python packages 19 | If you see an error and it's happening inside another library, most probably you haven't installed the right version of it. 20 | Try to install the correct packages from requirements.txt inside a new venv and rerun your miner. 21 | We also suggest using Python 3.10. 22 | 23 | ### How can I evaluate the competitiveness of a model in this network without registration? Is there a local evaluator or something? 24 | You can collect data the same way the validator does using detection/data_generator.py and validate your model on it locally. 25 | Another way is to run the model on testnet and check its scores in wandb. 26 | 27 | ### Is the baseline still valid for the miner? 28 | The baseline model is not competitive enough to survive; it may not even pass the minimum out_of_domain_f1_score (which was selected based on the scores of currently active miners). 29 | 30 | ### Why am I getting zero scores from validators sometimes? 31 | Here are some possible ways to get a zero score: 32 | a) you answered the validator with the wrong version 33 | b) your miner's answers on a small batch of texts don't match its predictions on the same texts in a big batch 34 | c) your mean f1-score on the out-of-domain validation is less than 0.9 35 | d) there may have been an internet problem on your side or the validator's side and you didn't receive the request 36 | 37 | ### How can I improve the quality of the model? What models should I use to be in the top? 38 | It's the goal of every miner to find a way to be better and stay at the top. Even we don't know which models are the best on our subnet at the moment. 39 | 40 | ### Is the testnet validator running? 41 | Most probably yes - it should be running on SN87 on testnet. 42 | 43 | ## Validators 44 | 45 | ### The CommonCrawlDataset has rate limits and randomly stops working. Is there any alternative? 46 | ![img.png](faq_1.png) 47 | 48 | It works fine if you're not running several instances (of the data generator or validators) at the same time, because they overwrite files in "cc_net/tmp_segments" 49 | 50 | -------------------------------------------------------------------------------- /docs/faq_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/faq_1.png -------------------------------------------------------------------------------- /docs/incentive.md: -------------------------------------------------------------------------------- 1 | # Incentive mechanism 2 | 3 | For validating we use two types of data, balanced in a 1:1 proportion.
4 | 5 | ### Human-written texts 6 | To gather human-written validation data we use the Pile dataset. 7 | 8 | The Pile is an 825 GiB diverse, open-source language-modelling dataset that consists of 22 smaller, high-quality datasets combined together. It includes web-crawled data, financial, medical, law, arXiv and GitHub data, covering about 15 different topics. 9 | 10 | ### AI-generated texts 11 | For AI-generated text collection, we need to obtain prompts and then generate texts based on these prompts. While for human texts we take samples from the Pile dataset, we have to generate AI samples from the same data source, so that the only difference between them is whether they are human- or AI-written. 12 | 13 | So, as prompts we take a random sample, use part of it as the beginning of the text and ask LLMs to generate a completion for it. 14 | 15 | We use the Ollama GitHub repository to run Large Language Models and generate completions for these prompts. As LLMs we use 30+ SOTA models from the top of LLM-Arena. 16 | 17 | We also randomly select generation parameters for the LLM during validation to make the dataset more diverse. 18 | 19 | ### Data augmentation to prevent cheating 20 | To prevent miners from memorizing the Pile dataset and to make the task more robust to overfitting, we add some augmentations to both AI-generated and human-written texts. First of all, we select a random sequence of consecutive sentences from a given text. Then, in a random place (or two), we add a misspelling (about 10 different char-based augmentations) or remove a random adjective. 21 | 22 | These augmentations don't allow miners to precalculate hashes on the Pile dataset and then use them to determine whether a text is present in the human set of data or not. 23 | 24 | ## Reward counting 25 | Based on the [Detecting LLM-Generated Text in Computing Education](https://arxiv.org/pdf/2307.07411.pdf) 26 | article, we decided to divide our reward into 3 parts: 27 | 28 | #### F1 score 29 | We decided to use it instead of classic accuracy because 30 | it better represents the quality of a model, especially on binary-classification tasks. 31 | 32 | #### False Positive score 33 | FP_score = 1 - FP / len(samples). 34 | 35 | It is usually more important not to mistakenly classify human-written text as AI-generated than the other way around. 36 | It is preferable to tolerate a few more instances of student cheating or read some AI-generated emails than to wrongly penalize a real student or miss an important letter. 37 | 38 | #### AP score 39 | AP summarizes a precision-recall curve by calculating the weighted mean of precisions achieved at each threshold. 40 | This allows us to evaluate the quality of the model's ranking. 41 | 42 | 43 | The final reward is the average of these three values.
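The snippet below is a minimal sketch of this reward, for illustration only - it is not the exact code from detection/validator/reward.py. It assumes binary labels (1 = AI-generated, 0 = human-written), one predicted probability per text, and uses scikit-learn's f1_score and average_precision_score.

```python
# Illustrative sketch of the three-part reward (assumed names; not the validator's exact implementation).
import numpy as np
from sklearn.metrics import average_precision_score, f1_score


def reward_sketch(y_true, y_prob, threshold=0.5):
    """y_true: 1 = AI-generated, 0 = human-written; y_prob: predicted probability of being AI-generated."""
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = (y_prob >= threshold).astype(int)

    # F1 score of the thresholded predictions.
    f1 = f1_score(y_true, y_pred)

    # False Positive score: 1 - FP / len(samples), punishing human texts flagged as AI.
    false_positives = int(np.sum((y_true == 0) & (y_pred == 1)))
    fp_score = 1 - false_positives / len(y_true)

    # Average precision summarizes the precision-recall curve over all thresholds.
    ap = average_precision_score(y_true, y_prob)

    return (f1 + fp_score + ap) / 3
```

Under this formulation a miner only approaches the maximum reward when it ranks texts well (AP), classifies them accurately (F1) and rarely flags human-written texts as AI-generated (FP score).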
44 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/logo.png -------------------------------------------------------------------------------- /docs/meet_its_ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/meet_its_ai.png -------------------------------------------------------------------------------- /docs/miner_solution.md: -------------------------------------------------------------------------------- 1 | 2 | ## Perplexity approach 3 | 4 | We made a solid baseline solution based on counting [perplexity of fixed-length models](https://huggingface.co/docs/transformers/perplexity). 5 | For counting PPL we use a fresh phi-2 model from microsoft, which has been released at the end of 2023. 6 | We also trained a linear model on the phi-2 outputs, to make probabilities more representative. 7 | 8 | On our local validation with baseline model got overall accuracy about 89%, you can find accuracy per data source below: 9 | 10 | | Data Source | Accuracy | 11 | |---------------------------|----------| 12 | | LLM (gemma:7b) | 0.939 | 13 | | LLM (neural-chat) | 0.856 | 14 | | LLM (zephyr:7b-beta) | 0.964 | 15 | | LLM (vicuna) | 0.981 | 16 | | LLM (mistral) | 0.963 | 17 | | Human-data | 0.841 | -------------------------------------------------------------------------------- /docs/mining.md: -------------------------------------------------------------------------------- 1 | # ⛏️ Mining 2 | 3 | ## FAQ 4 | 5 | We've collected some frequently asked questions in the Discord Channel and made a FAQ page, hope this help you to run your miners easier. We'll be updating it with fresh questions as they appear: 6 | 7 | https://piquant-door-af5.notion.site/FAQ-0de42be01aa948c08cbfe982f2112aa8?pvs=4 8 | 9 | ## System Requirements 10 | 11 | Miners will need enough processing power to inference models. The device the models are inferenced on is recommended to be a GPU (atleast NVIDIA RTX A4000) with minimum 16 GB of VRAM. 12 | 13 | 14 | ## Installation 15 | 16 | 1. Clone the repo 17 | 18 | ```bash 19 | apt update && apt upgrade -y 20 | git clone https://github.com/It-s-AI/llm-detection 21 | ``` 22 | 23 | 2. Setup your python [virtual environment](https://docs.python.org/3/library/venv.html) or [Conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands). 24 | 25 | 3. Install the requirements. From your virtual environment, run 26 | ```shell 27 | cd llm-detection 28 | python -m pip install -e . 29 | ``` 30 | 31 | 4. Download models for LLM classification 32 | ```commandline 33 | wget https://huggingface.co/sergak0/sn32/resolve/main/deberta-large-ls03-ctx1024.pth -O models/deberta-large-ls03-ctx1024.pth 34 | wget https://huggingface.co/sergak0/sn32/resolve/main/deberta-v3-large-hf-weights.zip -O models/deberta-v3-large-hf-weights.zip 35 | apt install zip unzip 36 | unzip models/deberta-v3-large-hf-weights.zip -d models/deberta-v3-large-hf-weights 37 | ``` 38 | 39 | 4. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 
40 | 41 | ```bash 42 | btcli w new_coldkey 43 | btcli w new_hotkey 44 | btcli s register --netuid 32 --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY 45 | ``` 46 | 47 | 5. (Optional) Run a Subtensor instance: 48 | Your node will run better if you are connecting to a local Bittensor chain entrypoint node rather than using Opentensor's. 49 | We recommend running a local node as follows and passing the ```--subtensor.network local``` flag to your running miners/validators. 50 | To install and run a local subtensor node follow the commands below with Docker and Docker-Compose previously installed. 51 | ```bash 52 | git clone https://github.com/opentensor/subtensor.git 53 | cd subtensor 54 | docker compose up --detach 55 | ``` 56 | 57 | ## Running the Miner 58 | 59 | 60 | 61 | > **Note:** Recently, the public RPC endpoint has been under high load, so it's strongly advised that you use your local Subtensor instance! 62 | 63 | 64 | Install PM2 and the jq package on your system. 65 | ```bash 66 | sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm install pm2 -g && pm2 update 67 | ``` 68 | 69 | To start your miner basic command is 70 | 71 | ```bash 72 | pm2 start --name net32-miner --interpreter python3 ./neurons/miner.py -- --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY --neuron.device cuda:0 --axon.port 70000 73 | ``` 74 | 75 | ## Running the Miner on TESTNET 76 | 77 | We have testnet subnet with netuid **87**. There is our validator running with uid 52 and hotkey `5Eo4PQvU4fhGLhk91UKpAaaEH59aHsVsw2jZ6ZhRT12s6JRA`. 78 | 79 | To start miner on testnet you have to run the following command 80 | 81 | ```bash 82 | pm2 start --name net32-miner --interpreter python3 ./neurons/miner.py -- --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY --neuron.device cuda:0 --axon.port 70000 --subtensor.network test --netuid 87 --blacklist.minimum_stake_requirement 0 83 | ``` 84 | 85 | > IMPORTANT: you should set `blacklist.minimum_stake_requirement` argument to 0 so our validator won't get blacklisted 86 | -------------------------------------------------------------------------------- /docs/raid_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/raid_leaderboard.png -------------------------------------------------------------------------------- /docs/validating.md: -------------------------------------------------------------------------------- 1 | # 🧑‍🏫 Validating 2 | 3 | # System Requirements 4 | 5 | Validators will need enough processing power to inference multiple models. It is required to have a GPU (we commend NVIDIA A100) with minimum 80GB of VRAM. 6 | Also you need to have at least 1T of disk space. 7 | 8 | ## Installation 9 | 10 | Make sure that your server provider support systemd (RunPod doesn't support it). 11 | Otherwise ollama service won't be restarting automatically and you'll have to restart it on your own from time to time. 12 | 13 | 1. Clone the repo 14 | 15 | ```bash 16 | apt update && apt upgrade -y 17 | git clone https://github.com/It-s-AI/llm-detection 18 | ``` 19 | 20 | 2. Setup your python [virtual environment](https://docs.python.org/3/library/venv.html) or [Conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands). 21 | 22 | 3. Install the requirements. 
From your virtual environment, run 23 | ```shell 24 | cd llm-detection 25 | python3 -m pip install -e . 26 | python3 -m pip uninstall mathgenerator -y 27 | python3 -m pip install git+https://github.com/synapse-alpha/mathgenerator.git 28 | ``` 29 | 30 | 4. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 31 | 32 | ```bash 33 | btcli w new_coldkey 34 | btcli w new_hotkey 35 | btcli s register --netuid 32 --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY 36 | ``` 37 | 38 | ## Install driver 39 | 40 | Install PM2 and the jq package on your system. 41 | ```bash 42 | sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm install pm2 -g && pm2 update 43 | ``` 44 | 45 | Make `run.sh` file executable. 46 | ```bash 47 | chmod +x run.sh 48 | ``` 49 | 50 | So Ollama models can detect GPUs on your system 51 | ```bash 52 | apt update 53 | apt install lshw -y 54 | ``` 55 | 56 | ## Download models 57 | 58 | Install Ollama 59 | ```bash 60 | curl -fsSL https://ollama.com/install.sh | sh 61 | ``` 62 | 63 | Run ollama service in background (make sure that you don't have any running instances of ollama before running this command) 64 | ``` 65 | pm2 start --name ollama "ollama serve" 66 | ``` 67 | 68 | If you want to update your pulled models run this: 69 | ``` 70 | ollama list | tail -n +2 | awk '{print $1}' | while read -r model; do 71 | ollama pull $model 72 | done 73 | ``` 74 | 75 | Install cc_net 76 | ```bash 77 | sudo apt-get install build-essential libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev zip unzip -y 78 | pip install -e . 79 | ``` 80 | 81 | ## Running the Validator 82 | Note (from bittensor docs): the validator needs to serve an Axon with their IP or they may be blacklisted by the firewall of serving peers on the network. 83 | 84 | If you want to properly serve your Axon you need to change --axon.port from 70000 to a real one. 85 | 86 | ```bash 87 | pm2 start run.sh --name llm_detection_validators_autoupdate -- --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY --axon.port 70000 --neuron.device cuda:0 88 | ``` 89 | 90 | -------------------------------------------------------------------------------- /docs/vision_and_roadmap.md: -------------------------------------------------------------------------------- 1 | # Vision & Roadmap 2 | 3 | At the moment, many subnets have tasks for which they have implemented SOTA models in their miner codes to instantly achieve high quality. For such tasks, implementing better solutions could give miners only basis points of improvement. Almost no room to grow. 4 | 5 | But our subnet is different. AI detection is a hard task to achieve high quality. That's why we aimed not just make "marketplace for inference SOTA models" as other subnets did but rather to create a constantly evolving environment where miners have to get better over time and not just run the same models for months. 6 | 7 | In order to implement such an environment, we need to do the following. 8 | 9 | ## Validators 10 | 11 | Currently, validators use one large dataset with human data and two models (mistral and vicuna) to generate AI texts. What could be done to improve that: 12 | 13 | 0. Use softmax on miners' scores for higher miners motivation 14 | 1. Add more models. By increasing the number and diversity of models, we will improve the overall quality of detection 15 | 2. Add more languages 16 | 3. 
Paraphrasing of AI texts 17 | 4. Make it resilient to tricks and attacks 18 | 5. Various types of text: differentiate articles/comments/posts/etc., in order to improve quality on each distinct type 19 | 6. Save all data that validator generates into cloud to make an open-source dataset in future 20 | 21 | ## Miners 22 | 23 | Generally speaking, improving miners is not our task. Miners should get better themselves. But there are a few things we can do to help them: 24 | 25 | 1. Host testnet validators so miners can start without wasting TAO. 26 | 2. Make leaderboard and local dataset: we will list miners' metrics and allow people who want to start mining to evaluate their solution on a local dataset to compare them with existing ones before going to the mainnet. 27 | 3. Create Kaggle competition to introduce some of the best ML engineers to our subnet and make them run their top solution on-chain. 28 | 4. Despite the fact that solving LLM detection is a miner's problem, we are going to continue our own researches in this field to improve baseline solution and increase overall subnet's quality. 29 | ## Applications 30 | 31 | One of the important tasks for us as subnet owners is to apply the subnet for real usage. Given the relevance of the problem, there is clearly a request for such solutions. That’s what we’re going to do: 32 | 33 | ### Web service 34 | We’ve already developed an MVP version of a website for our subnet, where you can write some texts and then get miners' predictions with probability of this text to be ai-generated. But we’re going to develop a full version of web service, which will provide users even outside bittensor community availability to detect ai-generated texts. 35 | 36 | ### Twitter extension 37 | Today, X/Twitter is among the top 6 social networking apps in the United States. And boasts over 500 million users worldwide. With the rapid growth of Large Language Models like ChatGpt and more and more content on the internet are generated by them. 38 | We’re going to build an extension for twitter, which will mark tweets and comments that you’re reading with ai-generated/human-written tags based on miners predictions from the subnet, so that people can know what content is qualitative and which texts are just auto-generated. 39 | 40 | ### Browser extension 41 | We also found it very useful to have an ability to instantly check whether some peace of text that you’re reading is ai-generated or human-written, so one of the application that we want to develop is a browser extension, with which users can just highlight some text and see a probability of this text to be ai-generated. 42 | 43 | ### API 44 | As mentioned above we’re going to develop several applications based on our subnet, but there are of course many more use cases for llm-detection in particular situations/businesses. So, we are also going to provide an API service that can be used by developers for their own integrations or for making predictions on a big amount of text (for example by AI engineers to clean up their datasets). 45 | 46 | ### Commerce 47 | All of the mentioned above services will have their own subscription plans to commercialize SN32. They will be based on api, which will be run by validators to provide access for miners and on which validators will be able to earn additional money. 48 | 49 | By commercializing our product, we will become less reliant on emissions and start gaining real usage. 
Also, by the time when dynamic tao is introduced and validators' emission becomes zero, our token will already have great utility, and validators will be earning from the mentioned services. 50 | -------------------------------------------------------------------------------- /docs/what_are_subnets.md: -------------------------------------------------------------------------------- 1 | # What is Bittensor? 2 | Bittensor is a network where computers validate the work that other computers contribute to the network - the work what is most valuable to the collective will be rewarded 3 | 4 | Bittensor is a catalyst to the open-source developers and smaller AI research labs now have a financial incentive for fine-tuning open foundational models 5 | 6 | Bittensor is a library of machine intelligence that continuously grows and shares knowledge amongst peers 7 | 8 | # What is a subnet? 9 | 10 | Bittensor is releasing its own language for creating incentive mechanisms. This allows developers to build incentive systems on Bittensor, tapping into our web of intelligence to develop markets of the developer’s choosings 11 | 12 | Subnet 1, an incentive system for machine intelligence production, showcases the enormous potential of markets to procure huge amounts of resources. Releasing user-created subnets is set to create a cambrian explosion of additional resources into the Bittensor ecosystem 13 | 14 | # Why should you care? 15 | 16 | As an open-source developer, you now have the ability to write your own incentive mechanisms without creating an entirely new chain. By tapping into Bittensor’s network of intelligence, you can incentivize AI models from all over the world to perform tasks of your choosing (i.e., image generation, storage, compute access, etc.) - the possibilities are truly endless 17 | 18 | The release of subnets also offers the potential to pull these tools into a shared network, making all the ingredients necessary to create intelligence available within one network, governed by one token 19 | 20 | You get to play a vital role in helping bootstrap what could one day become one of the most powerful networks in the world - and you make money by doing so! 21 | 22 | By incentivizing developers to create their own markets, Bittensor is set to become a one-stop-shop for those seeking all the compute requirements for building unstoppable applications on top of an incentivized infrastructure 23 | 24 | # Deeper dive 25 | Check out the Bittensor about page [here](https://bittensor.com/about) for more details about what the bittensor paradigm is and why subnets are revolutionary technology. 26 | 27 | Also see our [linktree](https://linktr.ee/opentensor) for more information. -------------------------------------------------------------------------------- /min_compute.yml: -------------------------------------------------------------------------------- 1 | # Use this document to specify the minimum compute requirements. 2 | # This document will be used to generate a list of recommended hardware for your subnet. 3 | 4 | # This is intended to give a rough estimate of the minimum requirements 5 | # so that the user can make an informed decision about whether or not 6 | # they want to run a miner or validator on their machine. 
7 | 8 | # NOTE: Specification for miners may be different from validators 9 | 10 | version: '1.0' # update this version key as needed, ideally should match your release version 11 | 12 | compute_spec: 13 | 14 | miner: 15 | 16 | cpu: 17 | min_cores: 4 # Minimum number of CPU cores 18 | min_speed: 2.5 # Minimum speed per core (GHz) 19 | recommended_cores: 8 # Recommended number of CPU cores 20 | recommended_speed: 3.5 # Recommended speed per core (GHz) 21 | architecture: "x86_64" # Architecture type (e.g., x86_64, arm64) 22 | 23 | gpu: 24 | required: True # Does the application require a GPU? 25 | min_vram: 24 # Minimum GPU VRAM (GB) 26 | recommended_vram: 24 # Recommended GPU VRAM (GB) 27 | cuda_cores: 1024 # Minimum number of CUDA cores (if applicable) 28 | min_compute_capability: 6.0 # Minimum CUDA compute capability 29 | recommended_compute_capability: 7.0 # Recommended CUDA compute capability 30 | recommended_gpu: "NVIDIA RTX 4090" # provide a recommended GPU to purchase/rent 31 | 32 | memory: 33 | min_ram: 24 # Minimum RAM (GB) 34 | min_swap: 4 # Minimum swap space (GB) 35 | recommended_swap: 8 # Recommended swap space (GB) 36 | ram_type: "DDR4" # RAM type (e.g., DDR4, DDR3, etc.) 37 | 38 | storage: 39 | min_space: 20 # Minimum free storage space (GB) 40 | recommended_space: 100 # Recommended free storage space (GB) 41 | type: "SSD" # Preferred storage type (e.g., SSD, HDD) 42 | min_iops: 1000 # Minimum I/O operations per second (if applicable) 43 | recommended_iops: 5000 # Recommended I/O operations per second 44 | 45 | os: 46 | name: "Ubuntu" # Name of the preferred operating system(s) 47 | version: 22.04 # Version of the preferred operating system(s) 48 | 49 | validator: 50 | 51 | cpu: 52 | min_cores: 4 # Minimum number of CPU cores 53 | min_speed: 2.5 # Minimum speed per core (GHz) 54 | recommended_cores: 8 # Recommended number of CPU cores 55 | recommended_speed: 3.5 # Recommended speed per core (GHz) 56 | architecture: "x86_64" # Architecture type (e.g., x86_64, arm64) 57 | 58 | gpu: 59 | required: True # Does the application require a GPU? 60 | min_vram: 80 # Minimum GPU VRAM (GB) 61 | recommended_vram: 80 # Recommended GPU VRAM (GB) 62 | cuda_cores: 1024 # Minimum number of CUDA cores (if applicable) 63 | min_compute_capability: 6.0 # Minimum CUDA compute capability 64 | recommended_compute_capability: 7.0 # Recommended CUDA compute capability 65 | recommended_gpu: "NVIDIA A100" # provide a recommended GPU to purchase/rent 66 | 67 | memory: 68 | min_ram: 80 # Minimum RAM (GB) 69 | min_swap: 4 # Minimum swap space (GB) 70 | recommended_swap: 8 # Recommended swap space (GB) 71 | ram_type: "DDR4" # RAM type (e.g., DDR4, DDR3, etc.) 
72 | 73 | storage: 74 | min_space: 1000 # Minimum free storage space (GB) 75 | recommended_space: 1000 # Recommended free storage space (GB) 76 | type: "SSD" # Preferred storage type (e.g., SSD, HDD) 77 | min_iops: 1000 # Minimum I/O operations per second (if applicable) 78 | recommended_iops: 5000 # Recommended I/O operations per second 79 | 80 | os: 81 | name: "Ubuntu" # Name of the preferred operating system(s) 82 | version: 22.04 # Version of the preferred operating system(s) 83 | 84 | network_spec: 85 | bandwidth: 86 | download: 100 # Minimum download bandwidth (Mbps) 87 | upload: 20 # Minimum upload bandwidth (Mbps) 88 | -------------------------------------------------------------------------------- /models/ppl_model.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/models/ppl_model.pk -------------------------------------------------------------------------------- /neurons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/neurons/__init__.py -------------------------------------------------------------------------------- /neurons/miners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/neurons/miners/__init__.py -------------------------------------------------------------------------------- /neurons/miners/deberta_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding 4 | from torch.utils.data import Dataset 5 | from tqdm import tqdm 6 | 7 | 8 | class SimpleTestDataset(Dataset): 9 | def __init__(self, strings, tokenizer, max_sequence_length): 10 | self.Strings = strings 11 | self.Tokenizer = tokenizer 12 | self.MaxSequenceLength = max_sequence_length 13 | 14 | def __len__(self): 15 | return len(self.Strings) 16 | 17 | def __getitem__(self, idx): 18 | string = self.Strings[idx].strip() 19 | token_ids = self.Tokenizer(string, max_length=self.MaxSequenceLength, truncation=True).input_ids 20 | 21 | return { 22 | 'input_ids': token_ids, 23 | } 24 | 25 | 26 | def GeneratePredictions(model, tokenizer, test_dataset, device): 27 | data_loader = torch.utils.data.DataLoader( 28 | test_dataset, 29 | batch_size=4, 30 | shuffle=False, 31 | num_workers=1, 32 | collate_fn=DataCollatorWithPadding(tokenizer)) 33 | 34 | all_predictions = [] 35 | with torch.no_grad(): 36 | for batch in data_loader: 37 | token_sequences = batch.input_ids.to(device) 38 | attention_masks = batch.attention_mask.to(device) 39 | 40 | with torch.cuda.amp.autocast(): 41 | raw_predictions = model(token_sequences, attention_masks).logits 42 | 43 | scaled_predictions = raw_predictions.softmax(dim = 1)[:,1] 44 | all_predictions.append(scaled_predictions.cpu().numpy()) 45 | 46 | all_predictions = np.concatenate(all_predictions) 47 | 48 | return all_predictions 49 | 50 | 51 | class DebertaClassifier: 52 | def __init__(self, foundation_model_path, model_path, device): 53 | self.tokenizer = AutoTokenizer.from_pretrained(foundation_model_path) 54 | self.max_length = 1024 55 | self.device = device 56 | 57 | model = 
AutoModelForSequenceClassification.from_pretrained( 58 | foundation_model_path, 59 | state_dict=torch.load(model_path), 60 | attention_probs_dropout_prob=0, 61 | hidden_dropout_prob=0).to(device) 62 | 63 | self.model = model.eval() 64 | 65 | def predict_batch(self, texts): 66 | test_dataset = SimpleTestDataset(texts, self.tokenizer, self.max_length) 67 | return GeneratePredictions(self.model, self.tokenizer, test_dataset, self.device) 68 | 69 | def __call__(self, text): 70 | return self.predict_batch([text])[0] 71 | -------------------------------------------------------------------------------- /neurons/miners/ppl_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from sklearn.linear_model import LogisticRegression 4 | from tqdm import tqdm 5 | import pickle 6 | import numpy as np 7 | import bittensor as bt 8 | 9 | import logging 10 | logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) 11 | 12 | 13 | class PPLModel: 14 | def __init__(self, device="cuda", model_id="microsoft/phi-2"): 15 | self.device = device 16 | self.model_id = model_id 17 | self.model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device) 18 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 19 | 20 | self.max_length = 512 #self.model.config.n_positions 21 | self.stride = 512 22 | self.logreg = LogisticRegression(class_weight='balanced') 23 | 24 | def __call__(self, text): 25 | ppl = self.getPPL(text) 26 | if ppl is None: 27 | # print('None ppl') 28 | bt.logging.info('Got none ppl on text: {}'.format(text)) 29 | return 0 30 | 31 | features = [(100 - ppl) / 100] 32 | return self.logreg.predict_proba([features])[0][1] 33 | 34 | def predict_batch(self, texts): 35 | preds = [] 36 | for text in texts: 37 | preds.append(self.__call__(text)) 38 | return preds 39 | 40 | def fit(self, X, y): 41 | features = [] 42 | mask = [] 43 | for text in tqdm(X): 44 | ppl = self.getPPL(text) 45 | ppl = (100 - ppl) / 100 if ppl is not None else None 46 | features.append(ppl) 47 | mask.append(ppl is not None) 48 | 49 | features = np.array(features) 50 | mask = np.array(mask) 51 | print("Number of not-none ppl: {}".format(mask.sum())) 52 | 53 | features = features[mask] 54 | y = y[mask] 55 | self.logreg.fit(features.reshape(-1, 1), y) 56 | 57 | def save(self, path): 58 | with open(path, 'wb') as f: 59 | pickle.dump(self.logreg, f) 60 | 61 | def load_pretrained(self, path): 62 | with open(path, 'rb') as f: 63 | self.logreg = pickle.load(f) 64 | 65 | def getPPL(self, text): 66 | text = '.'.join(text.split('.')[:30]) 67 | encodings = self.tokenizer(text, return_tensors="pt") 68 | seq_len = encodings.input_ids.size(1) 69 | 70 | nlls = [] 71 | likelihoods = [] 72 | prev_end_loc = 0 73 | for begin_loc in range(0, seq_len, self.stride): 74 | end_loc = min(begin_loc + self.max_length, seq_len) 75 | trg_len = end_loc - prev_end_loc 76 | input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device) 77 | target_ids = input_ids.clone() 78 | target_ids[:, :-trg_len] = -100 79 | 80 | with torch.no_grad(): 81 | loss = self.model(input_ids, labels=target_ids).loss 82 | neg_log_likelihood = loss * trg_len 83 | likelihoods.append(neg_log_likelihood) 84 | 85 | nlls.append(neg_log_likelihood) 86 | 87 | prev_end_loc = end_loc 88 | if end_loc == seq_len: 89 | break 90 | 91 | if torch.isnan(torch.Tensor(nlls)).any() or len(nlls) == 0: 92 | return None 
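# Each window above contributes loss * trg_len to nlls, so summing them and dividing by end_loc (the total number of tokens processed) gives the average negative log-likelihood per token; the perplexity computed below is exp() of that average, truncated to an int.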
93 | 94 | ppl = int(torch.exp(torch.stack(nlls).sum() / end_loc)) 95 | return ppl 96 | 97 | 98 | if __name__ == '__main__': 99 | model = PPLModel(device='cpu') 100 | model.load_pretrained('neurons/miners/ppl_model.pk') 101 | text = 'Hello world, i am here' 102 | res = model(text) 103 | print(res) 104 | -------------------------------------------------------------------------------- /prompting/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | # Define the version of the template module. 19 | __version__ = "1.1.3" 20 | version_split = __version__.split(".") 21 | __spec_version__ = ( 22 | (10000 * int(version_split[0])) 23 | + (100 * int(version_split[1])) 24 | + (1 * int(version_split[2])) 25 | ) 26 | 27 | # Import all submodules. 28 | from . import tasks 29 | from . import tools 30 | from . import agent 31 | from . import conversation 32 | from . import llm 33 | -------------------------------------------------------------------------------- /prompting/agent.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 
17 | import textwrap 18 | import time 19 | import bittensor as bt 20 | from dataclasses import asdict 21 | from prompting.tasks import Task 22 | from prompting.llm import HuggingFaceLLM 23 | from prompting.cleaners.cleaner import CleanerPipeline 24 | 25 | from prompting.persona import Persona, create_persona 26 | 27 | from transformers import Pipeline 28 | 29 | 30 | class HumanAgent(HuggingFaceLLM): 31 | "Agent that impersonates a human user and makes queries based on its goal." 32 | 33 | @property 34 | def progress(self): 35 | return int(self.task.complete) 36 | 37 | @property 38 | def finished(self): 39 | return self.progress == 1 40 | 41 | system_prompt_template = textwrap.dedent( 42 | """This is a roleplaying game where you are impersonating {mood} human user with a specific persona. As a human, you are using AI assistant to {desc} related to {topic} ({subtopic}) in a {tone} tone. You don't need to greet the assistant or be polite, unless this is part of your persona. The spelling and grammar of your messages should also reflect your persona. 43 | 44 | Your singular focus is to use the assistant to {goal}: {query} 45 | """ 46 | ) 47 | 48 | def __init__( 49 | self, 50 | task: Task, 51 | llm_pipeline: Pipeline, 52 | system_template: str = None, 53 | persona: Persona = None, 54 | begin_conversation=True, 55 | ): 56 | if persona is None: 57 | persona = create_persona() 58 | 59 | self.persona = persona 60 | self.task = task 61 | self.llm_pipeline = llm_pipeline 62 | 63 | if system_template is not None: 64 | self.system_prompt_template = system_template 65 | 66 | self.system_prompt = self.system_prompt_template.format( 67 | mood=self.persona.mood, 68 | tone=self.persona.tone, 69 | **self.task.__state_dict__(), 70 | ) 71 | 72 | super().__init__( 73 | llm_pipeline=llm_pipeline, 74 | system_prompt=self.system_prompt, 75 | max_new_tokens=256, 76 | ) 77 | 78 | if begin_conversation: 79 | bt.logging.debug("🤖 Generating challenge query...") 80 | # initiates the conversation with the miner 81 | self.challenge = self.create_challenge() 82 | 83 | def create_challenge(self) -> str: 84 | """Creates the opening question of the conversation which is based on the task query but dressed in the persona of the user.""" 85 | t0 = time.time() 86 | 87 | cleaner = None 88 | if hasattr(self.task, "cleaning_pipeline"): 89 | cleaner = CleanerPipeline(cleaning_pipeline=self.task.cleaning_pipeline) 90 | 91 | self.challenge = super().query( 92 | message="Ask a question related to your goal", cleaner=cleaner 93 | ) 94 | self.challenge = self.task.format_challenge(self.challenge) 95 | self.challenge_time = time.time() - t0 96 | 97 | return self.challenge 98 | 99 | def __state_dict__(self, full=False): 100 | return { 101 | "challenge": self.challenge, 102 | "challenge_time": self.challenge_time, 103 | **self.task.__state_dict__(full=full), 104 | **asdict(self.persona), 105 | "system_prompt": self.system_prompt, 106 | } 107 | 108 | def __str__(self): 109 | return self.system_prompt 110 | 111 | def __repr__(self): 112 | return str(self) 113 | 114 | def continue_conversation(self, miner_response: str): 115 | # Generates response to miner response 116 | self.query(miner_response) 117 | # Updates current prompt with new state of conversation 118 | # self.prompt = self.get_history_prompt() 119 | 120 | def update_progress( 121 | self, top_reward: float, top_response: str, continue_conversation=False 122 | ): 123 | if top_reward > self.task.reward_threshold: 124 | self.task.complete = True 125 | self.messages.append({"content": 
top_response, "role": "user"}) 126 | 127 | bt.logging.debug("Agent finished its goal") 128 | return 129 | 130 | if continue_conversation: 131 | bt.logging.debug( 132 | "↪ Agent did not finish its goal, continuing conversation..." 133 | ) 134 | self.continue_conversation(miner_response=top_response) 135 | -------------------------------------------------------------------------------- /prompting/cleaners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/prompting/cleaners/__init__.py -------------------------------------------------------------------------------- /prompting/cleaners/all_cleaners.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import bittensor as bt 3 | import re 4 | 5 | 6 | class BaseCleaner(ABC): 7 | @abstractmethod 8 | def __init__(self, **kwargs): 9 | pass 10 | 11 | @abstractmethod 12 | def apply(self, generation: str) -> str: 13 | pass 14 | 15 | 16 | class RemoveQuotes(BaseCleaner): 17 | def __init__(self, **kwargs) -> None: 18 | pass 19 | 20 | def apply(self, generation: str) -> str: 21 | bt.logging.debug("Pruning unfinished sentence.") 22 | return generation.strip("\"'") 23 | 24 | 25 | class PruneEnding(BaseCleaner): 26 | def __init__(self, **kwargs): 27 | pass 28 | 29 | def apply(self, generation: str) -> str: 30 | punctuation_chars = [".", "?", "!"] 31 | 32 | if not any(char in generation for char in punctuation_chars): 33 | return generation 34 | 35 | if ( 36 | not generation.endswith(".") 37 | and not generation.endswith("?") 38 | and not generation.endswith("!") 39 | ): 40 | index = max(generation.rfind(char) for char in punctuation_chars) 41 | return generation[ 42 | : index + 1 43 | ] # Go to the index of where the punctuation is, and include it (+1) 44 | else: 45 | return generation 46 | 47 | 48 | class RemoveRoles(BaseCleaner): 49 | def __init__(self, **kwargs): 50 | pass 51 | 52 | def capitalize_sentences(self, input_string): 53 | """capitalize the first character after .!?""" 54 | sentences = re.split(r"(?<=[.!?])\s+", input_string) 55 | capitalized_sentences = [sentence.capitalize() for sentence in sentences] 56 | result_string = " ".join(capitalized_sentences) 57 | return result_string 58 | 59 | def apply(self, generation: str) -> str: 60 | roles = [ 61 | "User: ", 62 | "System: ", 63 | "Assistant: ", 64 | "Assistant, ", 65 | "Dear AI, ", 66 | "Dear AI ", 67 | "#Question: ", 68 | ] 69 | for role in roles: 70 | if role in generation: 71 | generation = generation.replace(role, "") 72 | 73 | return self.capitalize_sentences( 74 | input_string=generation 75 | ) # LLMs are good at being formal. Do the same if we remove a prefix. 76 | -------------------------------------------------------------------------------- /prompting/cleaners/cleaner.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | import bittensor as bt 4 | 5 | from prompting.cleaners.all_cleaners import RemoveQuotes, RemoveRoles, PruneEnding 6 | 7 | SUPPORTED_CLEANERS = { 8 | "remove_quotes": RemoveQuotes, 9 | "remove_roles": RemoveRoles, 10 | "prune_ending": PruneEnding, 11 | } 12 | 13 | 14 | class CleanerPipeline: 15 | def __init__(self, cleaning_pipeline: List[Dict]) -> None: 16 | """CleanerPipeline is a pipeline that can be applied to any string to 17 | clean it of unwanted characters, punctuation, etc. 
18 | 19 | cleaning_pipeline (List[Dict]): List of Dicts that define the cleaning pipeline. 20 | Dictionaries MUST have the keyword "name" to be valid. 21 | Example: [{"name": "remove_quotes", "kwargs": {}}, {"name": "prune_ending", "kwargs": {}}] 22 | 23 | """ 24 | self.cleaning_pipeline = cleaning_pipeline 25 | 26 | def apply(self, generation: str) -> str: 27 | """Apply cleaning steps to generation listed in cleaning_pipeline. 28 | 29 | Args: 30 | generation (str): string generated from LLM or otherwise. 31 | Returns: 32 | str: Clean generated string. 33 | """ 34 | try: 35 | for cleaner in self.cleaning_pipeline: 36 | if "name" not in cleaner or cleaner["name"] not in SUPPORTED_CLEANERS: 37 | raise ValueError( 38 | f"Cleaning pipeline step {cleaner} must have a name, or must be in SUPPORTED_CLEANERS." 39 | ) 40 | 41 | func = SUPPORTED_CLEANERS[cleaner["name"]] 42 | 43 | kwargs = cleaner.get("kwargs", {}) 44 | func = func(**kwargs) # instantiate the cleaner with the kwargs 45 | 46 | # apply all the filters for the specific task. 47 | generation = func.apply(generation=generation) 48 | 49 | return generation 50 | 51 | except Exception as E: 52 | bt.logging.error( 53 | f"Failed to apply cleaning pipeline {cleaner['name']}. {E}," 54 | ) 55 | return generation 56 | -------------------------------------------------------------------------------- /prompting/conversation.py: -------------------------------------------------------------------------------- 1 | from prompting.tasks import ( 2 | Task, 3 | DebuggingTask, 4 | QuestionAnsweringTask, 5 | SummarizationTask, 6 | MathTask, 7 | DateQuestionAnsweringTask, 8 | ) 9 | from prompting.tools import ( 10 | WikiDataset, 11 | HFCodingDataset, 12 | MathDataset, 13 | WikiDateDataset, 14 | ) 15 | 16 | from transformers import Pipeline 17 | 18 | 19 | def create_task(llm_pipeline: Pipeline, task_name: str) -> Task: 20 | wiki_based_tasks = ["summarization", "qa"] 21 | coding_based_tasks = ["debugging"] 22 | # TODO Add math and date_qa to this structure 23 | 24 | # TODO: Abstract dataset classes into common dynamic interface 25 | if task_name in wiki_based_tasks: 26 | dataset = WikiDataset() 27 | 28 | elif task_name in coding_based_tasks: 29 | dataset = HFCodingDataset() 30 | 31 | elif task_name == "math": 32 | dataset = MathDataset() 33 | 34 | elif task_name == "date_qa": 35 | dataset = WikiDateDataset() 36 | 37 | if task_name == "summarization": 38 | task = SummarizationTask(llm_pipeline=llm_pipeline, context=dataset.next()) 39 | 40 | elif task_name == "qa": 41 | task = QuestionAnsweringTask(llm_pipeline=llm_pipeline, context=dataset.next()) 42 | 43 | elif task_name == "debugging": 44 | task = DebuggingTask(llm_pipeline=llm_pipeline, context=dataset.next()) 45 | 46 | elif task_name == "math": 47 | task = MathTask(llm_pipeline=llm_pipeline, context=dataset.next()) 48 | 49 | elif task_name == "date_qa": 50 | task = DateQuestionAnsweringTask( 51 | llm_pipeline=llm_pipeline, context=dataset.next() 52 | ) 53 | 54 | else: 55 | raise ValueError(f"Task {task_name} not supported. 
Please choose a valid task") 56 | 57 | return task 58 | -------------------------------------------------------------------------------- /prompting/llm.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import time 19 | 20 | from typing import List, Dict 21 | import bittensor as bt 22 | 23 | from transformers import Pipeline, pipeline 24 | from prompting.mock import MockPipeline 25 | 26 | from prompting.cleaners.cleaner import CleanerPipeline 27 | 28 | 29 | def load_pipeline( 30 | model_id, device=None, torch_dtype=None, mock=False, model_kwargs: dict = None 31 | ): 32 | """Loads the HuggingFace pipeline for the LLM, or a mock pipeline if mock=True""" 33 | 34 | if mock or model_id == "mock": 35 | return MockPipeline(model_id) 36 | 37 | if not device.startswith("cuda"): 38 | bt.logging.warning("Only crazy people run this on CPU. 
It is not recommended.") 39 | 40 | # model_kwargs torch type definition conflicts with pipeline torch_dtype, so we need to differentiate them 41 | if model_kwargs is None: 42 | llm_pipeline = pipeline( 43 | "text-generation", 44 | model=model_id, 45 | device=device, 46 | torch_dtype=torch_dtype, 47 | ) 48 | else: 49 | llm_pipeline = pipeline( 50 | "text-generation", 51 | model=model_id, 52 | device_map=device, 53 | model_kwargs=model_kwargs, 54 | ) 55 | 56 | return llm_pipeline 57 | 58 | 59 | class HuggingFaceLLM: 60 | def __init__( 61 | self, 62 | llm_pipeline: Pipeline, 63 | system_prompt, 64 | max_new_tokens=256, 65 | do_sample=True, 66 | temperature=0.7, 67 | top_k=50, 68 | top_p=0.95, 69 | ): 70 | self.llm_pipeline = llm_pipeline 71 | self.system_prompt = system_prompt 72 | self.kwargs = dict( 73 | do_sample=do_sample, 74 | temperature=temperature, 75 | top_k=top_k, 76 | top_p=top_p, 77 | max_new_tokens=max_new_tokens, 78 | ) 79 | 80 | self.messages = [{"content": self.system_prompt, "role": "system"}] 81 | self.times = [0] 82 | 83 | def query( 84 | self, 85 | message: List[Dict[str, str]], 86 | role: str = "user", 87 | disregard_system_prompt: bool = False, 88 | cleaner: CleanerPipeline = None, 89 | ): 90 | messages = self.messages + [{"content": message, "role": role}] 91 | 92 | if disregard_system_prompt: 93 | messages = messages[1:] 94 | 95 | tbeg = time.time() 96 | response = self.forward(messages=messages) 97 | 98 | if cleaner is not None: 99 | clean_response = cleaner.apply(generation=response) 100 | if clean_response != response: 101 | bt.logging.debug( 102 | f"Response cleaned, chars removed: {len(response) - len(clean_response)}..." 103 | ) 104 | response = clean_response 105 | 106 | self.messages = messages + [{"content": response, "role": "assistant"}] 107 | self.times = self.times + [0, time.time() - tbeg] 108 | 109 | return response 110 | 111 | def __call__(self, messages: List[Dict[str, str]]): 112 | return self.forward(messages=messages) 113 | 114 | def _make_prompt(self, messages: List[Dict[str, str]]): 115 | return self.llm_pipeline.tokenizer.apply_chat_template( 116 | messages, tokenize=False, add_generation_prompt=True 117 | ) 118 | 119 | def forward(self, messages: List[Dict[str, str]], preformat_messages: bool = False): 120 | prompt = self._make_prompt(messages) 121 | outputs = self.llm_pipeline(prompt, **self.kwargs) 122 | response = outputs[0]["generated_text"] 123 | 124 | response = response.replace(prompt, "").strip() 125 | 126 | bt.logging.debug( 127 | f"{self.__class__.__name__} generated the following output:\n{response}" 128 | ) 129 | return response 130 | -------------------------------------------------------------------------------- /prompting/mock.py: -------------------------------------------------------------------------------- 1 | import time 2 | import uuid 3 | import torch 4 | import asyncio 5 | import random 6 | import bittensor as bt 7 | 8 | from typing import List 9 | 10 | 11 | class MockTokenizer: 12 | def __init__(self): 13 | super().__init__() 14 | 15 | self.role_expr = "<|mock-{role}|>" 16 | 17 | def apply_chat_template(self, messages, **kwargs): 18 | prompt = "" 19 | for m in messages: 20 | role = self.role_expr.format(role=m["role"]) 21 | content = m["content"] 22 | prompt += f"<|mock-{role}|> {content}\n" 23 | 24 | return "\n".join(prompt) 25 | 26 | 27 | class MockModel(torch.nn.Module): 28 | def __init__(self, phrase): 29 | super().__init__() 30 | 31 | self.tokenizer = MockTokenizer() 32 | self.phrase = phrase 33 | 34 | def 
__call__(self, messages): 35 | return self.forward(messages) 36 | 37 | def forward(self, messages): 38 | role_tag = self.tokenizer.role_expr.format(role="assistant") 39 | return f"{role_tag} {self.phrase}" 40 | 41 | 42 | class MockPipeline: 43 | @property 44 | def tokenizer(self): 45 | return self.model.tokenizer 46 | 47 | def __init__( 48 | self, 49 | phrase="Mock llm output", 50 | model_kwargs=None, 51 | ): 52 | super().__init__() 53 | 54 | self.model_kwargs = model_kwargs or {} 55 | self.model = MockModel(phrase) 56 | 57 | def __repr__(self): 58 | return f"{self.__class__.__name__}(phrase={self.model.phrase})" 59 | 60 | def __call__(self, messages, **kwargs): 61 | return self.forward(messages, **kwargs) 62 | 63 | def forward(self, messages, **kwargs): 64 | output = self.model(messages) 65 | return self.postprocess(output) 66 | 67 | def postprocess(self, output, **kwargs): 68 | output = output.split(self.model.tokenizer.role_expr.format(role="assistant"))[ 69 | -1 70 | ].strip() 71 | return [{"generated_text": output}] 72 | 73 | def preprocess(self, **kwargs): 74 | pass 75 | 76 | 77 | class MockSubtensor(bt.MockSubtensor): 78 | def __init__(self, netuid, n=16, wallet=None): 79 | 80 | super().__init__() 81 | # reset the underlying subtensor state 82 | self.chain_state = None 83 | self.setup() 84 | 85 | if not self.subnet_exists(netuid): 86 | self.create_subnet(netuid) 87 | 88 | # Register ourself (the validator) as a neuron at uid=0 89 | if wallet is not None: 90 | self.force_register_neuron( 91 | netuid=netuid, 92 | hotkey=wallet.hotkey.ss58_address, 93 | coldkey=wallet.coldkey.ss58_address, 94 | balance=100000, 95 | stake=100000, 96 | ) 97 | 98 | # Register n mock neurons who will be miners 99 | for i in range(1, n + 1): 100 | self.force_register_neuron( 101 | netuid=netuid, 102 | hotkey=f"miner-hotkey-{i}", 103 | coldkey="mock-coldkey", 104 | balance=100000, 105 | stake=100000, 106 | ) 107 | 108 | 109 | class MockMetagraph(bt.metagraph): 110 | 111 | default_ip = "127.0.0.0" 112 | default_port = 8091 113 | 114 | def __init__(self, netuid=1, network="mock", subtensor=None): 115 | super().__init__(netuid=netuid, network=network, sync=False) 116 | 117 | if subtensor is not None: 118 | self.subtensor = subtensor 119 | self.sync(subtensor=subtensor) 120 | 121 | for axon in self.axons: 122 | axon.ip = self.default_ip 123 | axon.port = self.default_port 124 | 125 | 126 | class MockDendrite(bt.dendrite): 127 | """ 128 | Replaces a real bittensor network request with a mock request that just returns some static completion for all axons that are passed and adds some random delay. 
129 | """ 130 | 131 | min_time: float = 0 132 | max_time: float = 1 133 | 134 | def __init__(self, wallet): 135 | super().__init__(wallet) 136 | 137 | async def forward( 138 | self, 139 | axons: List[bt.axon], 140 | synapse: bt.Synapse = bt.Synapse(), 141 | timeout: float = 12, 142 | deserialize: bool = True, 143 | run_async: bool = True, 144 | streaming: bool = False, 145 | ): 146 | 147 | if streaming: 148 | raise NotImplementedError("Streaming not implemented yet.") 149 | 150 | async def query_all_axons(streaming: bool): 151 | """Queries all axons for responses.""" 152 | 153 | async def single_axon_response(i, axon): 154 | """Queries a single axon for a response.""" 155 | 156 | t0 = time.time() 157 | s = synapse.copy() 158 | # Attach some more required data so it looks real 159 | s = self.preprocess_synapse_for_request(axon, s, timeout) 160 | # We just want to mock the response, so we'll just fill in some data 161 | process_time = ( 162 | random.random() * (self.max_time - self.min_time) + self.min_time 163 | ) 164 | await asyncio.sleep(process_time) 165 | if process_time < timeout: 166 | # Update the status code and status message of the dendrite to match the axon 167 | s.completion = f"Mock miner completion {i}" 168 | s.dendrite.status_code = 200 169 | s.dendrite.status_message = "OK" 170 | else: 171 | s.completion = "" 172 | s.dendrite.status_code = 408 173 | s.dendrite.status_message = "Timeout" 174 | 175 | s.dendrite.process_time = str(time.time() - t0) 176 | 177 | # Return the updated synapse object after deserializing if requested 178 | if deserialize: 179 | return s.deserialize() 180 | else: 181 | return s 182 | 183 | return await asyncio.gather( 184 | *( 185 | single_axon_response(i, target_axon) 186 | for i, target_axon in enumerate(axons) 187 | ) 188 | ) 189 | 190 | return await query_all_axons(streaming) 191 | 192 | def __str__(self) -> str: 193 | """ 194 | Returns a string representation of the Dendrite object. 195 | 196 | Returns: 197 | str: The string representation of the Dendrite object in the format "dendrite()". 198 | """ 199 | return "MockDendrite({})".format(self.keypair.ss58_address) 200 | -------------------------------------------------------------------------------- /prompting/persona.py: -------------------------------------------------------------------------------- 1 | import random 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class Persona: 7 | profile: str 8 | mood: str 9 | tone: str 10 | 11 | 12 | def create_persona() -> Persona: 13 | """Defines the persona of the user. This is used to create the system prompt. It dictates the style of the agent's questions and communication.""" 14 | profiles = [ 15 | "student", 16 | "teacher", 17 | "parent", 18 | "hacker", 19 | "programmer", 20 | "scientist", 21 | ] 22 | # profiles = ["16 year old highschool student", ... 23 | 24 | # TODO: more terse, less verbose 25 | mood = [ 26 | "an interested", 27 | "a concerned", 28 | "an impatient", 29 | "a tired", 30 | "a confused", 31 | "an annoyed", 32 | "a curious", 33 | "an upbeat", 34 | "a lazy", 35 | ] 36 | tone = [ 37 | "formal", 38 | "informal", 39 | "indifferent", 40 | "casual", 41 | "rushed", 42 | "polite", 43 | "impolite", 44 | "friendly", 45 | "unfriendly", 46 | "positive", 47 | "negative", 48 | ] 49 | # TODO: we can lower case the human messages, add common grammar and spelling mistakes... 
50 | 51 | return Persona( 52 | profile=random.choice(profiles), 53 | mood=random.choice(mood), 54 | tone=random.choice(tone), 55 | ) 56 | -------------------------------------------------------------------------------- /prompting/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .task import Task 2 | from .debugging import DebuggingTask 3 | from .summarization import SummarizationTask 4 | from .qa import QuestionAnsweringTask 5 | from .date_qa import DateQuestionAnsweringTask 6 | from .generic_instruction import GenericInstructionTask 7 | from .math import MathTask 8 | 9 | 10 | TASKS = { 11 | "qa": QuestionAnsweringTask, 12 | "summarization": SummarizationTask, 13 | "date_qa": DateQuestionAnsweringTask, 14 | "debugging": DebuggingTask, 15 | "math": MathTask, 16 | } 17 | -------------------------------------------------------------------------------- /prompting/tasks/date_qa.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from prompting.tasks import Task 3 | from prompting.cleaners.cleaner import CleanerPipeline 4 | 5 | 6 | SECTION_MESSAGES = {"Births": " was born ", "Deaths": " died ", "Events": " "} 7 | 8 | 9 | @dataclass 10 | class DateQuestionAnsweringTask(Task): 11 | 12 | name = "date-based question answering" 13 | desc = "get help answering a specific date-based question" 14 | goal = "to get the answer to the following date-based question" 15 | reward_definition = [ 16 | dict(name="date", weight=1.0), 17 | ] 18 | penalty_definition = [] 19 | cleaning_pipeline = [ 20 | dict(name="remove_quotes"), 21 | dict(name="remove_roles"), 22 | ] 23 | static_reference = True 24 | static_query = True 25 | 26 | def __init__(self, llm_pipeline, context, create_reference=True): 27 | 28 | self.context = context 29 | 30 | self.query = ( 31 | context.content + SECTION_MESSAGES[context.topic] + "on what exact date?" 32 | ) 33 | self.reference = self.context.title.replace("_", " ") + ", " + context.subtopic 34 | 35 | self.topic = context.title 36 | self.subtopic = context.topic 37 | self.tags = context.tags 38 | -------------------------------------------------------------------------------- /prompting/tasks/debugging.py: -------------------------------------------------------------------------------- 1 | import random 2 | import bittensor as bt 3 | from dataclasses import dataclass 4 | from prompting.tasks import Task 5 | import difflib 6 | 7 | 8 | def corrupt( 9 | code, 10 | n_remove=0, 11 | n_swap=0, 12 | seed=None, 13 | sep=" ", 14 | min_length=1, 15 | max_length=10, 16 | remove_comment_lines=False, 17 | ): 18 | """ 19 | Corrupt a piece of code by removing and/or swapping chunks of it. 20 | TODO: Ignore comments and strings(?) when corrupting the code. 21 | 22 | Args: 23 | code (str): The code to corrupt. 24 | n_remove (int): The number of chunks to remove. 25 | n_swap (int): The number of chunks to swap. 26 | seed (int): The random seed to use. 27 | sep (str): The separator to use when splitting the code into chunks. Recommended values are '', ' ', '\n'. 28 | min_length (int): The minimum length of a chunk. 29 | max_length (int): The maximum length of a chunk. 30 | """ 31 | 32 | # set seed for reproducibility 33 | random.seed(seed) 34 | 35 | assert n_remove + n_swap > 0, "Must specify at least one corruption type." 36 | 37 | def remove(code, n, sep=" ", min_length=1, max_length=10): 38 | """Remove n random chunks from the code. 
Chunks can be characters, words, or lines.""" 39 | 40 | chunks = code.split(sep) if sep else list(code) 41 | 42 | # select n random chunks to remove 43 | indices = random.sample( 44 | [ 45 | i 46 | for i, chunk in enumerate(chunks) 47 | if min_length <= len(chunk) <= max_length 48 | ], 49 | n, 50 | ) 51 | bt.logging.info( 52 | f"Removing the following {len(indices)} chunks: {[chunks[i] for i in indices]} at indices {indices}" 53 | ) 54 | 55 | return sep.join([chunk for i, chunk in enumerate(chunks) if i not in indices]) 56 | 57 | def swap(code, sep=" ", min_length=1, max_length=10): 58 | """Swap two random chunks in the code. Chunks can be characters, words, or lines.""" 59 | chunks = code.split(sep) if sep else list(code) 60 | 61 | # select 2 random chunks to swap 62 | indices = random.sample( 63 | [ 64 | i 65 | for i, chunk in enumerate(chunks) 66 | if min_length <= len(chunk) <= max_length 67 | ], 68 | 2, 69 | ) 70 | 71 | bt.logging.info( 72 | f"Swapping chunk {chunks[indices[0]]!r} at index {indices[0]} with chunk {chunks[indices[1]]!r} at index {indices[1]}" 73 | ) 74 | 75 | chunks[indices[0]], chunks[indices[1]] = ( 76 | chunks[indices[1]], 77 | chunks[indices[0]], 78 | ) 79 | 80 | return sep.join(chunks) 81 | 82 | # Do this at your peril. It doesn't catch multiline comments or strings. 83 | if remove_comment_lines: 84 | code = "\n".join( 85 | [ 86 | line 87 | for line in code.splitlines() 88 | if not line.strip().startswith(("#", "//")) 89 | ] 90 | ) 91 | 92 | # spread n corruptions across the code 93 | for i in range(n_remove): 94 | code = remove(code, n=1, sep=sep, min_length=min_length, max_length=max_length) 95 | for i in range(n_swap): 96 | code = swap(code, sep=sep, min_length=min_length, max_length=max_length) 97 | 98 | return code 99 | 100 | 101 | def diff(query, reference): 102 | """Get the diff between two strings.""" 103 | return "\n".join(difflib.unified_diff(query.splitlines(), reference.splitlines())) 104 | 105 | 106 | @dataclass 107 | class DebuggingTask(Task): 108 | 109 | name = "debugging" 110 | desc = "get help with debugging" 111 | goal = "ask for help fixing broken code."
112 | 113 | reward_definition = [dict(name="diff", weight=1.0)] 114 | 115 | penalty_definition = [] 116 | 117 | static_reference = True 118 | static_query = True 119 | 120 | def __init__(self, llm_pipeline, context, create_reference=True): 121 | 122 | self.context = context 123 | 124 | # No LLM involved in generating the query, we just apply some language-independent corruption to the code 125 | self.query = corrupt( 126 | context.content, 127 | n_remove=random.randint(1, 3), 128 | n_swap=random.randint(0, 2), 129 | sep=random.choices(["", " ", "\n"], weights=[0.3, 0.6, 0.1], k=1)[0], 130 | ) 131 | self.reference = context.content 132 | self.delimiter = "```" 133 | self.topic = context.title 134 | self.subtopic = context.subtopic 135 | self.tags = context.tags 136 | 137 | def format_challenge(self, challenge): 138 | return f"{challenge}\n{self.delimiter}\n{self.query}\n{self.delimiter}" 139 | -------------------------------------------------------------------------------- /prompting/tasks/generic_instruction.py: -------------------------------------------------------------------------------- 1 | import re 2 | import bittensor as bt 3 | from dataclasses import dataclass 4 | from tenacity import retry, stop_after_attempt 5 | from prompting.tasks import Task 6 | from typing import Tuple 7 | 8 | CRITERIA_GENERATION_PROMPT = """\ 9 | We are brainstorming criteria with which to grade a language model on its responses in 10 | diverse situations. 11 | A ‘criteria‘ is some useful, real-world objective, and associated rubric for scores 1-5, that 12 | tests a capability. 13 | 14 | Please brainstorm a new criteria and scoring rubrics. 15 | Be creative and create new but useful criteria that people in different settings or industries 16 | might find practical. 17 | Please format the output as same as the above examples with no extra or surrounding text. 18 | Write [END] after you are done. 19 | New Criteria: 20 | """ 21 | 22 | 23 | INSTRUCTION_GENERATION_PROMPT = """\ 24 | Your job is to generate a new novel problem and a response that is related to the given score 25 | rubric. 26 | The score rubric: 27 | {CRITERIA} 28 | * Problem 29 | - The problem should inherently be related to the score criteria and score rubric given above. 30 | Specifically, the score criteria should be the core attributes required to solve the problem. 31 | - The problem itself should not be too generic or easy to solve. 32 | - If the score rubric is related to logical abilities, generate problems that require math or 33 | coding abilities. 34 | - Try to make the person who might solve the problem not notice the existence of the score 35 | rubric by not explicitly mentioning it, and also provide additional inputs and options if 36 | needed. 37 | - Assume a situation where a user is interacting with an AI model. The user would try to 38 | ask in a first-person point of view, but not using terms like ”I”, ”A User” or ”You” in the 39 | first sentence. 40 | - Do not give a role to the AI, assume that the user is asking a question from his point of 41 | view. 42 | - Do not include any phrase related to AI model in the problem. 43 | * Response 44 | - The response should be a response that would get a score of 5 from the score rubric. 45 | - The response should be as detailed as possible unless the score rubric is related to 46 | conciseness or brevity. It should consist of multiple paragraphs, a list of items, or a 47 | step-by-step reasoning process. 
48 | - The response should look like how a well-prompted GPT-4 would normally answer your 49 | problem. 50 | * Format 51 | - DO NOT WRITE ANY GREETING MESSAGES, just write the problem and response 52 | only. 53 | - In front of the problem, append the phrase ”Problem:” and in front of the response, append 54 | the phrase ”Response:”. 55 | - Write in the order of ”Problem” - ”Response”, where the two items are separated by the 56 | phrase ”[NEXT]”. 57 | - Write [END] after you are done. 58 | Data Generation: 59 | """ 60 | 61 | 62 | @dataclass 63 | class GenericInstructionTask(Task): 64 | reward_definition = [ 65 | dict(name="rouge", ngram="rouge-1", metric="f", weight=1.0), 66 | dict(name="relevance", threshold=None, weight=1.0), 67 | ] 68 | 69 | def __init__(self, llm_pipeline): 70 | super().__init__( 71 | name="generic_instruction", 72 | goal="to get the answer to a instruction", 73 | delimiter="```", 74 | reward_types=[ 75 | "CRITERIA_REWARD", 76 | ], 77 | reward_threshold=0.5, 78 | use_challenge_as_prompt=True, 79 | desc="", 80 | topics={}, 81 | topic="", 82 | subtopic="", 83 | challenge="", 84 | reference="", 85 | criteria="", 86 | ) 87 | 88 | self.criteria = self.create_criteria(llm_pipeline) 89 | instruction, reference = self.create_instruction_and_reference(llm_pipeline) 90 | self.challenge = instruction 91 | self.reference = reference 92 | 93 | def extract_instruction_and_reference_from_text(self, text: str) -> Tuple[str, str]: 94 | # Split the text into problem and response using regular expression 95 | split_text = re.split(r"\nResponse:\n", text) 96 | 97 | # Extract problem and response 98 | problem = split_text[0].strip() 99 | response = split_text[1].strip() 100 | 101 | return problem, response 102 | 103 | def create_criteria(self, llm) -> str: 104 | bt.logging.debug("🎲 Creating a generic criteria-scoring rubric ...") 105 | 106 | # Generate a score rubric with defined criterias 107 | criteria_generation_response = llm(CRITERIA_GENERATION_PROMPT) 108 | return criteria_generation_response 109 | 110 | @retry(stop=stop_after_attempt(5)) 111 | def create_instruction_and_reference(self, llm) -> Tuple[str, str]: 112 | try: 113 | bt.logging.debug("📋 🎯 Creating instruction and referece text...") 114 | 115 | if not self.criteria: 116 | raise ValueError( 117 | "Criteria must be defined before creating a generic instruction." 118 | ) 119 | 120 | # Create generic instruction with the score rubric 121 | instruction_generation_prompt_with_criteria = ( 122 | INSTRUCTION_GENERATION_PROMPT.format(CRITERIA=self.criteria) 123 | ) 124 | instruction_generation_response = llm( 125 | instruction_generation_prompt_with_criteria 126 | ) 127 | 128 | # Extract generic instruction and reference response from the generated text 129 | ( 130 | instruction, 131 | reference, 132 | ) = self.extract_instruction_and_reference_from_text( 133 | instruction_generation_response 134 | ) 135 | 136 | return instruction, reference 137 | except Exception as e: 138 | bt.logging.error( 139 | f"Failed to create instruction and reference text: {e}, retrying..." 
140 | ) 141 | raise e 142 | -------------------------------------------------------------------------------- /prompting/tasks/math.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import bittensor as bt 3 | from dataclasses import dataclass 4 | from prompting.tasks import Task 5 | 6 | 7 | @dataclass 8 | class MathTask(Task): 9 | 10 | name = "math" 11 | desc = "get help solving a math problem" 12 | goal = "to get the answer to the following math question" 13 | 14 | reward_definition = [ 15 | dict(name="float_diff", weight=1.0), 16 | ] 17 | penalty_definition = [] 18 | 19 | static_reference = True 20 | static_query = True 21 | 22 | def __init__(self, llm_pipeline, context, create_reference=True): 23 | 24 | self.context = context 25 | 26 | self.query = ( 27 | "How can I solve the following problem, " 28 | + context.content 29 | + "? Make sure to include the whole problem when you ask your question." 30 | ) 31 | self.reference = context.extra["solution"] 32 | self.topic = context.title 33 | self.subtopic = context.topic 34 | self.tags = context.tags 35 | -------------------------------------------------------------------------------- /prompting/tasks/qa.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from prompting.tasks import Task 3 | 4 | # TODO: introduce criteria for the query and reference answer (length, layout, etc.) and make these arguments 5 | # TODO 6 | 7 | # Used to instruct the LLM to provide a good query when given a context 8 | QUERY_SYSTEM_PROMPT = """\ 9 | You are a question-generating expert, focusing on delivering comprehensive and accurate questions with depth and clarity. The questions you generate should be based on the context that is provided. 10 | You will maintain a neutral tone in your questions. 11 | You will adhere to a word limit of 50 words for each question. 12 | """ 13 | 14 | # Used to obtain the query (which is a question about the context) 15 | QUERY_PROMPT_TEMPLATE = """\ 16 | Ask a specific question about the following context: 17 | 18 | #Context: 19 | {context} 20 | """ 21 | 22 | # Used to instruct the LLM to provide a good answer to the query when given a context 23 | REFERENCE_SYSTEM_PROMPT = """\ 24 | You are a question-answering expert, focusing on delivering comprehensive and accurate responses with depth and clarity. 25 | You will maintain a neutral tone in your explanations. 26 | You will adhere to a word limit of 150 words for each response. Where applicable, include references to credible sources to support your answers. 27 | """ 28 | 29 | # Used to obtain reference answer 30 | REFERENCE_PROMPT_TEMPLATE = """\ 31 | Answer the question you will receive in detail, utilizing the following context. 
32 | 33 | #Context: 34 | {context} 35 | 36 | # Question: 37 | {question} 38 | """ 39 | 40 | 41 | @dataclass 42 | class QuestionAnsweringTask(Task): 43 | 44 | name = "question-answering" 45 | desc = "get help on answering a question" 46 | goal = "to get the answer to the following question" 47 | 48 | reward_definition = [ 49 | dict(name="rouge", ngram="rouge-1", metric="f", weight=0.5), 50 | dict(name="relevance", weight=0.5), 51 | ] 52 | penalty_definition = [ 53 | dict(name="rouge", ngram="rouge-1", metric="f", weight=0.5), 54 | ] 55 | 56 | cleaning_pipeline = [ 57 | dict(name="remove_quotes"), 58 | dict(name="prune_ending"), 59 | dict(name="remove_roles"), 60 | ] 61 | 62 | def __init__(self, llm_pipeline, context, create_reference=True): 63 | 64 | self.context = context 65 | 66 | self.query_system_prompt = QUERY_SYSTEM_PROMPT 67 | self.query_prompt = QUERY_PROMPT_TEMPLATE.format(context=context.content) 68 | self.query = self.generate_query(llm_pipeline) 69 | 70 | self.reference_system_prompt = REFERENCE_SYSTEM_PROMPT 71 | self.reference_prompt = REFERENCE_PROMPT_TEMPLATE.format( 72 | context=context.content, question=self.query 73 | ) 74 | if create_reference: 75 | self.reference = self.generate_reference(llm_pipeline) 76 | 77 | self.topic = context.title 78 | self.subtopic = context.topic 79 | self.tags = context.tags 80 | -------------------------------------------------------------------------------- /prompting/tasks/summarization.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from prompting.tasks import Task 3 | from transformers import Pipeline 4 | 5 | 6 | # TODO: introduce criteria for the query and reference answer (length, layout, etc.) and make these arguments 7 | 8 | # TODO: Also add a query system prompt and a query prompt template 9 | # TODO: Add the option to generate the summary query from the context. e.g. "the childhood of Abraham Lincoln" which is more specific than summarizing the entire article (Abraham Lincoln) 10 | 11 | # Used to instruct the LLM to provide a good answer to the query when given a context 12 | SUMMARIZATION_SYSTEM_PROMPT = """\ 13 | You are a summarization AI assistant. You make excellent and concise summaries that adhere to your given instructions. 14 | You will maintain a neutral tone in your summaries. 15 | You will adhere to a word limit of 250 words for each response. 16 | """ 17 | 18 | # Used to obtain reference answer 19 | REFERENCE_PROMPT_TEMPLATE = """\ 20 | Summarize the following context in a concise and accurate manner: 21 | 22 | ## Context 23 | {context} 24 | """ 25 | 26 | 27 | @dataclass 28 | class SummarizationTask(Task): 29 | 30 | name = "summarization" 31 | desc = "get help with summarization" 32 | goal = "summarize the following topic" 33 | 34 | reward_definition = [ 35 | dict(name="rouge", ngram="rouge-l", metric="f", weight=0.5), 36 | dict(name="relevance", weight=0.5), 37 | ] 38 | penalty_definition = [dict(name="rouge", ngram="rouge-1", metric="f", weight=0.5)] 39 | 40 | # This is where you define cleaning procedures for the generation. 41 | # Can be used when wanting to clean the challenge. 
42 | cleaning_pipeline = [ 43 | dict(name="remove_quotes"), 44 | dict(name="prune_ending"), 45 | dict(name="remove_roles"), 46 | ] 47 | 48 | static_query = True 49 | 50 | def __init__(self, llm_pipeline: Pipeline, context: str, create_reference=True): 51 | 52 | self.context = context 53 | 54 | # Query is just the article title and section name 55 | self.query = context.title + ", " + context.topic 56 | 57 | self.reference_system_prompt = SUMMARIZATION_SYSTEM_PROMPT 58 | self.reference_prompt = REFERENCE_PROMPT_TEMPLATE.format( 59 | context=context.content 60 | ) 61 | if create_reference: 62 | self.reference = self.generate_reference(llm_pipeline) 63 | 64 | self.topic = context.title 65 | self.subtopic = context.topic 66 | self.tags = context.tags 67 | -------------------------------------------------------------------------------- /prompting/tasks/task.py: -------------------------------------------------------------------------------- 1 | import time 2 | import bittensor as bt 3 | from abc import ABC 4 | from dataclasses import dataclass, asdict 5 | from enum import Enum 6 | from typing import List, Union, Dict 7 | from prompting.llm import HuggingFaceLLM 8 | from transformers import Pipeline 9 | from prompting.cleaners.cleaner import CleanerPipeline 10 | 11 | 12 | class TaskEvaluationType(Enum): 13 | REWARD_STACK = "reward" 14 | FILTER_STACK = "filter" 15 | PENALTY_STACK = "penalty" 16 | SIMILARITY_STACK = "similarity" 17 | RELEVANCE_STACK = "relevance" 18 | 19 | 20 | @dataclass 21 | class Task(ABC): 22 | # topics: dict 23 | name: str 24 | desc: str 25 | goal: str 26 | query: str 27 | topic: str 28 | subtopic: str 29 | tags: List[str] 30 | context: dict 31 | reward_definition: List[dict] 32 | penalty_definition: List[dict] = None 33 | reward_threshold: float = 0.0 34 | reference: Union[str, List[str]] = "" 35 | criteria: str = ("",) 36 | delimiter: str = "" 37 | complete: bool = False 38 | static_reference: bool = False 39 | static_query: bool = False 40 | reference_system_prompt = "" 41 | reference_prompt = "" 42 | query_system_prompt = "" 43 | query_prompt = "" 44 | cleaner = None 45 | 46 | def __str__(self): 47 | return f"{self.__class__.__name__}(name={self.name!r}, desc={self.desc!r}, goal={self.goal!r}, query={self.query!r}, reference={self.reference!r}, topic={self.topic!r}, subtopic={self.subtopic!r}, tags={self.tags!r})" 48 | 49 | def __repr__(self): 50 | return str(self) 51 | 52 | def __state_dict__(self, full=False): 53 | state = { 54 | "task": self.name, 55 | "desc": self.desc, 56 | "goal": self.goal, 57 | "query": self.query, # For now we just use the raw query but should add delimiters again 58 | "query_time": getattr(self, "query_time", 0), 59 | "reference": self.reference, 60 | "reference_time": getattr(self, "reference_time", 0), 61 | "topic": self.topic, 62 | "subtopic": self.subtopic, 63 | "context_time": self.context.stats.get("fetch_time", 0.0), 64 | } 65 | if full: 66 | state.update(asdict(self.context)) 67 | 68 | return state 69 | 70 | def generate(self, system: str, prompt: str, llm: Pipeline, clean=True) -> str: 71 | """Uses the llm to generate a response to a prompt""" 72 | 73 | cleaner = ( 74 | CleanerPipeline(cleaning_pipeline=self.cleaning_pipeline) if clean else None 75 | ) 76 | return HuggingFaceLLM(llm, system_prompt=system).query( 77 | message=prompt, cleaner=cleaner 78 | ) 79 | 80 | def generate_reference(self, llm: Pipeline, clean=True) -> str: 81 | """Generates a reference answer to be used for scoring miner completions""" 82 | t0 = time.time() 83 | if 
not self.static_reference: 84 | bt.logging.debug("🤖 Generating reference...") 85 | 86 | self.reference = self.generate( 87 | system=self.reference_system_prompt, 88 | prompt=self.reference_prompt, 89 | llm=llm, 90 | clean=clean, 91 | ) 92 | 93 | self.reference_time = time.time() - t0 94 | return self.reference 95 | 96 | def generate_query(self, llm: Pipeline, clean=True) -> str: 97 | """Generates a query to be used for generating the challenge""" 98 | t0 = time.time() 99 | if not self.static_query: 100 | bt.logging.debug("🤖 Generating query...") 101 | self.query = self.generate( 102 | system=self.query_system_prompt, 103 | prompt=self.query_prompt, 104 | llm=llm, 105 | clean=clean, 106 | ) 107 | 108 | self.query_time = time.time() - t0 109 | return self.query 110 | 111 | def format_challenge(self, challenge) -> str: 112 | """Formats the challenge to be used for the conversation""" 113 | return challenge 114 | -------------------------------------------------------------------------------- /prompting/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import ( 2 | Context, 3 | Dataset, 4 | MockDataset, 5 | HFCodingDataset, 6 | WikiDataset, 7 | StackOverflowDataset, 8 | WikiDateDataset, 9 | MathDataset, 10 | ) 11 | from .selector import Selector 12 | -------------------------------------------------------------------------------- /prompting/tools/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .context import Context 2 | from .base import Dataset 3 | from .code import HFCodingDataset, StackOverflowDataset 4 | from .math import MathDataset 5 | from .mock import MockDataset 6 | from .wiki import WikiDataset, WikiDateDataset 7 | -------------------------------------------------------------------------------- /prompting/tools/datasets/base.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | # Copyright © 2023 Opentensor Foundation 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 11 | # the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 14 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 
18 | 19 | import time 20 | from abc import ABC, abstractmethod 21 | from typing import Dict 22 | import bittensor as bt 23 | 24 | from ..selector import Selector 25 | from .context import Context 26 | from prompting.utils.exceptions import MaxRetryError 27 | 28 | 29 | class Dataset(ABC): 30 | """Base class for datasets.""" 31 | 32 | max_tries: int = 10 33 | 34 | @abstractmethod 35 | def search(self, name): ... 36 | 37 | @abstractmethod 38 | def random(self, name): ... 39 | 40 | @abstractmethod 41 | def get(self, name): ... 42 | 43 | def next( 44 | self, method: str = "random", selector: Selector = Selector(), **kwargs 45 | ) -> Dict: 46 | tries = 1 47 | t0 = time.time() 48 | 49 | while True: 50 | 51 | # TODO: Multithread the get method so that we don't have to suffer nonexistent pages 52 | info = {} 53 | if method == "random": 54 | info = self.random(selector=selector, **kwargs) 55 | elif method == "search": 56 | info = self.search(selector=selector, **kwargs) 57 | elif method == "get": 58 | info = self.get(selector=selector, **kwargs) 59 | else: 60 | raise ValueError(f"Unknown dataset get method {method!r}") 61 | 62 | if info: 63 | break 64 | 65 | bt.logging.debug( 66 | f"Could not find any samples which meet {self.__class__.__name__} requirements after {tries} tries. Retrying... ({self.max_tries - tries} tries remaining.)" 67 | ) 68 | 69 | tries += 1 70 | if tries >= self.max_tries: 71 | raise MaxRetryError( 72 | f"Could not find any samples which meet {self.__class__.__name__} requirements after {tries} tries." 73 | ) 74 | 75 | info["stats"] = { 76 | "creator": self.__class__.__name__, 77 | "fetch_time": time.time() - t0, 78 | "num_tries": tries, 79 | "fetch_method": method, 80 | "next_kwargs": kwargs, 81 | } 82 | return Context(**info) 83 | -------------------------------------------------------------------------------- /prompting/tools/datasets/context.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class Context: 7 | 8 | # TODO: Pydantic model 9 | title: str 10 | topic: str 11 | subtopic: str 12 | content: str 13 | internal_links: List[str] 14 | external_links: List[str] 15 | source: str 16 | tags: List[str] = None 17 | extra: dict = None # additional non-essential information 18 | stats: dict = None # retrieval stats such as fetch time, number of tries, etc. 19 | -------------------------------------------------------------------------------- /prompting/tools/datasets/math.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | # Copyright © 2023 Opentensor Foundation 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 11 | # the Software. 
12 | 13 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 14 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | 19 | import time 20 | import random 21 | import itertools 22 | import mathgenerator 23 | import bittensor as bt 24 | from sympy.parsing.latex import parse_latex 25 | from typing import Dict, Union, List, Tuple 26 | 27 | 28 | from .base import Dataset 29 | from ..selector import Selector 30 | 31 | 32 | class MathDataset(Dataset): 33 | topics_list = mathgenerator.getGenList() 34 | 35 | def __init__(self, seed=None): 36 | 37 | self.seed = seed 38 | self.rng = random.Random(seed) 39 | 40 | def get( 41 | self, 42 | name: str, 43 | selector: Selector = None, 44 | include: List = None, 45 | exclude: List = None, 46 | **kwargs, 47 | ) -> Dict: 48 | """Get a math problem. 49 | 50 | Args: 51 | name (str): Name of math problem to generate. 52 | selector (Selector, optional): Selector instance to choose a specific problem. Defaults to None. 53 | include (List, optional): _description_. Defaults to None. 54 | exclude (List, optional): _description_. Defaults to None. 55 | 56 | Returns: 57 | Dict: _description_ 58 | """ 59 | bt.logging.debug(f"Getting math problem {name!r}") 60 | info = mathgenerator.generate_context(name, **kwargs) 61 | if info["reward_type"] != "float": 62 | return None 63 | 64 | math_words = [ 65 | "math", 66 | "mathematics", 67 | "mathematical", 68 | "math problem", 69 | "math technique", 70 | ] 71 | external_links = [] 72 | # construct external links from randomly shuffled trigrams containing 2 words from the problem and 1 random math word 73 | # binary_to_decimal -> ['binary to', 'to decimal'] 74 | for bigram in itertools.combinations(info["forward_words"], 2): 75 | words = list(bigram) + [random.choice(math_words)] 76 | # shuffle the words e.g. 
['binary', 'decimal', 'math problem'] -> 'decimal binary math problem' 77 | external_links.append(" ".join(random.sample(words, len(words)))) 78 | 79 | return { 80 | "title": info["topic"], # title of math problem 81 | "topic": info["topic"], # title of problem topic 82 | "subtopic": info["subtopic"], # title of problem subtopic 83 | "content": info["problem"], # problem statement 84 | "internal_links": [info["topic"], info["subtopic"]], # internal links 85 | "external_links": external_links, 86 | "tags": info["forward_words"], 87 | "source": "Mathgenerator", 88 | "extra": {"reward_type": info["reward_type"], "solution": info["solution"]}, 89 | } 90 | 91 | def search( 92 | self, name, selector: Selector, include: List = None, exclude: List = None 93 | ) -> Dict: 94 | raise NotImplementedError( 95 | f"Search is not implemented for {self.__class__.__name__}" 96 | ) 97 | 98 | def random(self, selector: Selector, **kwargs): 99 | """Create a random math problem.""" 100 | return self.get(name=None, selector=selector, **kwargs) 101 | -------------------------------------------------------------------------------- /prompting/tools/datasets/mock.py: -------------------------------------------------------------------------------- 1 | from .base import Dataset 2 | 3 | # from ..selector import Selector 4 | 5 | 6 | class MockDataset(Dataset): 7 | 8 | def get(self, name, exclude=None, selector=None): 9 | return { 10 | "title": name, 11 | "topic": "Physics", 12 | "subtopic": "Quantum_mechanics", 13 | "content": f"{name} is a fraud. All of physics is a lie, the universe is a hologram, buy gold, bye!", 14 | "internal_links": [ 15 | "Quantum_mechanics", 16 | "General_relativity", 17 | "Special_relativity", 18 | "String_theory", 19 | ], 20 | "external_links": ["Einstein", "Bohr", "Feynman", "Hawking"], 21 | "tags": ["fraud", "hologram", "gold"], 22 | "source": "Mockpedia", 23 | "extra": {"solution": "religion"}, 24 | } 25 | 26 | def search(self, name, exclude=None, selector=None): 27 | return self.get(name) 28 | 29 | def random(self, name="Physics", exclude=None, selector=None): 30 | return self.get(name) 31 | -------------------------------------------------------------------------------- /prompting/tools/selector.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | class Selector: 5 | def __init__(self, seed=None): 6 | self.seed = seed 7 | self.rng = random.Random(seed) 8 | 9 | def __call__(self, items, weights=None): 10 | return self.rng.choices(items, weights=weights)[0] 11 | 12 | 13 | class PageRankSelector(Selector): 14 | """Preferentially chooses the items at the top of the list, under the assumption that they are more important.""" 15 | 16 | def __init__(self, seed=None, alpha=0.85): 17 | super().__init__(seed) 18 | self.alpha = alpha 19 | 20 | def __call__(self, items): 21 | weights = [self.alpha**i for i in range(len(items))] 22 | return self.rng.choices(items, weights=weights)[0] 23 | 24 | 25 | class SimilaritySelector(Selector): 26 | """Chooses the item most similar to the query.""" 27 | 28 | def __init__(self, seed=None, similarity_fn=None): 29 | super().__init__(seed) 30 | self.similarity_fn = similarity_fn 31 | 32 | def __call__(self, query, items): 33 | return max(items, key=lambda item: self.similarity_fn(query, item)) 34 | 35 | 36 | class TopSelector(Selector): 37 | """Chooses the top item.""" 38 | 39 | def __init__(self, seed=None): 40 | super().__init__(seed) 41 | 42 | def __call__(self, items): 43 | return items[0] 44 | 45 | 
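# Illustrative note on PageRankSelector above: with the default alpha of 0.85, the
# sampling weights for a four-item list decay as [1.0, 0.85, 0.7225, 0.614125], so
# earlier items are proportionally more likely to be drawn. For example:
#
#   ranked = PageRankSelector(seed=42)
#   likely_top_item = ranked(["first", "second", "third", "fourth"])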
46 | if __name__ == "__main__": 47 | 48 | selector = Selector(seed=42) 49 | items = range(10) 50 | item = selector(items) 51 | 52 | assert item in items, "Selector should return one of the items" 53 | -------------------------------------------------------------------------------- /prompting/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/prompting/utils/__init__.py -------------------------------------------------------------------------------- /prompting/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | class MaxRetryError(Exception): 2 | """Exception raised when the maximum number of retries is exceeded.""" 3 | 4 | def __init__(self, message="Maximum number of retries exceeded"): 5 | self.message = message 6 | super().__init__(self.message) 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bittensor==9.0.2 2 | datasets==2.18.0 3 | langchain_ollama==0.1.3 4 | loguru==0.7.0 5 | numpy==2.0.2 6 | pandas==2.2.3 7 | pydantic==2.10.3 8 | pylatexenc==2.10 9 | Requests==2.31.0 10 | rich==13.7.1 11 | scikit_learn==1.5.2 12 | setuptools==70.0.0 13 | torch==2.4.1 14 | tqdm==4.66.2 15 | transformers==4.36.0 16 | nltk==3.8.1 17 | wandb==0.17.2 18 | hf_transfer 19 | gdown 20 | zstandard==0.22.0 21 | pyspellchecker==0.8.1 22 | symspellpy==6.7.7 23 | typo==0.1.7 24 | scalecodec 25 | fasttext-numpy2==0.10.4 26 | accelerate==1.0.1 27 | bittensor-cli==9.0.2 28 | 29 | bs4 30 | pre-commit==3.3.2 31 | sentencepiece 32 | tenacity 33 | wikipedia 34 | wikipedia_sections -------------------------------------------------------------------------------- /scripts/check_compatibility.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | echo "Please provide a Python version as an argument." 5 | exit 1 6 | fi 7 | 8 | python_version="$1" 9 | all_passed=true 10 | 11 | GREEN='\033[0;32m' 12 | YELLOW='\033[0;33m' 13 | RED='\033[0;31m' 14 | NC='\033[0m' # No Color 15 | 16 | check_compatibility() { 17 | all_supported=0 18 | 19 | while read -r requirement; do 20 | # Skip lines starting with git+ 21 | if [[ "$requirement" == git+* ]]; then 22 | continue 23 | fi 24 | 25 | package_name=$(echo "$requirement" | awk -F'[!=<>]' '{print $1}' | awk -F'[' '{print $1}') # Strip off brackets 26 | echo -n "Checking $package_name... " 27 | 28 | url="https://pypi.org/pypi/$package_name/json" 29 | response=$(curl -s $url) 30 | status_code=$(curl -s -o /dev/null -w "%{http_code}" $url) 31 | 32 | if [ "$status_code" != "200" ]; then 33 | echo -e "${RED}Information not available for $package_name. 
Failure.${NC}" 34 | all_supported=1 35 | continue 36 | fi 37 | 38 | classifiers=$(echo "$response" | jq -r '.info.classifiers[]') 39 | requires_python=$(echo "$response" | jq -r '.info.requires_python') 40 | 41 | base_version="Programming Language :: Python :: ${python_version%%.*}" 42 | specific_version="Programming Language :: Python :: $python_version" 43 | 44 | if echo "$classifiers" | grep -q "$specific_version" || echo "$classifiers" | grep -q "$base_version"; then 45 | echo -e "${GREEN}Supported${NC}" 46 | elif [ "$requires_python" != "null" ]; then 47 | if echo "$requires_python" | grep -Eq "==$python_version|>=$python_version|<=$python_version"; then 48 | echo -e "${GREEN}Supported${NC}" 49 | else 50 | echo -e "${RED}Not compatible with Python $python_version due to constraint $requires_python.${NC}" 51 | all_supported=1 52 | fi 53 | else 54 | echo -e "${YELLOW}Warning: Specific version not listed, assuming compatibility${NC}" 55 | fi 56 | done < requirements.txt 57 | 58 | return $all_supported 59 | } 60 | 61 | echo "Checking compatibility for Python $python_version..." 62 | check_compatibility 63 | if [ $? -eq 0 ]; then 64 | echo -e "${GREEN}All requirements are compatible with Python $python_version.${NC}" 65 | else 66 | echo -e "${RED}Some requirements are NOT compatible with Python $python_version.${NC}" 67 | all_passed=false 68 | fi 69 | 70 | echo "" 71 | if $all_passed; then 72 | echo -e "${GREEN}All tests passed.${NC}" 73 | else 74 | echo -e "${RED}Not all tests passed.${NC}" 75 | exit 1 76 | fi 77 | -------------------------------------------------------------------------------- /scripts/check_requirements_changes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if requirements files have changed in the last commit 4 | if git diff --name-only HEAD~1 | grep -E 'requirements.txt'; then 5 | echo "Requirements files have changed. Running compatibility checks..." 6 | echo 'export REQUIREMENTS_CHANGED="true"' >> $BASH_ENV 7 | else 8 | echo "Requirements files have not changed. Skipping compatibility checks..." 9 | echo 'export REQUIREMENTS_CHANGED="false"' >> $BASH_ENV 10 | fi 11 | -------------------------------------------------------------------------------- /scripts/install_staging.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Section 1: Build/Install 4 | # This section is for first-time setup and installations. 5 | 6 | install_dependencies() { 7 | # Function to install packages on macOS 8 | install_mac() { 9 | which brew > /dev/null 10 | if [ $? -ne 0 ]; then 11 | echo "Installing Homebrew..." 12 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 13 | fi 14 | echo "Updating Homebrew packages..." 15 | brew update 16 | echo "Installing required packages..." 17 | brew install make llvm curl libssl protobuf tmux 18 | } 19 | 20 | # Function to install packages on Ubuntu/Debian 21 | install_ubuntu() { 22 | echo "Updating system packages..." 23 | sudo apt update 24 | echo "Installing required packages..." 25 | sudo apt install --assume-yes make build-essential git clang curl libssl-dev llvm libudev-dev protobuf-compiler tmux 26 | } 27 | 28 | # Detect OS and call the appropriate function 29 | if [[ "$OSTYPE" == "darwin"* ]]; then 30 | install_mac 31 | elif [[ "$OSTYPE" == "linux-gnu"* ]]; then 32 | install_ubuntu 33 | else 34 | echo "Unsupported operating system."
35 | exit 1 36 | fi 37 | 38 | # Install rust and cargo 39 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 40 | 41 | # Update your shell's source to include Cargo's path 42 | source "$HOME/.cargo/env" 43 | } 44 | 45 | # Call install_dependencies only if it's the first time running the script 46 | if [ ! -f ".dependencies_installed" ]; then 47 | install_dependencies 48 | touch .dependencies_installed 49 | fi 50 | 51 | 52 | # Section 2: Test/Run 53 | # This section is for running and testing the setup. 54 | 55 | # Create a coldkey for the owner role 56 | wallet=${1:-owner} 57 | 58 | # Logic for setting up and running the environment 59 | setup_environment() { 60 | # Clone subtensor and enter the directory 61 | if [ ! -d "subtensor" ]; then 62 | git clone https://github.com/opentensor/subtensor.git 63 | fi 64 | cd subtensor 65 | git pull 66 | 67 | # Update to the nightly version of rust 68 | ./scripts/init.sh 69 | 70 | cd ../bittensor-subnet-template 71 | 72 | # Install the bittensor-subnet-template python package 73 | python -m pip install -e . 74 | 75 | # Create and set up wallets 76 | # This section can be skipped if wallets are already set up 77 | if [ ! -f ".wallets_setup" ]; then 78 | btcli wallet new_coldkey --wallet.name $wallet --no_password --no_prompt 79 | btcli wallet new_coldkey --wallet.name miner --no_password --no_prompt 80 | btcli wallet new_hotkey --wallet.name miner --wallet.hotkey default --no_prompt 81 | btcli wallet new_coldkey --wallet.name validator --no_password --no_prompt 82 | btcli wallet new_hotkey --wallet.name validator --wallet.hotkey default --no_prompt 83 | touch .wallets_setup 84 | fi 85 | 86 | } 87 | 88 | # Call setup_environment every time 89 | setup_environment 90 | 91 | ## Setup localnet 92 | # assumes we are in the bittensor-subnet-template/ directory 93 | # Initialize your local subtensor chain in development mode. This command will set up and run a local subtensor network. 
94 | cd ../subtensor 95 | 96 | # Start a new tmux session and create a new pane, but do not switch to it 97 | echo "FEATURES='pow-faucet runtime-benchmarks' BT_DEFAULT_TOKEN_WALLET=$(cat ~/.bittensor/wallets/$wallet/coldkeypub.txt | grep -oP '"ss58Address": "\K[^"]+') bash scripts/localnet.sh" >> setup_and_run.sh 98 | chmod +x setup_and_run.sh 99 | tmux new-session -d -s localnet -n 'localnet' 100 | tmux send-keys -t localnet 'bash ../subtensor/setup_and_run.sh' C-m 101 | 102 | # Notify the user 103 | echo ">> localnet.sh is running in a detached tmux session named 'localnet'" 104 | echo ">> You can attach to this session with: tmux attach-session -t localnet" 105 | 106 | # Register a subnet (this needs to be run each time we start a new local chain) 107 | btcli subnet create --wallet.name $wallet --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 108 | 109 | # Transfer tokens to miner and validator coldkeys 110 | export BT_MINER_TOKEN_WALLET=$(cat ~/.bittensor/wallets/miner/coldkeypub.txt | grep -oP '"ss58Address": "\K[^"]+') 111 | export BT_VALIDATOR_TOKEN_WALLET=$(cat ~/.bittensor/wallets/validator/coldkeypub.txt | grep -oP '"ss58Address": "\K[^"]+') 112 | 113 | btcli wallet transfer --subtensor.network ws://127.0.0.1:9946 --wallet.name $wallet --dest $BT_MINER_TOKEN_WALLET --amount 1000 --no_prompt 114 | btcli wallet transfer --subtensor.network ws://127.0.0.1:9946 --wallet.name $wallet --dest $BT_VALIDATOR_TOKEN_WALLET --amount 10000 --no_prompt 115 | 116 | # Register wallet hotkeys to subnet 117 | btcli subnet register --wallet.name miner --netuid 1 --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 118 | btcli subnet register --wallet.name validator --netuid 1 --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 119 | 120 | # Add stake to the validator 121 | btcli stake add --wallet.name validator --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --amount 10000 --no_prompt 122 | 123 | # Ensure both the miner and validator keys are successfully registered. 
124 | btcli subnet list --subtensor.chain_endpoint ws://127.0.0.1:9946 125 | btcli wallet overview --wallet.name validator --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 126 | btcli wallet overview --wallet.name miner --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 127 | 128 | cd ../bittensor-subnet-template 129 | 130 | 131 | # Check if inside a tmux session 132 | if [ -z "$TMUX" ]; then 133 | # Start a new tmux session and run the miner in the first pane 134 | tmux new-session -d -s bittensor -n 'miner' 'python neurons/miner.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name miner --wallet.hotkey default --logging.debug' 135 | 136 | # Split the window and run the validator in the new pane 137 | tmux split-window -h -t bittensor:miner 'python neurons/validator.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name validator --wallet.hotkey default --logging.debug' 138 | 139 | # Attach to the new tmux session 140 | tmux attach-session -t bittensor 141 | else 142 | # If already in a tmux session, create two panes in the current window 143 | tmux split-window -h 'python neurons/miner.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name miner --wallet.hotkey default --logging.debug' 144 | tmux split-window -v -t 0 'python neurons/validator.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name validator --wallet.hotkey default --logging.debug' 145 | fi 146 | -------------------------------------------------------------------------------- /scripts/start_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script runs a validator process and automatically updates it when a new version is released. 3 | Command-line arguments will be forwarded to the validator (`neurons/validator.py`), so you can pass 4 | them like this: 5 | python3 scripts/start_validator.py --wallet.name=my-wallet 6 | Auto-updates are enabled by default and will make sure that the latest version is always running 7 | by pulling the latest version from git and upgrading python packages. This is done periodically. 8 | Local changes may prevent the update, but they will be preserved. 9 | 10 | The script will use the same virtual environment as the one used to run it. If you want to run 11 | the validator within a virtual environment, run this auto-update script from that virtual environment. 12 | 13 | Pm2 is required for this script. This script will start a pm2 process using the name provided by 14 | the --pm2_name argument. 15 | """ 16 | import argparse 17 | import logging 18 | import subprocess 19 | import sys 20 | import time 21 | from datetime import timedelta 22 | from shlex import split 23 | from typing import List 24 | from pathlib import Path 25 | 26 | log = logging.getLogger(__name__) 27 | UPDATES_CHECK_TIME = timedelta(minutes=15) 28 | 29 | ROOT_DIR = Path(__file__).parent.parent 30 | 31 | def get_version() -> str: 32 | """Extract the version as the current git commit hash (shortened to 8 characters)""" 33 | result = subprocess.run( 34 | split("git rev-parse HEAD"), 35 | check=True, 36 | capture_output=True, 37 | cwd=ROOT_DIR, 38 | ) 39 | commit = result.stdout.decode().strip() 40 | assert len(commit) == 40, f"Invalid commit hash: {commit}" 41 | return commit[:8] 42 | 43 | 44 | def start_validator_process(pm2_name: str, args: List[str]) -> subprocess.Popen: 45 | """ 46 | Spawn a new python process running neurons.validator.
47 | `sys.executable` ensures that the same python interpreter is used as the one 48 | used to run this auto-updater. 49 | """ 50 | assert sys.executable, "Failed to get python executable" 51 | 52 | log.info("Starting validator process with pm2, name: %s", pm2_name) 53 | process = subprocess.Popen( 54 | ( 55 | "pm2", 56 | "start", 57 | sys.executable, 58 | "--name", 59 | pm2_name, 60 | "--", 61 | "-m", 62 | "neurons.validator", 63 | *args, 64 | ), 65 | cwd=ROOT_DIR, 66 | ) 67 | process.pm2_name = pm2_name 68 | 69 | return process 70 | 71 | 72 | def stop_validator_process(process: subprocess.Popen) -> None: 73 | """Stop the validator process""" 74 | subprocess.run( 75 | ("pm2", "delete", process.pm2_name), cwd=ROOT_DIR, check=True 76 | ) 77 | 78 | 79 | def pull_latest_version() -> None: 80 | """ 81 | Pull the latest version from git. 82 | This uses `git pull --rebase`, so if any changes were made to the local repository, 83 | this will try to apply them on top of origin's changes. This is intentional, as we 84 | don't want to overwrite any local changes. However, if there are any conflicts, 85 | this will abort the rebase and return to the original state. 86 | Conflicts are expected to happen rarely since the validator is expected 87 | to be used as-is. 88 | """ 89 | try: 90 | subprocess.run( 91 | split("git pull --rebase --autostash"), check=True, cwd=ROOT_DIR 92 | ) 93 | except subprocess.CalledProcessError as exc: 94 | log.error("Failed to pull, reverting: %s", exc) 95 | subprocess.run(split("git rebase --abort"), check=True, cwd=ROOT_DIR) 96 | 97 | 98 | def upgrade_packages() -> None: 99 | """ 100 | Upgrade python packages by running `pip install -e .`, which reinstalls the package with the dependencies pinned in `requirements.txt`. 101 | Notice: this won't work if some package in `requirements.txt` is downgraded. 102 | Ignored as this is unlikely to happen. 103 | """ 104 | 105 | log.info("Upgrading packages") 106 | try: 107 | subprocess.run( 108 | split(f"{sys.executable} -m pip install -e ."), 109 | check=True, 110 | cwd=ROOT_DIR, 111 | ) 112 | 113 | except subprocess.CalledProcessError as exc: 114 | log.error("Failed to upgrade packages, proceeding anyway. %s", exc) 115 | 116 | 117 | def main(pm2_name: str, args: List[str]) -> None: 118 | """ 119 | Run the validator process and automatically update it when a new version is released. 120 | This will check for updates every `UPDATES_CHECK_TIME` and update the validator 121 | if a new version is available. The update is performed as a simple `git pull --rebase`.
122 | """ 123 | 124 | validator = start_validator_process(pm2_name, args) 125 | current_version = latest_version = get_version() 126 | log.info("Current version: %s", current_version) 127 | 128 | try: 129 | while True: 130 | pull_latest_version() 131 | latest_version = get_version() 132 | log.info("Latest version: %s", latest_version) 133 | 134 | if latest_version != current_version: 135 | log.info( 136 | "Upgraded to latest version: %s -> %s", 137 | current_version, 138 | latest_version, 139 | ) 140 | upgrade_packages() 141 | 142 | stop_validator_process(validator) 143 | validator = start_validator_process(pm2_name, args) 144 | current_version = latest_version 145 | 146 | time.sleep(UPDATES_CHECK_TIME.total_seconds()) 147 | 148 | finally: 149 | stop_validator_process(validator) 150 | 151 | 152 | if __name__ == "__main__": 153 | logging.basicConfig( 154 | level=logging.INFO, 155 | format="%(asctime)s %(levelname)s %(message)s", 156 | handlers=[logging.StreamHandler(sys.stdout)], 157 | ) 158 | 159 | parser = argparse.ArgumentParser( 160 | description="Automatically update and restart the validator process when a new version is released.", 161 | epilog="Example usage: python start_validator.py --pm2_name 'net9vali' --wallet_name 'wallet1' --wallet_hotkey 'key123'", 162 | ) 163 | 164 | parser.add_argument( 165 | "--pm2_name", default="net9vali", help="Name of the PM2 process." 166 | ) 167 | 168 | flags, extra_args = parser.parse_known_args() 169 | 170 | main(flags.pm2_name, extra_args) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 
17 | 18 | import re 19 | import os 20 | import codecs 21 | import pathlib 22 | from os import path 23 | from io import open 24 | from setuptools import setup, find_packages 25 | from pkg_resources import parse_requirements 26 | 27 | import subprocess 28 | 29 | 30 | def run_command(command): 31 | try: 32 | subprocess.run(command, check=True, shell=True) 33 | print(f"Successfully executed: {command}") 34 | except subprocess.CalledProcessError as e: 35 | print(f"Error executing command: {command}") 36 | print(f"Error details: {e}") 37 | 38 | 39 | def read_requirements(path): 40 | with open(path, "r") as f: 41 | requirements = f.read().splitlines() 42 | processed_requirements = [] 43 | 44 | for req in requirements: 45 | # For git or other VCS links 46 | if req.startswith("git+") or "@" in req: 47 | pkg_name = re.search(r"(#egg=)([\w\-_]+)", req) 48 | if pkg_name: 49 | processed_requirements.append(pkg_name.group(2)) 50 | else: 51 | # You may decide to raise an exception here, 52 | # if you want to ensure every VCS link has an #egg= at the end 53 | continue 54 | else: 55 | processed_requirements.append(req) 56 | return processed_requirements 57 | 58 | 59 | requirements = read_requirements("requirements.txt") 60 | here = path.abspath(path.dirname(__file__)) 61 | 62 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 63 | long_description = f.read() 64 | 65 | # loading version from setup.py 66 | with codecs.open( 67 | os.path.join(here, "detection/__init__.py"), encoding="utf-8" 68 | ) as init_file: 69 | version_match = re.search( 70 | r"^__version__ = ['\"]([^'\"]*)['\"]", init_file.read(), re.M 71 | ) 72 | version_string = version_match.group(1) 73 | 74 | commands = [ 75 | "cd cc_net && make install", 76 | "cd cc_net && make install", 77 | "cd cc_net && pip uninstall cc_net", 78 | "cd cc_net && pip install -e .", 79 | "cd cc_net && make lang=en dl_lm", 80 | ] 81 | 82 | print('Setting up cc_net') 83 | for cmd in commands: 84 | run_command(cmd) 85 | 86 | setup( 87 | name="detection", 88 | version=version_string, 89 | description="Bittensor LLM Generated Content Detection", 90 | long_description=long_description, 91 | long_description_content_type="text/markdown", 92 | url="https://github.com/It-s-AI/llm-detection", 93 | author="Sergey Volnov & Nikita Dilman", 94 | packages=find_packages(), 95 | include_package_data=True, 96 | author_email="dalmannikita@gmail.com", 97 | license="MIT", 98 | python_requires=">=3.8", 99 | install_requires=requirements, 100 | classifiers=[ 101 | "Development Status :: 3 - Alpha", 102 | "Intended Audience :: Developers", 103 | "Topic :: Software Development :: Build Tools", 104 | # Pick your license as you wish 105 | "License :: OSI Approved :: MIT License", 106 | "Programming Language :: Python :: 3 :: Only", 107 | "Programming Language :: Python :: 3.8", 108 | "Programming Language :: Python :: 3.9", 109 | "Programming Language :: Python :: 3.10", 110 | "Topic :: Scientific/Engineering", 111 | "Topic :: Scientific/Engineering :: Mathematics", 112 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 113 | "Topic :: Software Development", 114 | "Topic :: Software Development :: Libraries", 115 | "Topic :: Software Development :: Libraries :: Python Modules", 116 | ], 117 | ) 118 | 119 | 120 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/tests/__init__.py -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2023 Opentensor Foundation 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | from typing import Union 19 | from bittensor import ( 20 | Balance, 21 | NeuronInfo, 22 | AxonInfo, 23 | PrometheusInfo, 24 | __ss58_format__, 25 | ) 26 | from bittensor.mock.wallet_mock import MockWallet as _MockWallet 27 | from bittensor.mock.wallet_mock import get_mock_coldkey as _get_mock_coldkey 28 | from bittensor.mock.wallet_mock import get_mock_hotkey as _get_mock_hotkey 29 | from bittensor.mock.wallet_mock import get_mock_keypair as _get_mock_keypair 30 | from bittensor.mock.wallet_mock import get_mock_wallet as _get_mock_wallet 31 | 32 | from rich.console import Console 33 | from rich.text import Text 34 | 35 | 36 | def __mock_wallet_factory__(*args, **kwargs) -> _MockWallet: 37 | """Returns a mock wallet object.""" 38 | 39 | mock_wallet = _get_mock_wallet() 40 | 41 | return mock_wallet 42 | 43 | 44 | class CLOSE_IN_VALUE: 45 | value: Union[float, int, Balance] 46 | tolerance: Union[float, int, Balance] 47 | 48 | def __init__( 49 | self, 50 | value: Union[float, int, Balance], 51 | tolerance: Union[float, int, Balance] = 0.0, 52 | ) -> None: 53 | self.value = value 54 | self.tolerance = tolerance 55 | 56 | def __eq__(self, __o: Union[float, int, Balance]) -> bool: 57 | # True if __o \in [value - tolerance, value + tolerance] 58 | # or if value \in [__o - tolerance, __o + tolerance] 59 | return ( 60 | (self.value - self.tolerance) <= __o 61 | and __o <= (self.value + self.tolerance) 62 | ) or ( 63 | (__o - self.tolerance) <= self.value 64 | and self.value <= (__o + self.tolerance) 65 | ) 66 | 67 | 68 | def get_mock_neuron(**kwargs) -> NeuronInfo: 69 | """ 70 | Returns a mock neuron with the given kwargs overriding the default values. 
71 | """ 72 | 73 | mock_neuron_d = dict( 74 | { 75 | "netuid": -1, # mock netuid 76 | "axon_info": AxonInfo( 77 | block=0, 78 | version=1, 79 | ip=0, 80 | port=0, 81 | ip_type=0, 82 | protocol=0, 83 | placeholder1=0, 84 | placeholder2=0, 85 | ), 86 | "prometheus_info": PrometheusInfo( 87 | block=0, version=1, ip=0, port=0, ip_type=0 88 | ), 89 | "validator_permit": True, 90 | "uid": 1, 91 | "hotkey": "some_hotkey", 92 | "coldkey": "some_coldkey", 93 | "active": 0, 94 | "last_update": 0, 95 | "stake": {"some_coldkey": 1e12}, 96 | "total_stake": 1e12, 97 | "rank": 0.0, 98 | "trust": 0.0, 99 | "consensus": 0.0, 100 | "validator_trust": 0.0, 101 | "incentive": 0.0, 102 | "dividends": 0.0, 103 | "emission": 0.0, 104 | "bonds": [], 105 | "weights": [], 106 | "stake_dict": {}, 107 | "pruning_score": 0.0, 108 | "is_null": False, 109 | } 110 | ) 111 | 112 | mock_neuron_d.update(kwargs) # update with kwargs 113 | 114 | if kwargs.get("stake") is None and kwargs.get("coldkey") is not None: 115 | mock_neuron_d["stake"] = {kwargs.get("coldkey"): 1e12} 116 | 117 | if kwargs.get("total_stake") is None: 118 | mock_neuron_d["total_stake"] = sum(mock_neuron_d["stake"].values()) 119 | 120 | mock_neuron = NeuronInfo._neuron_dict_to_namespace(mock_neuron_d) 121 | 122 | return mock_neuron 123 | 124 | 125 | def get_mock_neuron_by_uid(uid: int, **kwargs) -> NeuronInfo: 126 | return get_mock_neuron( 127 | uid=uid, 128 | hotkey=_get_mock_hotkey(uid), 129 | coldkey=_get_mock_coldkey(uid), 130 | **kwargs 131 | ) 132 | 133 | 134 | class MockStatus: 135 | def __enter__(self): 136 | return self 137 | 138 | def __exit__(self, exc_type, exc_value, traceback): 139 | pass 140 | 141 | def start(self): 142 | pass 143 | 144 | def stop(self): 145 | pass 146 | 147 | def update(self, *args, **kwargs): 148 | MockConsole().print(*args, **kwargs) 149 | 150 | 151 | class MockConsole: 152 | """ 153 | Mocks the console object for status and print. 154 | Captures the last print output as a string. 155 | """ 156 | 157 | captured_print = None 158 | 159 | def status(self, *args, **kwargs): 160 | return MockStatus() 161 | 162 | def print(self, *args, **kwargs): 163 | console = Console( 164 | width=1000, no_color=True, markup=False 165 | ) # set width to 1000 to avoid truncation 166 | console.begin_capture() 167 | console.print(*args, **kwargs) 168 | self.captured_print = console.end_capture() 169 | 170 | def clear(self, *args, **kwargs): 171 | pass 172 | 173 | @staticmethod 174 | def remove_rich_syntax(text: str) -> str: 175 | """ 176 | Removes rich syntax from the given text. 177 | Removes markup and ansi syntax. 
178 | """ 179 | output_no_syntax = Text.from_ansi(Text.from_markup(text).plain).plain 180 | 181 | return output_no_syntax 182 | -------------------------------------------------------------------------------- /tests/test_template_validator.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI# Copyright © 2023 Opentensor Foundation 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import sys 19 | import torch 20 | import unittest 21 | import bittensor as bt 22 | 23 | from neurons.validator import Neuron as Validator 24 | 25 | from detection.protocol import Dummy 26 | from detection.utils.uids import get_random_uids 27 | from detection.validator.reward import get_rewards 28 | from detection.base.validator import BaseValidatorNeuron 29 | 30 | 31 | class TemplateValidatorNeuronTestCase(unittest.TestCase): 32 | """ 33 | This class contains unit tests for the RewardEvent classes. 34 | 35 | The tests cover different scenarios where completions may or may not be successful and the reward events are checked that they don't contain missing values. 36 | The `reward` attribute of all RewardEvents is expected to be a float, and the `is_filter_model` attribute is expected to be a boolean. 37 | """ 38 | 39 | def setUp(self): 40 | sys.argv = sys.argv[0] + ["--config", "tests/configs/validator.json"] 41 | 42 | config = BaseValidatorNeuron.config() 43 | config.wallet._mock = True 44 | config.metagraph._mock = True 45 | config.subtensor._mock = True 46 | self.neuron = Validator(config) 47 | self.miner_uids = get_random_uids(self, k=10) 48 | 49 | def test_run_single_step(self): 50 | # TODO: Test a single step 51 | pass 52 | 53 | def test_sync_error_if_not_registered(self): 54 | # TODO: Test that the validator throws an error if it is not registered on metagraph 55 | pass 56 | 57 | def test_forward(self): 58 | # TODO: Test that the forward function returns the correct value 59 | pass 60 | 61 | def test_dummy_responses(self): 62 | # TODO: Test that the dummy responses are correctly constructed 63 | 64 | responses = self.neuron.dendrite.query( 65 | # Send the query to miners in the network. 66 | axons=[ 67 | self.neuron.metagraph.axons[uid] for uid in self.miner_uids 68 | ], 69 | # Construct a dummy query. 70 | synapse=Dummy(dummy_input=self.neuron.step), 71 | # All responses have the deserialize function called on them before returning. 
72 | deserialize=True, 73 | ) 74 | 75 | for i, response in enumerate(responses): 76 | self.assertEqual(response, self.neuron.step * 2) 77 | 78 | def test_reward(self): 79 | # TODO: Test that the reward function returns the correct value 80 | responses = self.neuron.dendrite.query( 81 | # Send the query to miners in the network. 82 | axons=[self.neuron.metagraph.axons[uid] for uid in self.miner_uids], 83 | # Construct a dummy query. 84 | synapse=Dummy(dummy_input=self.neuron.step), 85 | # All responses have the deserialize function called on them before returning. 86 | deserialize=True, 87 | ) 88 | 89 | rewards = get_rewards(self.neuron, responses) 90 | expected_rewards = torch.FloatTensor([1.0] * len(responses)) 91 | self.assertTrue(torch.equal(rewards, expected_rewards)) 92 | 93 | def test_reward_with_nan(self): 94 | # TODO: Test that NaN rewards are correctly sanitized 95 | # TODO: Test that a bt.logging.warning is thrown when a NaN reward is sanitized 96 | responses = self.neuron.dendrite.query( 97 | # Send the query to miners in the network. 98 | axons=[self.neuron.metagraph.axons[uid] for uid in self.miner_uids], 99 | # Construct a dummy query. 100 | synapse=Dummy(dummy_input=self.neuron.step), 101 | # All responses have the deserialize function called on them before returning. 102 | deserialize=True, 103 | ) 104 | 105 | rewards = get_rewards(self.neuron, responses) 106 | expected_rewards = rewards.clone() 107 | # Add NaN values to rewards 108 | rewards[0] = float("nan") 109 | 110 | with self.assertLogs(bt.logging, level="WARNING") as cm: 111 | self.neuron.update_scores(rewards, self.miner_uids) 112 | --------------------------------------------------------------------------------