├── .circleci └── config.yml ├── .dependencies_installed ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── cc_net ├── .gitignore ├── Makefile ├── README.md ├── cc_net │ ├── __init__.py │ ├── __main__.py │ ├── data │ │ └── cutoff.csv │ ├── dedup.py │ ├── execution.py │ ├── flat_hash_set.py │ ├── get_wiki_cirrus.py │ ├── jsonql.py │ ├── mine.py │ ├── minify.py │ ├── perplexity.py │ ├── process_wet_file.py │ ├── regroup.py │ ├── split_by_lang.py │ ├── stream_cc.py │ ├── text_normalizer.py │ ├── tokenizer.py │ └── tools │ │ ├── __init__.py │ │ ├── dl_cc_100.py │ │ ├── expand_corpus.py │ │ └── make_dmoz_corpus.py ├── collinfo.json ├── pyproject.toml └── setup.py ├── contrib ├── CODE_REVIEW_DOCS.md ├── CONTRIBUTING.md ├── DEVELOPMENT_WORKFLOW.md └── STYLE.md ├── core ├── detection ├── __init__.py ├── attacks │ ├── __init__.py │ ├── data_augmentation.py │ ├── delete.py │ ├── resources │ │ └── .gitkeep │ ├── spelling.py │ ├── synonym.py │ └── zero_width_space.py ├── base │ ├── __init__.py │ ├── miner.py │ ├── neuron.py │ └── validator.py ├── protocol.py ├── utils │ ├── __init__.py │ ├── config.py │ ├── misc.py │ ├── uids.py │ └── weight_version.py └── validator │ ├── __init__.py │ ├── cc_dataset.py │ ├── data_generator.py │ ├── forward.py │ ├── generate_version.py │ ├── models.py │ ├── my_datasets.py │ ├── reward.py │ ├── segmentation_processer.py │ ├── text_completion.py │ └── text_postprocessing.py ├── docs ├── FAQ.md ├── faq_1.png ├── incentive.md ├── logo.png ├── meet_its_ai.png ├── miner_solution.md ├── mining.md ├── raid_leaderboard.png ├── validating.md ├── vision_and_roadmap.md └── what_are_subnets.md ├── min_compute.yml ├── models └── ppl_model.pk ├── neurons ├── __init__.py ├── miner.py ├── miners │ ├── __init__.py │ ├── deberta_classifier.py │ └── ppl_model.py └── validator.py ├── prompting ├── __init__.py ├── agent.py ├── cleaners │ ├── __init__.py │ ├── all_cleaners.py │ └── cleaner.py ├── conversation.py ├── llm.py ├── mock.py ├── persona.py ├── tasks │ ├── __init__.py │ ├── date_qa.py │ ├── debugging.py │ ├── generic_instruction.py │ ├── math.py │ ├── qa.py │ ├── summarization.py │ └── task.py ├── tools │ ├── __init__.py │ ├── datasets │ │ ├── __init__.py │ │ ├── base.py │ │ ├── code.py │ │ ├── context.py │ │ ├── math.py │ │ ├── mock.py │ │ └── wiki.py │ └── selector.py └── utils │ ├── __init__.py │ └── exceptions.py ├── requirements.txt ├── run.sh ├── scripts ├── check_compatibility.sh ├── check_requirements_changes.sh ├── install_staging.sh └── start_validator.py ├── setup.py └── tests ├── __init__.py ├── helpers.py └── test_template_validator.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@2.1.1 5 | python-lib: dialogue/python-lib@0.1.55 6 | # coveralls: coveralls/coveralls@1.0.6 7 | 8 | jobs: 9 | black: 10 | resource_class: small 11 | parameters: 12 | python-version: 13 | type: string 14 | docker: 15 | - image: cimg/python:<< parameters.python-version >> 16 | 17 | steps: 18 | - checkout 19 | 20 | - restore_cache: 21 | name: Restore cached black venv 22 | keys: 23 | - v1-pypi-py-black-<< parameters.python-version >> 24 | 25 | - run: 26 | name: Update & Activate black venv 27 | command: | 28 | python -m venv env/ 29 | . 
env/bin/activate 30 | python -m pip install --upgrade pip 31 | pip install black 32 | 33 | - save_cache: 34 | name: Save cached black venv 35 | paths: 36 | - "env/" 37 | key: v1-pypi-py-black-<< parameters.python-version >> 38 | 39 | - run: 40 | name: Black format check 41 | command: | 42 | . env/bin/activate 43 | black --line-length 79 --exclude '(env|venv|.eggs)' --check . 44 | 45 | pylint: 46 | resource_class: small 47 | parameters: 48 | python-version: 49 | type: string 50 | docker: 51 | - image: cimg/python:<< parameters.python-version >> 52 | 53 | steps: 54 | - checkout 55 | 56 | - run: 57 | name: Install Pylint 58 | command: | 59 | python -m venv env/ 60 | . env/bin/activate 61 | pip install pylint 62 | 63 | - run: 64 | name: Pylint check 65 | command: | 66 | . env/bin/activate 67 | pylint --fail-on=W,E,F --exit-zero ./ 68 | 69 | check_compatibility: 70 | parameters: 71 | python_version: 72 | type: string 73 | docker: 74 | - image: cimg/python:3.10 75 | steps: 76 | - checkout 77 | - run: 78 | name: Check if requirements files have changed 79 | command: ./scripts/check_requirements_changes.sh 80 | - run: 81 | name: Install dependencies and Check compatibility 82 | command: | 83 | if [ "$REQUIREMENTS_CHANGED" == "true" ]; then 84 | sudo apt-get update 85 | sudo apt-get install -y jq curl 86 | ./scripts/check_compatibility.sh << parameters.python_version >> 87 | else 88 | echo "Skipping compatibility checks..." 89 | fi 90 | 91 | build: 92 | resource_class: medium 93 | parallelism: 2 94 | parameters: 95 | python-version: 96 | type: string 97 | docker: 98 | - image: cimg/python:<< parameters.python-version >> 99 | 100 | steps: 101 | - checkout 102 | 103 | - restore_cache: 104 | name: Restore cached venv 105 | keys: 106 | - v1-pypi-py<< parameters.python-version >>-{{ checksum "requirements.txt" }} 107 | - v1-pypi-py<< parameters.python-version >> 108 | 109 | - run: 110 | name: Update & Activate venv 111 | command: | 112 | python -m venv env/ 113 | . env/bin/activate 114 | python -m pip install --upgrade pip 115 | 116 | - save_cache: 117 | name: Save cached venv 118 | paths: 119 | - "env/" 120 | key: v1-pypi-py<< parameters.python-version >>-{{ checksum "requirements.txt" }} 121 | 122 | - run: 123 | name: Install Bittensor Subnet Template 124 | command: | 125 | . env/bin/activate 126 | pip install -e . 
127 | 128 | - store_test_results: 129 | path: test-results 130 | - store_artifacts: 131 | path: test-results 132 | 133 | coveralls: 134 | docker: 135 | - image: cimg/python:3.10 136 | steps: 137 | - run: 138 | name: Combine Coverage 139 | command: | 140 | pip3 install --upgrade coveralls 141 | coveralls --finish --rcfile .coveragerc || echo "Failed to upload coverage" 142 | 143 | workflows: 144 | compatibility_checks: 145 | jobs: 146 | - check_compatibility: 147 | python_version: "3.8" 148 | name: check-compatibility-3.8 149 | - check_compatibility: 150 | python_version: "3.9" 151 | name: check-compatibility-3.9 152 | - check_compatibility: 153 | python_version: "3.10" 154 | name: check-compatibility-3.10 155 | - check_compatibility: 156 | python_version: "3.11" 157 | name: check-compatibility-3.11 158 | 159 | pr-requirements: 160 | jobs: 161 | - black: 162 | python-version: "3.8.12" 163 | - pylint: 164 | python-version: "3.8.12" 165 | - build: 166 | matrix: 167 | parameters: 168 | python-version: ["3.9.13", "3.10.6", "3.11.4"] 169 | -------------------------------------------------------------------------------- /.dependencies_installed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/.dependencies_installed -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | app.config.js 2 | 3 | commands.txt 4 | commands_testnet.txt 5 | check_vpermit.py 6 | setup_runpod.sh 7 | 8 | wandb/ 9 | wandb/* 10 | 11 | models/ 12 | models/* 13 | 14 | nohup.out 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | share/python-wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .nox/ 58 | .coverage 59 | .coverage.* 60 | .cache 61 | nosetests.xml 62 | coverage.xml 63 | *.cover 64 | *.py,cover 65 | .hypothesis/ 66 | .pytest_cache/ 67 | cover/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | .pybuilder/ 91 | target/ 92 | 93 | # Jupyter Notebook 94 | .ipynb_checkpoints 95 | 96 | # IPython 97 | profile_default/ 98 | ipython_config.py 99 | 100 | # pyenv 101 | # For a library or package, you might want to ignore these files since the code is 102 | # intended to run in multiple environments; otherwise, check them in: 103 | # .python-version 104 | 105 | # pipenv 106 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
107 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 108 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 109 | # install all needed dependencies. 110 | #Pipfile.lock 111 | 112 | # poetry 113 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 114 | # This is especially recommended for binary packages to ensure reproducibility, and is more 115 | # commonly ignored for libraries. 116 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 117 | #poetry.lock 118 | 119 | # pdm 120 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 121 | #pdm.lock 122 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 123 | # in version control. 124 | # https://pdm.fming.dev/#use-with-ide 125 | .pdm.toml 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | # PyCharm 171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 173 | # and can be added to the global gitignore or merged into this file. For a more nuclear 174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 175 | #.idea/ 176 | 177 | testing/ -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.14 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Opentensor 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cc_net/.gitignore: -------------------------------------------------------------------------------- 1 | # Dataset 2 | /data 3 | /test_data/ 4 | /test_data2/ 5 | /output/ 6 | 7 | # Binary files 8 | /bin/ 9 | 10 | # Third party code 11 | /third_party/ 12 | 13 | # Generic to python 14 | __pycache__/ 15 | *.pyc 16 | .mypy_cache/ 17 | 18 | /scratch/ 19 | /notebooks/ 20 | 21 | /build/ 22 | /cc_net.egg-info/ 23 | /config/ 24 | /dist/ 25 | /pip-wheel-metadata/ 26 | 27 | /.DS_Store 28 | -------------------------------------------------------------------------------- /cc_net/cc_net/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | -------------------------------------------------------------------------------- /cc_net/cc_net/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | 8 | import func_argparse 9 | 10 | import cc_net.mine 11 | 12 | 13 | def main(): 14 | func_argparse.parse_and_call(cc_net.mine.get_main_parser()) 15 | 16 | 17 | if __name__ == "__main__": 18 | main() 19 | -------------------------------------------------------------------------------- /cc_net/cc_net/get_wiki_cirrus.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | """ 8 | Creates mono-lingual corpus from Wikipedia. 9 | """ 10 | 11 | import functools 12 | import re 13 | import subprocess 14 | import urllib.request 15 | from pathlib import Path 16 | from typing import Dict 17 | 18 | import func_argparse 19 | from bs4 import BeautifulSoup # type: ignore 20 | 21 | from cc_net import jsonql, text_normalizer 22 | 23 | CIRRUS_URL = "https://dumps.wikimedia.org/other/cirrussearch" 24 | CIRRUS_DUMP_RE = re.compile(r"^(.*)wiki-\d+-cirrussearch-content\.json\.gz") 25 | 26 | 27 | def tmp(file: Path) -> Path: 28 | return file.parent / ("tmp." + file.name) 29 | 30 | 31 | def opening(file: Path, output: Path = None, n_docs: int = 1_000_000): 32 | """Will dump the tokenized opening text of the given Wikipedia. 33 | 34 | Args: 35 | - file: File containing the Wikipedia dump. 36 | - output: Output file. 
37 | - n_docs: How many docs to parse 38 | - tokenize: whether to tokenize the text 39 | - lang: Language code used to chose the tokenizer 40 | """ 41 | assert file.exists() 42 | return jsonql.run_pipes( 43 | functools.partial(extract_opening_text, n_docs=n_docs), 44 | file=file, 45 | output=tmp(output) if output else None, 46 | ) 47 | if output: 48 | tmp(output).replace(output) 49 | 50 | 51 | def extract_opening_text(source, n_docs: int = 10_000): 52 | i = 0 53 | for doc in jsonql.read_jsons(source): 54 | if not doc: 55 | continue 56 | 57 | text = doc.get("opening_text") 58 | if not text: 59 | continue 60 | 61 | yield text_normalizer.normalize(text) 62 | i += 1 63 | if i >= n_docs: 64 | break 65 | 66 | 67 | def dl(lang: str, output_dir: Path, date: str = None): 68 | """Download the cirrus extract for the given lang. 69 | 70 | See https://dumps.wikimedia.org/other/cirrussearch for the full list of files. 71 | 72 | Args: 73 | - lang: The Wikipedia code for the language. 74 | - output_dir: Output directory. File will be `{lang}.json.gz` 75 | - date: Date of a specific Cirrus dump. 76 | """ 77 | 78 | urls = get_cirrus_urls(date) 79 | assert ( 80 | lang in urls 81 | ), f"--lang {lang} not found. Available languages are: {urls.keys()}" 82 | 83 | assert output_dir, "--output_dir folder needed." 84 | output_dir.mkdir(exist_ok=True) 85 | output = output_dir / (lang + ".json.gz") 86 | print(f"Downloading {lang} wiki from {urls[lang]} to {output}") 87 | wget(urls[lang], output) 88 | 89 | 90 | def get_cirrus_urls(date: str = None) -> Dict[str, str]: 91 | if date is None: 92 | cirrus_page = BeautifulSoup( 93 | urllib.request.urlopen(CIRRUS_URL), features="html.parser" 94 | ) 95 | dumps = [a.get("href").strip("/") for a in cirrus_page.findAll("a")] 96 | dumps.remove("..") 97 | dumps.remove("current") 98 | # We take the oldest dump since the most recent might be incomplete. 99 | # The page only link to the N latest dumps so the dump won't be too old. 100 | date = min(dumps) 101 | 102 | cirrus_url = "/".join((CIRRUS_URL, date)) 103 | print("Will use the Wikipedia dump from:", date, cirrus_url) 104 | cirrus_page = BeautifulSoup( 105 | urllib.request.urlopen(cirrus_url), features="html.parser" 106 | ) 107 | urls = {} 108 | for link in cirrus_page.findAll("a"): 109 | match = CIRRUS_DUMP_RE.match(link.get("href")) 110 | if not match: 111 | continue 112 | 113 | urls[match.group(1)] = "/".join([cirrus_url, link.get("href")]) 114 | assert urls, f"No valid download urls found at {cirrus_url}" 115 | return urls 116 | 117 | 118 | def wget(url: str, output: Path): 119 | subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True) 120 | tmp(output).replace(output) 121 | assert ( 122 | output.stat().st_size > 10_000 123 | ), f"File {output} downloaded from {url} looks too small" 124 | 125 | 126 | if __name__ == "__main__": 127 | func_argparse.main(dl, opening) 128 | -------------------------------------------------------------------------------- /cc_net/cc_net/regroup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | 7 | import logging 8 | import subprocess 9 | from pathlib import Path 10 | from typing import List 11 | 12 | import func_argparse 13 | import numpy as np 14 | 15 | from cc_net import jsonql 16 | 17 | 18 | def get_index(file: Path) -> Path: 19 | return file.parent / (file.name + ".index") 20 | 21 | 22 | def _get_tmp(output: Path) -> Path: 23 | return output.parent / (output.stem + ".tmp" + output.suffix) 24 | 25 | 26 | def reshard( 27 | inputs: List[Path], 28 | output: Path, 29 | tmp: Path = None, 30 | free_original: bool = False, 31 | rm_original: bool = False, 32 | ) -> Path: 33 | """Read the given files and concatenate them to the output file. 34 | 35 | Can remove original files on completion, or just write dummy content into them to free disk. 36 | """ 37 | if tmp is None: 38 | tmp = _get_tmp(output) 39 | logging.info(f"Resharding {inputs} to {tmp}, will move later to {output}") 40 | jsonql.run_pipes(file=inputs, output=tmp) 41 | tmp.replace(output) 42 | tmp_index = get_index(tmp) 43 | if tmp_index.exists(): 44 | tmp_index.replace(get_index(output)) 45 | 46 | if not (free_original or rm_original): 47 | return output 48 | 49 | for _input in inputs: 50 | if rm_original: 51 | _input.unlink() 52 | elif free_original: 53 | # Overwrite the previous file. 54 | # This frees up disk space and allows doit to properly track the success. 55 | _input.write_text(f"Resharded into {output}") 56 | if get_index(_input).is_file(): 57 | get_index(_input).unlink() 58 | 59 | return output 60 | 61 | 62 | def fast_reshard( 63 | inputs: List[Path], 64 | output: Path, 65 | tmp: Path = None, 66 | free_original: bool = False, 67 | rm_original: bool = False, 68 | ) -> Path: 69 | """Same as reshard but don't re-compress the output. 70 | 71 | This will lead to a bigger output file, especially if the shards are very small. 72 | """ 73 | if tmp is None: 74 | tmp = _get_tmp(output) 75 | with open(tmp, "wb") as o: 76 | subprocess.run(["cat"] + [str(f) for f in inputs], stdout=o) 77 | 78 | tmp.replace(output) 79 | indexes_files = [get_index(i) for i in inputs] 80 | existing_indexes = sum(i.exists() for i in indexes_files) 81 | assert ( 82 | existing_indexes == len(indexes_files) or existing_indexes == 0 83 | ), "some indexes don't exist." 84 | if existing_indexes > 0: 85 | indexes = [np.load(idx) for idx in indexes_files] 86 | for i in range(len(indexes) - 1): 87 | indexes[i + 1] += indexes[i][-1] 88 | with open(str(output) + ".index", "wb") as o: 89 | np.save(o, np.concatenate(indexes)) 90 | 91 | if not (free_original or rm_original): 92 | return output 93 | 94 | for _input in inputs: 95 | if rm_original: 96 | _input.unlink() 97 | elif free_original: 98 | # Overwrite the previous file. 99 | # This frees up disk space and allows doit to properly track the success. 
100 | _input.write_text(f"Resharded into {output}") 101 | if get_index(_input).is_file(): 102 | get_index(_input).unlink() 103 | 104 | return output 105 | 106 | 107 | def determine_groups( 108 | inputs: List[Path], target_size: int = 4 * 1024 ** 3 109 | ) -> List[List[Path]]: 110 | if len(inputs) == 0: 111 | return [] 112 | 113 | sample = inputs[:10] 114 | typical_size = sum(s.stat().st_size for s in sample) / len(sample) 115 | group_size = min(target_size // typical_size, len(inputs)) 116 | group_size = max(group_size, 1) 117 | 118 | return jsonql.grouper(inputs, group_size) 119 | 120 | 121 | if __name__ == "__main__": 122 | func_argparse.single_main(reshard) 123 | -------------------------------------------------------------------------------- /cc_net/cc_net/split_by_lang.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | 7 | import argparse 8 | import collections 9 | from pathlib import Path 10 | from typing import Dict, Optional 11 | 12 | import fasttext # type: ignore 13 | 14 | from cc_net import jsonql 15 | 16 | 17 | def get_args(): 18 | parser = argparse.ArgumentParser( 19 | description="Read a list of json files and split them ", 20 | parents=[jsonql.io_parser()], 21 | ) 22 | parser.add_argument("--pattern", type=str) 23 | parser.add_argument("--field", type=str, default="raw_content") 24 | parser.add_argument("--threshold", type=float, default=0) 25 | parser.add_argument("--model", type=str, required=True) 26 | parser.add_argument("--out_field", type=str, default="language") 27 | parser.add_argument("--top", type=int, default=1) 28 | return vars(parser.parse_args()) 29 | 30 | 31 | def predict(model, text: str, k: int = 1): 32 | labels, scores = model.predict(text, k=k) 33 | labels = [l.replace("__label__", "") for l in labels] 34 | return labels, scores 35 | 36 | 37 | def avg_predict(model, text): 38 | # Overall gives the same results than predict(model, text.replace("\n", "")) 39 | text = text.split("\n") 40 | text_len = sum(len(line) for line in text) 41 | if text_len == 0: 42 | return None, 0 43 | scores = [predict(model, line) for line in text] 44 | scores_by_label: Dict[str, float] = collections.defaultdict(float) 45 | for (label, score), line in zip(scores, text): 46 | scores_by_label[label] += score * len(line) 47 | 48 | label, score = max(scores_by_label.items(), key=lambda kv: kv[1]) 49 | return label, score / text_len 50 | 51 | 52 | class Classifier(jsonql.Transformer): 53 | def __init__( 54 | self, 55 | model: Path, 56 | field: str, 57 | out_field: str, 58 | threshold: float = 0, 59 | top: int = 1, 60 | language: str = None, 61 | rounding: int = 2, 62 | ): 63 | super().__init__() 64 | self.model = model 65 | assert model.exists(), f"Model {model} doesn't exist." 
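        # Note: only lightweight configuration is stored in this constructor; the
        # fastText model itself is loaded lazily in _prepare() (see fasttext_model below).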
66 | self.field = field 67 | self.out_field = out_field 68 | self.threshold = threshold 69 | self.top = top 70 | self.language = language 71 | self.rounding = rounding 72 | # Fasttext model is a C object and can't be pickled 73 | self.fasttext_model: fasttext._FastText = None 74 | self.n_doc, self.n_accepted, self.n_ignored, self.n_disagreement = 0, 0, 0, 0 75 | self.cnt: Dict[str, int] = {} 76 | 77 | def _prepare(self): 78 | self.log(f"Loading {self.model}") 79 | self.fasttext_model = fasttext.load_model(str(self.model)) 80 | 81 | def predict(self, text): 82 | return predict(self.fasttext_model, text.replace("\n", ""), k=self.top) 83 | 84 | def do(self, doc: dict) -> Optional[dict]: 85 | text = doc.get(self.field, None) 86 | if not text: 87 | return None 88 | 89 | if self.language and doc.get("language") != self.language: 90 | self.n_ignored += 1 91 | return doc 92 | 93 | self.n_doc += 1 94 | labels, scores = self.predict(text) 95 | scores.round(self.rounding, out=scores) 96 | for l in labels: 97 | self.cnt[l] = self.cnt.get(l, 0) + 1 98 | 99 | if self.top == 1: 100 | existing_label = doc.get(self.out_field, None) 101 | if existing_label and labels[0] != existing_label: 102 | self.n_disagreement += 1 103 | 104 | if all(s < self.threshold for s in scores): 105 | return None 106 | 107 | self.n_accepted += 1 108 | if self.top == 1: 109 | doc[self.out_field] = labels[0] 110 | doc[self.out_field + "_score"] = scores[0] 111 | else: 112 | doc[self.out_field] = {l: s for l, s in zip(labels, scores)} 113 | return doc 114 | 115 | def summary(self): 116 | n_doc, n_accepted, n_disagreement, cnt, out_field = ( 117 | self.n_doc, 118 | self.n_accepted, 119 | self.n_disagreement, 120 | self.cnt, 121 | self.out_field, 122 | ) 123 | summ = super().summary() 124 | if self.threshold > 0: 125 | ratio = n_accepted / n_doc if n_doc else 0 126 | summ.append(f"Kept {n_accepted} docs over {n_doc} ({ratio :.1%})") 127 | summ.append(f"Found {len(cnt)} {out_field} labels: {cnt}") 128 | 129 | disagreement = n_disagreement / n_doc if n_doc else 0 130 | if disagreement: 131 | summ.append(f"{out_field} disagreement is at {disagreement:.1%}.") 132 | return summ 133 | 134 | def __repr__(self): 135 | return f"Classifier({self.model})" 136 | 137 | 138 | def classify_and_split(file, output, pattern, **kwargs): 139 | classifier = Classifier(**kwargs) 140 | splitter = jsonql.split(pattern) 141 | jsonql.run_pipes(classifier, splitter, file=file, output=output) 142 | 143 | 144 | if __name__ == "__main__": 145 | args = get_args() 146 | pattern = args.get("pattern") 147 | if pattern: 148 | classify_and_split(**args) 149 | else: 150 | args.pop("pattern") 151 | jsonql.run_pipe(Classifier, args) 152 | -------------------------------------------------------------------------------- /cc_net/cc_net/stream_cc.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | from typing import Iterable, Optional, List, Dict, Any 4 | 5 | from cc_net import jsonql, process_wet_file, split_by_lang, perplexity, minify 6 | 7 | FILE_DIR = Path(__file__).parent 8 | CUTOFF_CSV = FILE_DIR / "data" / "cutoff.csv" 9 | 10 | class StreamMinifier(minify.Minifier): 11 | def __init__(self, remove: Optional[List[str]] = None, keep: Optional[List[str]] = None): 12 | super().__init__() 13 | self.remove = remove or [] 14 | self.keep = keep or [] 15 | 16 | def do(self, doc: Dict[str, Any]) -> Dict[str, Any]: 17 | # Remove fields that are not needed 18 | for f in self.remove: 19 | 
doc.pop(f, None) 20 | 21 | # Keep only the specified fields 22 | if self.keep: 23 | doc = {k: v for k, v in doc.items() if k in self.keep} 24 | 25 | return doc 26 | 27 | def stream_cc_segment( 28 | segment_url: str, 29 | output_dir: Path, 30 | lang_model: Path, 31 | lm_dir: Path, 32 | lang_whitelist: Optional[List[str]] = None, 33 | lang_threshold: float = 0.5, 34 | min_len: int = 300, 35 | ): 36 | # Set up the pipeline steps 37 | steps = [] 38 | 39 | # Language identification 40 | steps.append(split_by_lang.Classifier( 41 | model=lang_model, 42 | field="raw_content", 43 | out_field="language", 44 | top=1, 45 | threshold=lang_threshold, 46 | )) 47 | 48 | # Language filtering 49 | if lang_whitelist: 50 | steps.append(jsonql.where( 51 | [lambda doc: doc.get("language") in set(lang_whitelist)] 52 | )) 53 | 54 | # SentencePiece tokenization 55 | steps.append(perplexity.MultiSentencePiece( 56 | {l: lm_dir / f"{l}.sp.model" for l in (lang_whitelist or ["en", "fr", "de"])}, 57 | field="raw_content", 58 | output_field="tokenized", 59 | normalize=True, 60 | )) 61 | 62 | # Language model scoring 63 | steps.append(perplexity.DocLM( 64 | {l: lm_dir / f"{l}.arpa.bin" for l in (lang_whitelist or ["en", "fr", "de"])}, 65 | field="tokenized", 66 | output_field="perplexity", 67 | normalize=False, 68 | )) 69 | 70 | # Perplexity bucketing 71 | steps.append(perplexity.PerplexityBucket(CUTOFF_CSV)) 72 | 73 | # Minification (remove unnecessary fields) 74 | steps.append(StreamMinifier(remove=["tokenized"], keep=["url", "raw_content", "language", "perplexity", "bucket"])) 75 | 76 | # Set up the CC segment reader 77 | cc_reader = process_wet_file.CCSegmentsReader( 78 | [segment_url], 79 | min_len=min_len, 80 | ) 81 | 82 | # Set up the output 83 | output_pattern = str(output_dir / "{language}_{bucket}.json.gz") 84 | steps.append(jsonql.split(pattern=output_pattern, mkdir=True)) 85 | 86 | # Run the pipeline 87 | jsonql.run_pipes( 88 | *steps, 89 | inputs=cc_reader, 90 | processes=1, # Increase this if you want to use multiple processes 91 | chunksize=100, 92 | ) 93 | 94 | def main(): 95 | parser = argparse.ArgumentParser(description="Stream and process a CC segment") 96 | parser.add_argument("segment_url", type=str, help="URL of the CC segment to process") 97 | parser.add_argument("output_dir", type=Path, help="Directory to save processed files") 98 | parser.add_argument("--lang_model", type=Path, default=Path("bin/lid.bin"), help="Path to language identification model") 99 | parser.add_argument("--lm_dir", type=Path, required=True, help="Directory containing language models") 100 | parser.add_argument("--lang_whitelist", type=str, nargs="+", help="List of languages to process") 101 | parser.add_argument("--lang_threshold", type=float, default=0.5, help="Language identification threshold") 102 | parser.add_argument("--min_len", type=int, default=300, help="Minimum document length") 103 | 104 | args = parser.parse_args() 105 | 106 | stream_cc_segment( 107 | args.segment_url, 108 | args.output_dir, 109 | args.lang_model, 110 | args.lm_dir, 111 | args.lang_whitelist, 112 | args.lang_threshold, 113 | args.min_len, 114 | ) 115 | 116 | if __name__ == "__main__": 117 | main() -------------------------------------------------------------------------------- /cc_net/cc_net/text_normalizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | import re 8 | import unicodedata 9 | 10 | UNICODE_PUNCT = { 11 | ",": ",", 12 | "。": ".", 13 | "、": ",", 14 | "„": '"', 15 | "”": '"', 16 | "“": '"', 17 | "«": '"', 18 | "»": '"', 19 | "1": '"', 20 | "」": '"', 21 | "「": '"', 22 | "《": '"', 23 | "》": '"', 24 | "´": "'", 25 | "∶": ":", 26 | ":": ":", 27 | "?": "?", 28 | "!": "!", 29 | "(": "(", 30 | ")": ")", 31 | ";": ";", 32 | "–": "-", 33 | "—": " - ", 34 | ".": ". ", 35 | "~": "~", 36 | "’": "'", 37 | "…": "...", 38 | "━": "-", 39 | "〈": "<", 40 | "〉": ">", 41 | "【": "[", 42 | "】": "]", 43 | "%": "%", 44 | "►": "-", 45 | } 46 | 47 | UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]") 48 | 49 | 50 | def replace_unicode_punct(text: str) -> str: 51 | return "".join((UNICODE_PUNCT.get(c, c) for c in text)) 52 | 53 | 54 | def remove_unicode_punct(text: str) -> str: 55 | """More aggressive version of replace_unicode_punct but also faster.""" 56 | return UNICODE_PUNCT_RE.sub("", text) 57 | 58 | 59 | def strip_accents(line: str) -> str: 60 | """Strips accents from a piece of text.""" 61 | nfd = unicodedata.normalize("NFD", line) 62 | output = [c for c in nfd if unicodedata.category(c) != "Mn"] 63 | if len(output) == line: 64 | return line 65 | return "".join(output) 66 | 67 | 68 | # Build a regex matching all control characters. 69 | NON_PRINTING_CHARS_RE = re.compile( 70 | f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]" 71 | ) 72 | DIGIT_RE = re.compile(r"\d") 73 | PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile( 74 | (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", "") 75 | ) 76 | 77 | 78 | def remove_non_printing_char(text: str) -> str: 79 | return NON_PRINTING_CHARS_RE.sub("", text) 80 | 81 | 82 | def normalize_spacing_for_tok(text: str, language: str = "en") -> str: 83 | res = ( 84 | text.replace("\r", "") 85 | # remove extra spaces 86 | .replace("(", " (") 87 | .replace(")", ") ") 88 | .replace(" +", " ") 89 | ) 90 | res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res) 91 | res = res.replace("( ", "(").replace(" )", ")") 92 | res = re.sub(r"(\d) \%", r"\1\%", res) 93 | res = res.replace(" :", ":").replace(" ;", ";") 94 | res = res.replace("`", "'").replace("''", ' " ') 95 | 96 | res = ( 97 | res.replace("„", '"') 98 | .replace("“", '"') 99 | .replace("”", '"') 100 | .replace("–", "-") 101 | .replace("—", " - ") 102 | .replace(" +", " ") 103 | .replace("´", "'") 104 | .replace("([a-z])‘([a-z])", r"\1'\2/") 105 | .replace("([a-z])’([a-z])", r"\1'\2/") 106 | .replace("‘", '"') 107 | .replace("‚", '"') 108 | .replace("’", '"') 109 | .replace("''", '"') 110 | .replace("´´", '"') 111 | .replace("…", "...") 112 | # French quotes 113 | .replace(" « ", ' "') 114 | .replace("« ", '"') 115 | .replace("«", '"') 116 | .replace(" » ", '" ') 117 | .replace(" »", '"') 118 | .replace("»", '"') 119 | # handle pseudo-spaces 120 | .replace(" %", "%") 121 | .replace("nº ", "nº ") 122 | .replace(" :", ":") 123 | .replace(" ºC", " ºC") 124 | .replace(" cm", " cm") 125 | .replace(" ?", "?") 126 | .replace(" !", "!") 127 | .replace(" ;", ";") 128 | .replace(", ", ", ") 129 | .replace(" +", " ") 130 | .replace(".", ". 
") 131 | ) 132 | # English "quotation," followed by comma, style 133 | if language == "en": 134 | res = re.sub(r"\"([,\.]+)", r"\1\"", res) 135 | # Czech is confused 136 | elif language == "cs" or language == "cz": 137 | pass 138 | # German/Spanish/French "quotation", followed by comma, style 139 | else: 140 | res = res.replace(',"', '",') 141 | res = re.sub( 142 | r"(\.+)\"(\s*[^<])", r"\"\1\2", res 143 | ) # don't fix period at end of sentence 144 | 145 | if ( 146 | language == "de" 147 | or language == "es" 148 | or language == "cz" 149 | or language == "cs" 150 | or language == "fr" 151 | ): 152 | res = re.sub(r"(\d) (\d)", r"\1,\2", res) 153 | else: 154 | res = re.sub(r"(\d) (\d)", r"\1.\2", res) 155 | return res 156 | 157 | 158 | def normalize(line: str, accent=True, case=True, numbers=True, punct=1) -> str: 159 | line = line.strip() 160 | if not line: 161 | return line 162 | if case: 163 | line = line.lower() 164 | if accent: 165 | line = strip_accents(line) 166 | if numbers: 167 | line = DIGIT_RE.sub("0", line) 168 | if punct == 1: 169 | line = replace_unicode_punct(line) 170 | elif punct == 2: 171 | line = remove_unicode_punct(line) 172 | line = remove_non_printing_char(line) 173 | return line 174 | 175 | 176 | def slow_normalize_for_dedup(line: str) -> str: 177 | return normalize(line, accent=False, case=True, numbers=True, punct=2) 178 | 179 | 180 | def normalize_for_dedup(line: str) -> str: 181 | line = line.strip() 182 | if not line: 183 | return line 184 | # case 185 | line = line.lower() 186 | # numbers 187 | line = DIGIT_RE.sub("0", line) 188 | line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line) 189 | return line 190 | -------------------------------------------------------------------------------- /cc_net/cc_net/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 
5 | # 6 | 7 | import time 8 | from typing import Dict, Optional 9 | 10 | import sacremoses # type: ignore 11 | 12 | from cc_net import jsonql, text_normalizer 13 | 14 | 15 | class RobustTokenizer(jsonql.Transformer): 16 | """Moses tokenizer with the expected preprocessing.""" 17 | 18 | LANG_WITHOUT_ACCENT = {"en", "my"} 19 | 20 | def __init__(self, lang: str): 21 | super().__init__() 22 | self.lang = lang 23 | self.moses = sacremoses.MosesTokenizer(lang) 24 | self.rm_accent = lang in self.LANG_WITHOUT_ACCENT 25 | self.ready = True 26 | 27 | def do(self, text: str): 28 | text = text_normalizer.normalize( 29 | text, accent=self.rm_accent, case=False, numbers=False, punct=True 30 | ) 31 | text = text_normalizer.normalize_spacing_for_tok(text, language=self.lang) 32 | return self.moses.tokenize(text, return_str=True, escape=False) 33 | 34 | 35 | class DocTokenizer(jsonql.Transformer): 36 | """Tokenize the text found in `output_field and store the result in `output_field`.""" 37 | 38 | def __init__( 39 | self, 40 | field: str, 41 | output_field: str = "tokenized", 42 | language_field: str = "language", 43 | ): 44 | super().__init__() 45 | self.field = field 46 | self.output_field = output_field 47 | self.language_field = language_field 48 | self.n_docs = 0 49 | self.tokenizers: Dict[str, RobustTokenizer] = {} 50 | 51 | def get_tokenizer(self, lang: str) -> Optional[RobustTokenizer]: 52 | cache = self.tokenizers 53 | if lang in cache: 54 | return cache[lang] 55 | if lang in ("th", "zh", "ja"): 56 | # TODO find a tokenizer for those languages 57 | return None 58 | 59 | cache[lang] = RobustTokenizer(lang) 60 | return cache[lang] 61 | 62 | def do(self, document): 63 | lang = document[self.language_field] 64 | tok = self.get_tokenizer(lang) 65 | if not tok: 66 | return document 67 | 68 | self.n_docs += 1 69 | lines = document[self.field].split("\n") 70 | tokenized = "\n".join(tok(l) for l in lines) 71 | document[self.output_field] = tokenized 72 | return document 73 | 74 | def summary(self): 75 | delay = (time.time() - self.start_time) / 3600 76 | speed = self.n_docs / delay 77 | return [ 78 | f"Tokenized {self.n_docs:_} documents in {delay:.2}h ({speed:.1} doc/s)." 79 | ] 80 | -------------------------------------------------------------------------------- /cc_net/cc_net/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/cc_net/cc_net/tools/__init__.py -------------------------------------------------------------------------------- /cc_net/cc_net/tools/dl_cc_100.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | import contextlib 8 | import functools 9 | import gzip 10 | import logging 11 | import multiprocessing 12 | from collections import defaultdict 13 | from pathlib import Path 14 | from typing import Callable, Dict, Iterator, List, NamedTuple, Optional, Tuple 15 | 16 | import cc_net 17 | from cc_net import jsonql 18 | from cc_net.process_wet_file import CCSegmentsReader 19 | 20 | # Set this to a directory to use as cache for intermediary files. 21 | # This helps for debugging. 
22 | WET_CACHE = None 23 | # WET_CACHE = Path("wet_cache") 24 | 25 | S3_BUCKET = "https://dl.fbaipublicfiles.com/cc100" 26 | VERSION = "1.0.0" 27 | 28 | CC_100_SNAPSHOTS = [ 29 | "2018-05", 30 | "2018-09", 31 | "2018-13", 32 | "2018-17", 33 | "2018-22", 34 | "2018-26", 35 | "2018-30", 36 | "2018-34", 37 | "2018-39", 38 | "2018-43", 39 | "2018-47", 40 | "2018-51", 41 | ] 42 | 43 | BIG_LANGUAGES = { 44 | "es_XX", 45 | "fr_XX", 46 | "de_DE", 47 | "ja_XX", 48 | "ru_RU", 49 | "zh_CN", 50 | "en_XX", 51 | "it_IT", 52 | "ar_AR", 53 | "nl_XX", 54 | "pl_PL", 55 | "pt_XX", 56 | "tr_TR", 57 | "zh_TW", 58 | } 59 | 60 | 61 | class Paragraph(NamedTuple): 62 | lang: str 63 | text: str 64 | lm_score: float 65 | 66 | 67 | def _dl_shard(snapshot: str, shard: int) -> Iterator[Paragraph]: 68 | """ 69 | Download metadata from a shards. 70 | 71 | Sample metadata: 72 | 73 | { 74 | "cc_segment": "crawl-data/CC-MAIN-2018-51/segments/1544376823009.19/wet/CC-MAIN-20181209185547-20181209211547-00000.warc.wet.gz", 75 | "digest": "sha1:222LWNHN5FM26XGS7WJSMI6IISTVWBKJ", 76 | "url": "http://personals.gearplay.com/ads/DRJONES.htm", 77 | "line_ids": [10], 78 | "languages": ["en_XX"], 79 | "lm_scores": [-2.658], 80 | } 81 | """ 82 | snapshot = snapshot.replace("-", "_") 83 | name = f"snap_{snapshot}_batch_{shard}.json.gz" 84 | url = "/".join([S3_BUCKET, VERSION, name]) 85 | shard_metadata: Dict[str, Dict[str, dict]] = defaultdict(dict) 86 | try: 87 | cache_file: Optional[Path] = None 88 | if WET_CACHE is not None: 89 | cache_file = WET_CACHE / name 90 | metadata_file = jsonql.open_remote_file(url, cache_file) 91 | except: 92 | logging.warning(f"Couldn't open {url}") 93 | return 94 | 95 | for meta in jsonql.read_jsons(metadata_file): 96 | shard_metadata[meta["cc_segment"]][meta["digest"]] = meta 97 | 98 | found_pars, missed_pars = 0, 0 99 | for seg, segment_metadata in shard_metadata.items(): 100 | for doc in CCSegmentsReader([seg], cache_dir=WET_CACHE): 101 | if doc["digest"] not in segment_metadata: 102 | continue 103 | 104 | meta = segment_metadata[doc["digest"]] 105 | full_pars = [doc["title"]] + doc["raw_content"].split("\n") 106 | 107 | assert len(meta["line_ids"]) == len(meta["languages"]) 108 | assert len(meta["line_ids"]) == len(meta["lm_scores"]) 109 | for i, lang, score in zip( 110 | meta["line_ids"], meta["languages"], meta["lm_scores"] 111 | ): 112 | if snapshot != "2018-51" and lang in BIG_LANGUAGES: 113 | # Big languages only come from "2018-51" snapshot 114 | continue 115 | if i >= len(full_pars): 116 | # This is because CC100 was created by saving only urls. 117 | # Some urls appears in different snapshot with slightly different 118 | # versions, but we don't know which one is correct. 119 | # Here we read both versions, but some index may end up 120 | # being incorrect. 121 | # This impact ~3% documents. 122 | missed_pars += 1 123 | continue 124 | 125 | yield Paragraph(lang, full_pars[i], score) 126 | found_pars += 1 127 | if missed_pars > 0: 128 | logging.warning( 129 | f"Missed {missed_pars} ({missed_pars / found_pars:%}) paragraphes." 130 | ) 131 | 132 | 133 | def _split_by_par( 134 | paragraphes: Iterator[Paragraph], snapshot: str, shard: int, outdir: Path 135 | ) -> int: 136 | outdir.mkdir(exist_ok=True) 137 | outfiles = {} 138 | num_pars = 0 139 | try: 140 | for par in paragraphes: 141 | # MODIFY ME: filter paragraph if needed (languages, score, ...) 
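            # Example filter (hypothetical, not part of the original script) —
            # e.g. keep only English paragraphs:
            #   if par.lang != "en_XX":
            #       continue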
142 | if par.lang not in outfiles: 143 | (outdir / par.lang).mkdir(exist_ok=True) 144 | outfile = outdir / par.lang / f"snap_{snapshot}_batch_{shard}.gz" 145 | outfiles[par.lang] = gzip.open(outfile, "wt") 146 | 147 | print(par.text, file=outfiles[par.lang]) 148 | num_pars += 1 149 | finally: 150 | for o in outfiles.values(): 151 | o.close() 152 | 153 | logging.info(f"Extracted {num_pars:_d} paragraphs from shard {snapshot}_{shard}") 154 | return num_pars 155 | 156 | 157 | def dl_shard(snapshot: str, shard: int, outdir: Path) -> int: 158 | return _split_by_par(_dl_shard(snapshot, shard), snapshot, shard, outdir) 159 | 160 | 161 | @contextlib.contextmanager 162 | def unordered_map(processes: int): 163 | if processes == 0: 164 | yield map 165 | return 166 | 167 | with multiprocessing.Pool(processes) as pool: 168 | yield pool.imap_unordered 169 | 170 | 171 | def dl_snapshot(snapshot: str, outdir: Path, processes: int = 1) -> None: 172 | _dl_shard = functools.partial(dl_shard, snapshot, outdir=outdir) 173 | 174 | with unordered_map(processes) as umap: 175 | num_pars = sum(umap(_dl_shard, range(500))) 176 | 177 | logging.info(f"Extracted {num_pars:_d} paragraphs from snapshot {snapshot}.") 178 | 179 | 180 | def dl( 181 | snapshot: str = None, outdir: Path = Path("data_cc100"), processes: int = 1 182 | ) -> None: 183 | """ 184 | Download CC100 corpus. 185 | Will create one text file per language and CC snapshot. 186 | 187 | - snapshot: restrict to one snapshot. Useful for parallelization. 188 | - outdir: output directory 189 | - processes: number of processes to use 190 | """ 191 | if snapshot is None: 192 | snapshots = CC_100_SNAPSHOTS 193 | else: 194 | snapshots = snapshot.split(",") 195 | 196 | invalids = [s for s in snapshots if s not in CC_100_SNAPSHOTS] 197 | assert not invalids, f"Invalid snapshots {invalids}, chose from {CC_100_SNAPSHOTS}" 198 | 199 | for snapshot in snapshots: 200 | dl_snapshot(snapshot, outdir, processes) 201 | 202 | 203 | if __name__ == "__main__": 204 | import func_argparse 205 | 206 | func_argparse.single_main(dl) 207 | -------------------------------------------------------------------------------- /cc_net/cc_net/tools/make_dmoz_corpus.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | # 6 | 7 | """ 8 | This code is used to train a fastText classifier to label document with DMOZ categories. 9 | 10 | The data, distributed under the cc-by 3.0 license 11 | (https://web.archive.org/web/20140605215533/http://www.dmoz.org/license.html), 12 | can be downloaded from 13 | https://web.archive.org/web/20140617145301/http://rdf.dmoz.org/rdf/content.rdf.u8.gz. 
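
Example invocation (hypothetical paths; `make_corpus` is the function exposed via func_argparse):

    python -m cc_net.tools.make_dmoz_corpus --file shard.json.gz --tags_file content.rdf.u8.gz --output dmoz.train.txt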
14 | """ 15 | 16 | import urllib.request 17 | from io import StringIO 18 | from pathlib import Path 19 | from typing import Dict, Set 20 | from urllib.parse import urlparse 21 | 22 | import func_argparse 23 | from lxml import etree # type: ignore 24 | 25 | from cc_net import jsonql 26 | 27 | TaggedUrls = Dict[str, Set[str]] 28 | DMOZ_TAGS_URL = "https://web.archive.org/web/20140617145301/http://rdf.dmoz.org/rdf/content.rdf.u8.gz" 29 | 30 | 31 | def add_tags(url: str, tags: Set[str], url2tags: TaggedUrls): 32 | if url in url2tags: 33 | url2tags[url] &= tags 34 | else: 35 | url2tags[url] = tags 36 | 37 | 38 | def load_tags(filename: Path = None) -> TaggedUrls: 39 | if filename is None: 40 | with StringIO("".join(jsonql.open_remote_file(DMOZ_TAGS_URL))) as dmoz: 41 | tree = etree.parse(dmoz) 42 | else: 43 | tree = etree.parse(str(filename)) 44 | 45 | root = tree.getroot() 46 | url2tags: Dict[str, Set[str]] = {} 47 | for external_page in root.iterfind("{http://dmoz.org/rdf/}ExternalPage"): 48 | url = external_page.get("about") 49 | domain = urlparse(url).netloc 50 | for topic in external_page.iterfind("{http://dmoz.org/rdf/}topic"): 51 | # print(url, topic.text) 52 | # Tags looks like Top/Arts/Animation/Anime/Collectibles 53 | tags = set(topic.text.split("/")[1:]) 54 | add_tags(url, tags, url2tags) 55 | add_tags(domain, tags, url2tags) 56 | return url2tags 57 | 58 | 59 | def dl(output: Path) -> None: 60 | urllib.request.urlretrieve(DMOZ_TAGS_URL, output) 61 | 62 | 63 | def make_corpus(file: Path, tags_file: Path = None, output: Path = None) -> None: 64 | """ 65 | Loads a tags file and create a training dataset using the given webpages. 66 | 67 | Arguments: 68 | - file: CC shard file 69 | - tags_file: dmoz tagging file, (like the one produced by `dl`) 70 | - output: "" 71 | """ 72 | url2tags = load_tags(tags_file) 73 | with jsonql.open_write(output) as o: 74 | for document in jsonql.read_jsons(file): 75 | if not document: 76 | continue 77 | url = document["url"] 78 | domain = document["source_domain"] 79 | 80 | if url in url2tags: 81 | tags = url2tags[url] 82 | elif domain in url2tags: 83 | tags = url2tags[domain] 84 | else: 85 | continue 86 | 87 | if len(tags) == 0: 88 | continue 89 | 90 | fasttext_tags = ["__label__" + tag for tag in tags] 91 | content = document["tokenized"].replace("\n", " ").lower() 92 | if len(content) > 200: 93 | print(" ".join(fasttext_tags), content, file=o) # type: ignore 94 | 95 | 96 | if __name__ == "__main__": 97 | func_argparse.single_main(make_corpus) 98 | -------------------------------------------------------------------------------- /cc_net/pyproject.toml: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = "tests" 3 | 4 | [tool.black] 5 | line-length = 88 6 | target_version = ["py37"] 7 | 8 | [tool.isort] 9 | multi_line_output = 3 10 | include_trailing_comma = true 11 | force_grid_wrap = 0 12 | use_parentheses = true 13 | line_length = 88 14 | known_third_party = ["func_argparse"] 15 | skip = ["third_party", "data"] 16 | 17 | [mypy] 18 | python_version = 3.7 19 | check_untyped_defs = true 20 | 21 | [mypy-numpy] 22 | ignore_missing_imports = true 23 | [mypy-pytest] 24 | ignore_missing_imports = true 25 | -------------------------------------------------------------------------------- /cc_net/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | from pathlib import Path 7 | 8 | from setuptools import setup # type: ignore 9 | 10 | setup( 11 | name="cc_net", 12 | version="1.0.0", 13 | packages=["cc_net"], 14 | # metadata to display on PyPI 15 | author="Guillaume Wenzek", 16 | author_email="guw@fb.com", 17 | description="Tools to download and clean Common Crawl", 18 | keywords="common crawl dataset", 19 | url="https://github.com/facebookresearch/cc_net", 20 | license="CC-BY-NC-4.0", 21 | long_description=Path("README.md").read_text(), 22 | long_description_content_type="text/markdown", 23 | project_urls={ 24 | "Bug Tracker": "https://github.com/facebookresearch/cc_net/issues", 25 | "Source Code": "https://github.com/facebookresearch/cc_net", 26 | }, 27 | classifiers=[ 28 | "Development Status :: 4 - Beta", 29 | "Programming Language :: Python :: 3.7", 30 | ], 31 | python_requires=">=3.7", 32 | install_requires=[ 33 | "beautifulsoup4>=4.7.1", 34 | "pandas>=0.23.4", 35 | "requests>=2.22.0", 36 | "fasttext>=0.9.1", 37 | "sentencepiece>=0.1.82", 38 | "kenlm @ git+https://github.com/kpu/kenlm.git@master", 39 | "func_argparse>=1.1.1", 40 | "psutil>=5.6.3", 41 | "sacremoses", 42 | "submitit>=1.0.0", 43 | "typing_extensions", 44 | ], 45 | extras_require={ 46 | "dev": ["mypy==0.790", "pytest", "black==19.3b0", "isort==5.6.4"], 47 | # To use scripts inside cc_net/tools 48 | "tools": ["lxml", "sentence_splitter"], 49 | # Memory-efficient hashset. 50 | # This fork only compiles the kind of dict used by cc_net. 51 | # Full version is at https://github.com/atom-moyer/getpy 52 | "getpy": ["getpy @ git+https://github.com/gwenzek/getpy.git@v0.9.10-subset"], 53 | }, 54 | package_data={"cc_net": ["data/*"]}, 55 | ) 56 | -------------------------------------------------------------------------------- /contrib/CODE_REVIEW_DOCS.md: -------------------------------------------------------------------------------- 1 | # Code Review 2 | ### Conceptual Review 3 | 4 | A review can be a conceptual review, where the reviewer leaves a comment 5 | * `Concept (N)ACK`, meaning "I do (not) agree with the general goal of this pull 6 | request", 7 | * `Approach (N)ACK`, meaning `Concept ACK`, but "I do (not) agree with the 8 | approach of this change". 9 | 10 | A `NACK` needs to include a rationale why the change is not worthwhile. 11 | NACKs without accompanying reasoning may be disregarded. 12 | After conceptual agreement on the change, code review can be provided. A review 13 | begins with `ACK BRANCH_COMMIT`, where `BRANCH_COMMIT` is the top of the PR 14 | branch, followed by a description of how the reviewer did the review. The 15 | following language is used within pull request comments: 16 | 17 | - "I have tested the code", involving change-specific manual testing in 18 | addition to running the unit, functional, or fuzz tests, and in case it is 19 | not obvious how the manual testing was done, it should be described; 20 | - "I have not tested the code, but I have reviewed it and it looks 21 | OK, I agree it can be merged"; 22 | - A "nit" refers to a trivial, often non-blocking issue. 23 | 24 | ### Code Review 25 | Project maintainers reserve the right to weigh the opinions of peer reviewers 26 | using common sense judgement and may also weigh based on merit. 
Reviewers that 27 | have demonstrated a deeper commitment and understanding of the project over time 28 | or who have clear domain expertise may naturally have more weight, as one would 29 | expect in all walks of life. 30 | 31 | Where a patch set affects consensus-critical code, the bar will be much 32 | higher in terms of discussion and peer review requirements, keeping in mind that 33 | mistakes could be very costly to the wider community. This includes refactoring 34 | of consensus-critical code. 35 | 36 | Where a patch set proposes to change the Bittensor consensus, it must have been 37 | discussed extensively on the discord server and other channels, be accompanied by a widely 38 | discussed BIP and have a generally widely perceived technical consensus of being 39 | a worthwhile change based on the judgement of the maintainers. 40 | 41 | ### Finding Reviewers 42 | 43 | As most reviewers are themselves developers with their own projects, the review 44 | process can be quite lengthy, and some amount of patience is required. If you find 45 | that you've been waiting for a pull request to be given attention for several 46 | months, there may be a number of reasons for this, some of which you can do something 47 | about: 48 | 49 | - It may be because of a feature freeze due to an upcoming release. During this time, 50 | only bug fixes are taken into consideration. If your pull request is a new feature, 51 | it will not be prioritized until after the release. Wait for the release. 52 | - It may be because the changes you are suggesting do not appeal to people. Rather than 53 | nits and critique, which require effort and means they care enough to spend time on your 54 | contribution, thundering silence is a good sign of widespread (mild) dislike of a given change 55 | (because people don't assume *others* won't actually like the proposal). Don't take 56 | that personally, though! Instead, take another critical look at what you are suggesting 57 | and see if it: changes too much, is too broad, doesn't adhere to the 58 | [developer notes](DEVELOPMENT_WORKFLOW.md), is dangerous or insecure, is messily written, etc. 59 | Identify and address any of the issues you find. Then ask e.g. on IRC if someone could give 60 | their opinion on the concept itself. 61 | - It may be because your code is too complex for all but a few people, and those people 62 | may not have realized your pull request even exists. A great way to find people who 63 | are qualified and care about the code you are touching is the 64 | [Git Blame feature](https://docs.github.com/en/github/managing-files-in-a-repository/managing-files-on-github/tracking-changes-in-a-file). Simply 65 | look up who last modified the code you are changing and see if you can find 66 | them and give them a nudge. Don't be incessant about the nudging, though. 67 | - Finally, if all else fails, ask on IRC or elsewhere for someone to give your pull request 68 | a look. If you think you've been waiting for an unreasonably long time (say, 69 | more than a month) for no particular reason (a few lines changed, etc.), 70 | this is totally fine. Try to return the favor when someone else is asking 71 | for feedback on their code, and the universe balances out. 72 | - Remember that the best thing you can do while waiting is give review to others! 
-------------------------------------------------------------------------------- /core: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/core -------------------------------------------------------------------------------- /detection/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | # Define the version of the template module. 19 | __version__ = "3.11.2" 20 | __least_acceptable_version__ = "3.8.0" 21 | version_split = __version__.split(".") 22 | __spec_version__ = ( 23 | (1000 * int(version_split[0])) 24 | + (10 * int(version_split[1])) 25 | + (1 * int(version_split[2])) 26 | ) 27 | version_url = "https://raw.githubusercontent.com/it-s-ai/llm-detection/main/detection/__init__.py" 28 | 29 | # Import all submodules. 30 | from . import protocol 31 | from . import base 32 | from . 
import validator 33 | 34 | WANDB_PROJECT = "subnet32" 35 | WANDB_ENTITY = "itsai-dev" 36 | MAX_RUN_STEPS_PER_WANDB_RUN = 1 37 | -------------------------------------------------------------------------------- /detection/attacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/detection/attacks/__init__.py -------------------------------------------------------------------------------- /detection/attacks/data_augmentation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import numpy as np 3 | 4 | from detection.attacks.delete import DeleteAttack 5 | from detection.attacks.spelling import SpellingAttack 6 | from detection.attacks.synonym import SynonymAttack 7 | from detection.attacks.zero_width_space import ZeroWidthSpaceAttack 8 | 9 | nltk.download('punkt') 10 | nltk.download('stopwords') 11 | nltk.download('averaged_perceptron_tagger') 12 | 13 | 14 | class DataAugmentator: 15 | def __init__(self, device=0): 16 | self.attacks = [{'attacker': SynonymAttack(device=device), 'p': 0.05, 'pass_labels': True}, 17 | {'attacker': ZeroWidthSpaceAttack(), 'p': 0.05}, 18 | {'attacker': SpellingAttack(), 'p': 0.4}, 19 | {'attacker': DeleteAttack(), 'p': 0.1}, 20 | ] 21 | 22 | # {'attacker': ParaphraseAttack(), 'p': 0.2, 'apply_label': 1}, - needs too much GPU 23 | 24 | def __call__(self, text, labels): 25 | text = text.strip() 26 | 27 | applied_augs = [] 28 | for augmentation_step in self.attacks: 29 | if np.random.random() > augmentation_step['p']: 30 | continue 31 | 32 | if augmentation_step.get('pass_labels'): 33 | text = augmentation_step['attacker'].attack(text, labels) 34 | else: 35 | text = augmentation_step['attacker'].attack(text) 36 | applied_augs.append(type(augmentation_step['attacker']).__name__) 37 | 38 | n_auged = len(text.split()) 39 | 40 | if not sum(labels): 41 | labels_auged = [0] * n_auged 42 | else: 43 | first_zeros = 0 44 | for i in range(len(labels)): 45 | if labels[i] == 0: 46 | first_zeros += 1 47 | else: 48 | break 49 | last_zeros = 0 50 | for i in range(len(labels) - 1, -1, -1): 51 | if labels[i] == 0: 52 | last_zeros += 1 53 | else: 54 | break 55 | new_first_zeros = int(n_auged * first_zeros / len(labels)) 56 | new_last_zeros = int(n_auged * last_zeros / len(labels)) 57 | new_middle_ones = n_auged - new_first_zeros - new_last_zeros 58 | labels_auged = [0] * new_first_zeros + [1] * new_middle_ones + [0] * new_last_zeros 59 | 60 | return text, applied_augs, labels_auged 61 | -------------------------------------------------------------------------------- /detection/attacks/delete.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from nltk import pos_tag 4 | 5 | 6 | class DeleteAttack: 7 | def __init__(self, max_remove_words=5): 8 | self.max_remove_words = max_remove_words 9 | 10 | def remove_random_adjective(self, text): 11 | tokens = text.split() 12 | tagged_tokens = pos_tag(tokens) 13 | 14 | adjectives = [word for word, tag in tagged_tokens if tag in ('JJ', 'JJR', 'JJS')] 15 | 16 | if not adjectives: 17 | return ' '.join(tokens) 18 | 19 | adjective_to_remove = random.choice(adjectives) 20 | tokens.remove(adjective_to_remove) 21 | return ' '.join(tokens) 22 | 23 | def attack(self, text): 24 | n = random.randint(1, self.max_remove_words) 25 | for i in range(n): 26 | text = self.remove_random_adjective(text) 27 | 28 | 
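        # Note on the loop above: each pass removes at most one randomly chosen adjective; if no adjectives remain, remove_random_adjective returns the text unchanged, so the attack degrades gracefully on short inputs.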
return text 29 | -------------------------------------------------------------------------------- /detection/attacks/resources/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/detection/attacks/resources/.gitkeep -------------------------------------------------------------------------------- /detection/attacks/spelling.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import nltk 4 | import numpy as np 5 | 6 | nltk.download('punkt') 7 | nltk.download('averaged_perceptron_tagger') 8 | 9 | 10 | class SpellingAttack: 11 | def __init__(self, max_cycles=5): 12 | 13 | self.char_changes = [ 14 | {'name': 'typo_char_swap', 'p': 0.1}, 15 | {'name': 'typo_missing_char', 'p': 0.1}, 16 | {'name': 'typo_extra_char', 'p': 0.1}, 17 | {'name': 'typo_nearby_char', 'p': 0.1}, 18 | {'name': 'typo_similar_char', 'p': 0.1}, 19 | {'name': 'typo_skipped_space', 'p': 0.1}, 20 | {'name': 'typo_random_space', 'p': 0.1}, 21 | {'name': 'typo_repeated_char', 'p': 0.1}, 22 | {'name': 'typo_unichar', 'p': 0.1}, 23 | {'name': 'decapitalize_char', 'p': 0.1}, 24 | {'name': 'capitalize_char', 'p': 0.1}, 25 | ] 26 | 27 | self.max_cycles = max_cycles 28 | 29 | def decapitalize_char(self, text): 30 | capital_indices = [i for i, char in enumerate(text) if char.isupper()] 31 | if len(capital_indices) == 0: 32 | return text 33 | 34 | random_index = np.random.choice(capital_indices) 35 | 36 | modified_text = text[:random_index] + text[random_index].lower() + text[random_index + 1:] 37 | return modified_text 38 | 39 | def capitalize_char(self, text): 40 | lower_indices = [i for i, char in enumerate(text) if char.islower()] 41 | if len(lower_indices) == 0: 42 | return text 43 | 44 | random_index = np.random.choice(lower_indices) 45 | modified_text = text[:random_index] + text[random_index].upper() + text[random_index + 1:] 46 | return modified_text 47 | 48 | def attack(self, text): 49 | augs = [] 50 | n_repeated = random.randint(1, self.max_cycles) 51 | for i in range(n_repeated): 52 | augs += self.char_changes 53 | np.random.shuffle(augs) 54 | 55 | for augmentation_step in augs: 56 | if np.random.random() > augmentation_step['p']: 57 | continue 58 | 59 | if augmentation_step['name'] == 'decapitalize_char': 60 | text = self.decapitalize_char(text) 61 | elif augmentation_step['name'] == 'capitalize_char': 62 | text = self.capitalize_char(text) 63 | elif 'typo_' in augmentation_step['name']: 64 | error_type_name = augmentation_step['name'][5:] 65 | try: 66 | text = eval(f'typo.StrErrer(text).{error_type_name}().result') 67 | except: 68 | pass 69 | else: 70 | raise Exception("Unexpected augmentation name: {}".format(augmentation_step['name'])) 71 | 72 | return text 73 | -------------------------------------------------------------------------------- /detection/attacks/zero_width_space.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | class ZeroWidthSpaceAttack: 5 | def __init__(self, max_p=0.2): 6 | self.max_p = max_p 7 | 8 | def attack(self, text): 9 | cur_p = self.max_p * random.random() 10 | 11 | res = "" 12 | for word in text.split(): 13 | res += word 14 | if random.random() > cur_p: 15 | res += ' ' 16 | 17 | return res 18 | -------------------------------------------------------------------------------- /detection/base/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/detection/base/__init__.py -------------------------------------------------------------------------------- /detection/protocol.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 4 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 5 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 6 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 9 | # the Software. 10 | 11 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 12 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 13 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 14 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 15 | # DEALINGS IN THE SOFTWARE. 16 | 17 | import pydantic 18 | from typing import List, Optional 19 | import bittensor as bt 20 | 21 | from detection import __version__ 22 | 23 | 24 | class TextSynapse(bt.Synapse): 25 | """ 26 | A protocol representation which uses bt.Synapse as its base. 27 | This protocol helps in handling request and response communication between 28 | the miner and the validator. 29 | 30 | Attributes: 31 | - texts: List of texts that needs to be evaluated for AI generation 32 | - predictions: List of probabilities in response to texts 33 | 34 | """ 35 | 36 | texts: List[str] = pydantic.Field( 37 | ..., 38 | title="Texts", 39 | description="A list of texts to check. Immuatable.", 40 | allow_mutation=False, 41 | ) 42 | 43 | predictions: List[List[float]] = pydantic.Field( 44 | ..., 45 | title="Predictions", 46 | description="List of predicted probabilities. This attribute is mutable and can be updated.", 47 | ) 48 | 49 | version: str = "" 50 | 51 | def deserialize(self) -> float: 52 | """ 53 | Deserialize output. This method retrieves the response from 54 | the miner in the form of self.text, deserializes it and returns it 55 | as the output of the dendrite.query() call. 56 | 57 | Returns: 58 | - List[float]: The deserialized response, which in this case is the list of preidictions. 59 | """ 60 | return self 61 | -------------------------------------------------------------------------------- /detection/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from . import misc 3 | from . 
import uids 4 | -------------------------------------------------------------------------------- /detection/utils/misc.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI# Copyright © 2023 Opentensor Foundation 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import time 19 | import math 20 | import hashlib as rpccheckhealth 21 | from math import floor 22 | from typing import Callable, Any 23 | from functools import lru_cache, update_wrapper 24 | 25 | 26 | # LRU Cache with TTL 27 | def ttl_cache(maxsize: int = 128, typed: bool = False, ttl: int = -1): 28 | """ 29 | Decorator that creates a cache of the most recently used function calls with a time-to-live (TTL) feature. 30 | The cache evicts the least recently used entries if the cache exceeds the `maxsize` or if an entry has 31 | been in the cache longer than the `ttl` period. 32 | 33 | Args: 34 | maxsize (int): Maximum size of the cache. Once the cache grows to this size, subsequent entries 35 | replace the least recently used ones. Defaults to 128. 36 | typed (bool): If set to True, arguments of different types will be cached separately. For example, 37 | f(3) and f(3.0) will be treated as distinct calls with distinct results. Defaults to False. 38 | ttl (int): The time-to-live for each cache entry, measured in seconds. If set to a non-positive value, 39 | the TTL is set to a very large number, effectively making the cache entries permanent. Defaults to -1. 40 | 41 | Returns: 42 | Callable: A decorator that can be applied to functions to cache their return values. 43 | 44 | The decorator is useful for caching results of functions that are expensive to compute and are called 45 | with the same arguments frequently within short periods of time. The TTL feature helps in ensuring 46 | that the cached values are not stale. 
47 | 48 | Example: 49 | @ttl_cache(ttl=10) 50 | def get_data(param): 51 | # Expensive data retrieval operation 52 | return data 53 | """ 54 | if ttl <= 0: 55 | ttl = 65536 56 | hash_gen = _ttl_hash_gen(ttl) 57 | 58 | def wrapper(func: Callable) -> Callable: 59 | @lru_cache(maxsize, typed) 60 | def ttl_func(ttl_hash, *args, **kwargs): 61 | return func(*args, **kwargs) 62 | 63 | def wrapped(*args, **kwargs) -> Any: 64 | th = next(hash_gen) 65 | return ttl_func(th, *args, **kwargs) 66 | 67 | return update_wrapper(wrapped, func) 68 | 69 | return wrapper 70 | 71 | 72 | def _ttl_hash_gen(seconds: int): 73 | """ 74 | Internal generator function used by the `ttl_cache` decorator to generate a new hash value at regular 75 | time intervals specified by `seconds`. 76 | 77 | Args: 78 | seconds (int): The number of seconds after which a new hash value will be generated. 79 | 80 | Yields: 81 | int: A hash value that represents the current time interval. 82 | 83 | This generator is used to create time-based hash values that enable the `ttl_cache` to determine 84 | whether cached entries are still valid or if they have expired and should be recalculated. 85 | """ 86 | start_time = time.time() 87 | while True: 88 | yield floor((time.time() - start_time) / seconds) 89 | 90 | 91 | # 12 seconds updating block. 92 | @ttl_cache(maxsize=1, ttl=12) 93 | def ttl_get_block(self) -> int: 94 | """ 95 | Retrieves the current block number from the blockchain. This method is cached with a time-to-live (TTL) 96 | of 12 seconds, meaning that it will only refresh the block number from the blockchain at most every 12 seconds, 97 | reducing the number of calls to the underlying blockchain interface. 98 | 99 | Returns: 100 | int: The current block number on the blockchain. 101 | 102 | This method is useful for applications that need to access the current block number frequently and can 103 | tolerate a delay of up to 12 seconds for the latest information. By using a cache with TTL, the method 104 | efficiently reduces the workload on the blockchain interface. 105 | 106 | Example: 107 | current_block = ttl_get_block(self) 108 | 109 | Note: self here is the miner or validator instance 110 | """ 111 | return self.subtensor.get_current_block() 112 | -------------------------------------------------------------------------------- /detection/utils/uids.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | import bittensor as bt 4 | from typing import List 5 | 6 | 7 | def check_uid_availability( 8 | metagraph: "bt.metagraph.Metagraph", uid: int, vpermit_tao_limit: int 9 | ) -> bool: 10 | """Check if uid is available. The UID should be available if it is serving and has less than vpermit_tao_limit stake 11 | Args: 12 | metagraph (:obj: bt.metagraph.Metagraph): Metagraph object 13 | uid (int): uid to be checked 14 | vpermit_tao_limit (int): Validator permit tao limit 15 | Returns: 16 | bool: True if uid is available, False otherwise 17 | """ 18 | 19 | # Filter non serving axons. 20 | if not metagraph.axons[uid].is_serving: 21 | return False 22 | 23 | # Filter validator permit > 1024 stake. 24 | if metagraph.validator_permit[uid]: 25 | if metagraph.S[uid] > vpermit_tao_limit: 26 | return False 27 | 28 | # Available otherwise. 29 | return True 30 | 31 | 32 | def get_random_uids( 33 | self, k: int, exclude: List[int] = None 34 | ) -> torch.LongTensor: 35 | """Returns k available random uids from the metagraph. 36 | Args: 37 | k (int): Number of uids to return. 
38 | exclude (List[int]): List of uids to exclude from the random sampling. 39 | Returns: 40 | uids (torch.LongTensor): Randomly sampled available uids. 41 | Notes: 42 | If `k` is larger than the number of available `uids`, set `k` to the number of available `uids`. 43 | """ 44 | candidate_uids = [] 45 | avail_uids = [] 46 | 47 | for uid in range(self.metagraph.n.item()): 48 | 49 | uid_is_available = check_uid_availability( 50 | self.metagraph, uid, self.config.neuron.vpermit_tao_limit 51 | ) 52 | uid_is_not_excluded = exclude is None or uid not in exclude 53 | 54 | if uid_is_available: 55 | avail_uids.append(uid) 56 | if uid_is_not_excluded: 57 | candidate_uids.append(uid) 58 | 59 | # Check if candidate_uids contain enough for querying, if not grab all avaliable uids 60 | available_uids = candidate_uids 61 | 62 | # If k is larger than the number of available uids, set k to the number of available uids. 63 | k = min(k, len(available_uids)) 64 | uids = torch.tensor(random.sample(available_uids, k)) 65 | return uids 66 | -------------------------------------------------------------------------------- /detection/utils/weight_version.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def version_to_tuple(version): 5 | return tuple(map(int, version.split('.'))) 6 | 7 | 8 | def is_valid_version_format(version): 9 | return bool(re.match(r'^\d+\.\d+\.\d+$', version)) 10 | 11 | 12 | def is_version_in_range(version, version1, version2): 13 | if not is_valid_version_format(version): 14 | return False 15 | 16 | v = version_to_tuple(version) 17 | v1 = version_to_tuple(version1) 18 | v2 = version_to_tuple(version2) 19 | 20 | if v1 > v2: 21 | v1, v2 = v2, v1 22 | 23 | return v1 <= v <= v2 -------------------------------------------------------------------------------- /detection/validator/__init__.py: -------------------------------------------------------------------------------- 1 | from .forward import forward 2 | from .reward import reward 3 | -------------------------------------------------------------------------------- /detection/validator/generate_version.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def version_to_tuple(version): 5 | return tuple(map(int, version.split('.'))) 6 | 7 | 8 | def tuple_to_version(version_tuple): 9 | return '.'.join(map(str, version_tuple)) 10 | 11 | 12 | def generate_random_version(version1, version2): 13 | v1 = version_to_tuple(version1) 14 | v2 = version_to_tuple(version2) 15 | 16 | if v1 > v2: 17 | v1, v2 = v2, v1 18 | 19 | def random_version_near(v): 20 | return tuple( 21 | max(v[i] + random.choice([-1, 1]), 0) 22 | if random.random() > 0.5 else v[i] 23 | for i in range(len(v)) 24 | ) 25 | 26 | def is_in_range(v): 27 | return v1 <= v <= v2 28 | 29 | while True: 30 | random_near_v1 = random_version_near(v1) 31 | if not is_in_range(random_near_v1): 32 | return tuple_to_version(random_near_v1) 33 | 34 | random_near_v2 = random_version_near(v2) 35 | if not is_in_range(random_near_v2): 36 | return tuple_to_version(random_near_v2) 37 | -------------------------------------------------------------------------------- /detection/validator/models.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ValDataRow(BaseModel): 5 | text: str 6 | text_auged: str | None = None 7 | label: bool 8 | segmentation_labels: list[bool] 9 | auged_segmentation_labels: list[bool] 
10 | prompt: str | None = None 11 | data_source: str | None = None 12 | model_name: str | None = None 13 | model_params: dict | None = None 14 | topic: str | None = None 15 | 16 | augmentations: list[str] = [] 17 | 18 | -------------------------------------------------------------------------------- /detection/validator/my_datasets.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import random 4 | import time 5 | from abc import abstractmethod 6 | from pathlib import Path 7 | 8 | import bittensor as bt 9 | import numpy as np 10 | from datasets import load_dataset 11 | from collections.abc import Iterator 12 | 13 | from detection.validator.cc_dataset import CCDataset, get_2023_dumps 14 | from neurons.miners.deberta_classifier import DebertaClassifier 15 | 16 | PILE_COUNT = 80 17 | CC_COUNT = 40 18 | PILE_PROB = PILE_COUNT / (PILE_COUNT + CC_COUNT) 19 | 20 | 21 | class TextDataset(Iterator): 22 | def __init__(self, max_prompt_len, text_field): 23 | super().__init__() 24 | self.max_prompt_len = max_prompt_len 25 | self.text_field = text_field 26 | self.name = 'CommonCrawlDataset' if text_field == 'raw_content' else 'PileDataset' 27 | self.dataset = self.init_dataset() 28 | 29 | @abstractmethod 30 | def get_iter(self): 31 | ... 32 | 33 | def filter_rules_pass(self, prompt, completion): 34 | if random.random() > 0.01: 35 | return False 36 | return True 37 | 38 | def init_dataset(self): 39 | try: 40 | dataset = self.get_iter() 41 | return dataset 42 | except Exception as e: 43 | bt.logging.error("Got exception during {} dataset initializing: {}, retrying...".format(self.name, e)) 44 | time.sleep(60) 45 | return self.init_dataset() 46 | 47 | def __next__(self): 48 | while True: 49 | try: 50 | el = next(self.dataset) 51 | el[self.text_field] = el[self.text_field].replace('\x00', '') 52 | 53 | document_text = el[self.text_field][:int(self.max_prompt_len * 1.25)] 54 | context_len = int(len(document_text) * np.random.uniform(0.25, 0.75)) 55 | prompt = document_text[:context_len] 56 | completion = el[self.text_field][context_len:] 57 | 58 | if not self.filter_rules_pass(prompt, completion): 59 | continue 60 | 61 | return {'prompt': prompt, 'real_completion': completion} 62 | except Exception as e: 63 | if type(e) == StopIteration: 64 | bt.logging.info(f'{self.name} with ended: reinitializing it') 65 | else: 66 | bt.logging.error("Got exception during loading data from {}, reinitializing it: {}".format(self.name, e)) 67 | bt.logging.exception(e) 68 | 69 | self.dataset = self.init_dataset() 70 | continue 71 | 72 | 73 | class PileDataset(TextDataset): 74 | def __init__(self, max_prompt_len): 75 | super().__init__(max_prompt_len, 'text') 76 | 77 | def get_iter(self): 78 | seed = int(time.time()) 79 | dataset = iter( 80 | load_dataset("monology/pile-uncopyrighted", streaming=True)['train'].shuffle( 81 | seed=seed, buffer_size=100000 82 | ) 83 | ) 84 | return dataset 85 | 86 | 87 | class CommonCrawlDataset(TextDataset): 88 | def __init__(self, max_prompt_len): 89 | self.dumps_2023 = get_2023_dumps() 90 | logging.info(f"Found {len(self.dumps_2023)} dumps from 2023: {self.dumps_2023}") 91 | super().__init__(max_prompt_len, 'raw_content') 92 | 93 | def get_iter(self): 94 | seed = int(time.time()) 95 | random.seed(seed) 96 | logging.info('Using seed {}'.format(seed)) 97 | dataset = CCDataset( 98 | dumps=self.dumps_2023, 99 | num_segments=10, 100 | lang_model=Path("cc_net/bin/lid.bin"), 101 | lm_dir=Path("cc_net/data/lm_sp/"), 102 | 
lang_whitelist=['en'], 103 | lang_threshold=0.5, 104 | min_len=300, 105 | cache_dir=None, 106 | tmp_dir=Path("cc_net/tmp_segments"), 107 | ) 108 | return dataset 109 | 110 | def filter_rules_pass(self, prompt, completion): 111 | if random.random() > 0.1: 112 | return False 113 | return True 114 | 115 | 116 | class HumanDataset(Iterator): 117 | def __init__(self, max_prompt_len=1500): 118 | super().__init__() 119 | self.pile_dataset = PileDataset(max_prompt_len) 120 | self.common_crawl = CommonCrawlDataset(max_prompt_len) 121 | 122 | def __next__(self) -> dict: 123 | res = {} 124 | if random.random() < PILE_PROB: 125 | el = next(self.pile_dataset) 126 | res['data_source'] = 'pile' 127 | else: 128 | el = next(self.common_crawl) 129 | res['data_source'] = 'common_crawl' 130 | 131 | res['text'] = el['real_completion'] 132 | return res 133 | 134 | 135 | class PromptDataset(Iterator): 136 | def __init__(self, max_prompt_len=1500): 137 | super().__init__() 138 | self.pile_dataset = PileDataset(max_prompt_len) 139 | self.common_crawl = CommonCrawlDataset(max_prompt_len) 140 | self.max_prompt_len = max_prompt_len 141 | 142 | def __next__(self) -> dict: 143 | while True: 144 | res = {} 145 | if random.random() < PILE_PROB: 146 | el = next(self.pile_dataset) 147 | res['data_source'] = 'pile' 148 | else: 149 | el = next(self.common_crawl) 150 | res['data_source'] = 'common_crawl' 151 | 152 | if len(el['prompt']) > self.max_prompt_len: 153 | bt.logging.info("Prompt has len {}, truncating it to {} chars".format(len(el['prompt']), self.max_prompt_len)) 154 | 155 | res['prompt'] = el["prompt"][:self.max_prompt_len] 156 | if res['prompt'].strip(): 157 | return res 158 | 159 | 160 | if __name__ == '__main__': 161 | dataset = HumanDataset() 162 | print(next(dataset)) 163 | 164 | dataset = PromptDataset() 165 | for i in range(2): 166 | print(next(dataset)) 167 | -------------------------------------------------------------------------------- /detection/validator/segmentation_processer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | 5 | HUMAN_THEN_AI_PERCENT = 40 6 | AI_PERCENT = 25 7 | 8 | class SegmentationProcesser: 9 | def __init__(self, ): 10 | pass 11 | 12 | def merge_prompt_text(self, prompt, text): 13 | now = {} 14 | el = {'prompt': prompt, 'text': text} 15 | if not prompt: 16 | raise Exception("There is should be a prompt during merging") 17 | 18 | if np.random.random() < HUMAN_THEN_AI_PERCENT / (HUMAN_THEN_AI_PERCENT + AI_PERCENT): 19 | now['text'] = el['prompt'] + el['text'] 20 | now['cnt_first_human'] = len(el['prompt'].split()) 21 | else: 22 | now['cnt_first_human'] = 0 23 | now['text'] = el['text'] 24 | 25 | return now['text'], now['cnt_first_human'] 26 | 27 | def subsample_words(self, text, labels, min_cnt=35, max_cnt=350): 28 | words = text.split() 29 | if len(words) <= min_cnt: 30 | return ' '.join(words), labels 31 | 32 | cnt = random.randint(min_cnt, min(max_cnt, len(words))) 33 | 34 | has_01 = False 35 | has_10 = False 36 | 37 | for i in range(len(labels) - 1): 38 | if labels[i] == 0 and labels[i + 1] == 1: 39 | has_01 = True 40 | if labels[i] == 1 and labels[i + 1] == 0: 41 | has_10 = True 42 | 43 | if has_01 and has_10: 44 | # if random.random() < 0.5: 45 | # currently we always take ai the first and then human 46 | ind = None 47 | for i in range(len(labels) - 1): 48 | if labels[i] == 0 and labels[i + 1] == 1: 49 | ind = i + 1 50 | break 51 | return self.subsample_words(' '.join(words[ind:]), 
labels[ind:]) 52 | # else: 53 | # ind = None 54 | # for i in range(len(labels) - 1): 55 | # if labels[i] == 1 and labels[i + 1] == 0: 56 | # ind = i + 1 57 | # break 58 | # return self.subsample_words(' '.join(words[:ind]), labels[:ind]) 59 | 60 | split_index = None 61 | for i in range(len(labels) - 1): 62 | if labels[i] != labels[i + 1]: 63 | split_index = i 64 | break 65 | 66 | if split_index is not None: # for two class case 67 | ind = random.randint(max(split_index - cnt, 0), min(len(words) - cnt, split_index)) 68 | else: # for one class case 69 | ind = random.randint(0, len(words) - cnt) 70 | 71 | res = words[ind:ind + cnt] 72 | labels = labels[ind:ind + cnt] 73 | 74 | if random.random() > 0.5 and len(res): 75 | sent_ind = random.randint(0, len(res[0]) - 1) 76 | res[0] = res[0][sent_ind:] 77 | 78 | if random.random() > 0.5: 79 | sent_ind = random.randint(0, len(res[-1]) - 1) 80 | res[-1] = res[-1][:sent_ind] 81 | 82 | return ' '.join(res), labels 83 | -------------------------------------------------------------------------------- /detection/validator/text_completion.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | import time 4 | 5 | import bittensor as bt 6 | import numpy as np 7 | import requests 8 | from langchain_ollama.llms import OllamaLLM 9 | 10 | from detection.validator.text_postprocessing import TextCleaner 11 | 12 | 13 | class OllamaModel: 14 | def __init__(self, model_name, num_predict=900, base_url="http://127.0.0.1:11434", in_the_middle_generation=False): 15 | """ 16 | available models you can find on https://github.com/ollama/ollama 17 | before running model install ollama and run 'ollama pull ' 18 | """ 19 | self.model_name = model_name 20 | self.base_url = base_url 21 | self.num_predict = num_predict 22 | self.in_the_middle_generation = in_the_middle_generation 23 | 24 | bt.logging.info(f'Initializing OllamaModel {model_name}') 25 | if num_predict > 1000: 26 | raise Exception("You're trying to set num_predict to more than 1000, it can lead to context overloading and Ollama hanging") 27 | 28 | pulled_models = [el['name'] for el in self.ollama_list()['models']] if self.ollama_list() is not None else [] 29 | if model_name not in pulled_models and model_name + ':latest' not in pulled_models: 30 | bt.logging.info("Model {} cannot be found locally - downloading it...".format(model_name)) 31 | self.ollama_pull(model_name) 32 | bt.logging.info("Successfully downloaded {}".format(model_name)) 33 | else: 34 | bt.logging.info("Found model {} locally, pulling in case of updates".format(model_name)) 35 | self.ollama_pull(model_name) 36 | 37 | self.model = None 38 | self.params = {} 39 | self.init_model() 40 | 41 | self.text_cleaner = TextCleaner() 42 | 43 | def ollama_list(self): 44 | req = requests.get('{}/api/tags'.format(self.base_url)) 45 | return req.json() 46 | 47 | def ollama_pull(self, model_name): 48 | req = requests.post('{}/api/pull'.format(self.base_url), json={'model': model_name}) 49 | 50 | def init_model(self): 51 | # sapmling order in ollama: top_k, tfs_z, typical_p, top_p, min_p, temperature 52 | sampling_temperature = np.clip(np.random.normal(loc=1, scale=0.3), a_min=0, a_max=2) 53 | # Centered around 1 because that's what's hardest for downstream classification models. 
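        # Note: np.clip keeps the sampled temperature within [0, 2]; the frequency-penalty, top_k and top_p draws below further diversify generations between requests.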
54 | 55 | frequency_penalty = np.random.uniform(low=0.7, high=1.6) 56 | top_k = int(np.random.choice([-1, 20, 40, 80])) 57 | # top_k = top_k if top_k != -1 else None 58 | top_p = np.random.uniform(low=0.5, high=1) 59 | 60 | if random.random() < 0.1: 61 | # greedy strategy 62 | sampling_temperature = 0 63 | 64 | self.model = OllamaLLM(model=self.model_name, 65 | base_url=self.base_url, 66 | timeout=200, 67 | num_thread=1, 68 | num_predict=self.num_predict, 69 | temperature=sampling_temperature, 70 | repeat_penalty=frequency_penalty, 71 | top_p=top_p, 72 | top_k=top_k, 73 | ) 74 | 75 | self.params = {'top_k': top_k, 'top_p': top_p, 'temperature': sampling_temperature, 'repeat_penalty': frequency_penalty} 76 | 77 | def __call__(self, prompt: str, text_completion_mode=False) -> str | None: 78 | while True: 79 | try: 80 | if text_completion_mode: 81 | if 'text' not in self.model_name: 82 | system_message = "You're a text completion model, just complete text that user sended you" # . Return text without any supportive - we write add your result right after the user text 83 | text = self.model.invoke([{'role': 'system', 'content': system_message}, 84 | {'role': 'user', 'content': prompt}]) 85 | else: 86 | text = self.model.invoke(prompt) 87 | else: 88 | assert 'text' not in self.model_name 89 | text = self.model.invoke(prompt) 90 | 91 | return self.text_cleaner.clean_text(text) 92 | except Exception as e: 93 | bt.logging.info("Couldn't get response from Ollama, probably it's restarting now: {}".format(e)) 94 | time.sleep(1) 95 | 96 | def classic_invoke(self, messages: list[dict]) -> str | None: 97 | while True: 98 | try: 99 | return self.model.invoke(messages) 100 | except Exception as e: 101 | bt.logging.info("Couldn't get response from Ollama, probably it's restarting now: {}".format(e)) 102 | time.sleep(1) 103 | 104 | def __repr__(self) -> str: 105 | return f"{self.model_name}" 106 | 107 | 108 | if __name__ == '__main__': 109 | bt.logging.info("started") 110 | model = OllamaModel('llama2') 111 | bt.logging.info("finished") 112 | print(model.model) 113 | -------------------------------------------------------------------------------- /detection/validator/text_postprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import numpy as np 4 | 5 | 6 | class TextCleaner: 7 | def __init__(self): 8 | pass 9 | 10 | def _remove_emoji(self, text: str) -> str: 11 | # remove emojies 12 | emoji_pattern = re.compile("[" 13 | u"\U0001F600-\U0001F64F" # emoticons 14 | u"\U0001F300-\U0001F5FF" # symbols & pictographs 15 | u"\U0001F680-\U0001F6FF" # transport & map symbols 16 | u"\U0001F1E0-\U0001F1FF" # flags (iOS) 17 | "]+", flags=re.UNICODE) 18 | 19 | text = emoji_pattern.sub(r'', text) 20 | return text 21 | 22 | def _remove_subtext(self, text: str) -> str: 23 | # remove words like *smiling*, *adjusts glasses*, etc 24 | last = None 25 | mask = np.ones(len(text)) 26 | for i, c in enumerate(text): 27 | if c == '*': 28 | if last is None or (i - last) > 50: 29 | last = i 30 | else: 31 | mask[last:i + 1] = 0 32 | last = None 33 | return ''.join([c for i, c in enumerate(text) if mask[i]]) 34 | 35 | def clean_text(self, text: str) -> str: 36 | text = text.strip() 37 | text = self._remove_emoji(text) 38 | text = self._remove_subtext(text) 39 | return text -------------------------------------------------------------------------------- /docs/FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Miners 
4 | ### Is there a leaderboard where I can see the performance rankings of miners? 5 | Yes, we do have a leaderboard based on OTF scores: https://huggingface.co/spaces/Infin/ai-detection-leaderboard 6 | 7 | ### Do you have a wandb? 8 | Yes, https://wandb.ai/itsai-dev/subnet32 9 | 10 | ### I deployed my miner, but it hasn't received any requests 11 | It's okay to have no requests in the first several hours - validators need some time to see your miner. 12 | If you have already waited 2-3 hours and still have no queries, check that you've properly exposed your ports to the internet. 13 | 14 | ### The miner has been running for 9 hours, but taostats shows that emissions are still 0 15 | If you get queries from validators, then just wait for several hours (around 5-6) and incentive should show up. 16 | If you don't, check the question above about not receiving requests. 17 | 18 | ### Import issues and errors inside installed python packages 19 | If you see an error and it's happening inside another library, most probably you haven't installed the right version of it. 20 | Try to install the correct packages from requirements.txt inside a new venv and rerun your miner. 21 | We also suggest using Python 3.10. 22 | 23 | ### How can I evaluate the competitiveness of a model in this network without registration? Is there a local evaluator or something? 24 | You can collect data the same way the validator does using detection/data_generator.py and validate your model on it locally. 25 | Another way is to run the model on testnet and check its scores in wandb. 26 | 27 | ### Is the baseline still valid for the miner? 28 | The baseline model is not competitive enough to survive; it may not even pass the minimum out_of_domain_f1_score (which was selected based on the scores of currently active miners). 29 | 30 | ### Why am I getting zero scores from validators sometimes? 31 | Here are some possible ways to get a zero score: 32 | a) you answered the validator with the wrong version 33 | b) your miner's answers on a small batch of texts don't match its predictions on the same texts in a big batch 34 | c) your mean f1-score on the out-of-domain validation is less than 0.9 35 | d) there may have been an internet problem on your side or the validator's side and you didn't receive the request 36 | 37 | ### How can I improve the quality of the model? What models should I use to be in the top? 38 | It's the goal of every miner to find a way to be better and stay at the top. Even we don't know which models are the best on our subnet at the moment. 39 | 40 | ### Is the testnet validator running? 41 | Most probably yes - it should be running on SN87 on testnet. 42 | 43 | ## Validators 44 | 45 | ### The CommonCrawlDataset has rate limits and randomly stops working. Is there any alternative? 46 | ![img.png](faq_1.png) 47 | 48 | It works fine if you're not running several instances (of the data generator or validators) at the same time, because they overwrite files in "cc_net/tmp_segments" 49 | 50 | -------------------------------------------------------------------------------- /docs/faq_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/faq_1.png -------------------------------------------------------------------------------- /docs/incentive.md: -------------------------------------------------------------------------------- 1 | # Incentive mechanism 2 | 3 | For validating we use two types of data, balanced in a 1:1 proportion.
4 | 5 | ### Human-written texts 6 | To gather human-written validation data we use the Pile dataset. 7 | 8 | The Pile is an 825 GiB diverse, open-source language-modelling dataset that consists of 22 smaller, high-quality datasets combined together. It includes web-crawled data, financial, medical, law, arXiv and GitHub data, covering about 15 different topics. 9 | 10 | ### AI-generated texts 11 | For AI-generated text collection, we need to obtain prompts and then generate texts based on these prompts. While for human texts we take samples from the Pile dataset, we have to generate AI samples from the same data source, so that the only difference between them is whether they are human- or AI-written. 12 | 13 | So, as prompts we take a random sample, use part of it as the beginning of the text and ask LLMs to generate a completion for it. 14 | 15 | We use the Ollama GitHub repository to run Large Language Models and generate completions for these prompts. As LLMs we use 30+ SOTA models from the top of LLM-Arena. 16 | 17 | We also randomly select generation parameters for the LLM during validation to make the dataset more diverse. 18 | 19 | ### Data augmentation to prevent cheating 20 | To prevent miners from memorizing the Pile dataset and to make the task more robust to overfitting, we add some augmentations to both AI-generated and human-written texts. First of all, we select a random sequence of consecutive sentences from a given text. Then, in a random place (or two), we add a misspelling (about 10 different char-based augmentations) or remove a random adjective. 21 | 22 | These augmentations don't allow miners to precalculate hashes on the Pile dataset and then use them to determine whether a text is present in the human set of data or not. 23 | 24 | ## Reward counting 25 | Based on the [Detecting LLM-Generated Text in Computing Education](https://arxiv.org/pdf/2307.07411.pdf) 26 | article, we decided to divide our reward into 3 parts: 27 | 28 | #### F1 score 29 | We decided to use it instead of classic accuracy because 30 | it better represents the quality of a model, especially on binary-classification tasks. 31 | 32 | #### False Positive score 33 | FP_score = 1 - FP / len(samples). 34 | 35 | It is usually more important not to mistakenly classify human-written text as AI-generated than the other way around. 36 | It is preferable to tolerate a few more instances of student cheating or read some AI-generated emails than to wrongly penalize a real student or miss an important letter. 37 | 38 | #### AP score 39 | AP summarizes a precision-recall curve by calculating the weighted mean of precisions achieved at each threshold. 40 | This allows us to evaluate the quality of the model's ranking. 41 | 42 | 43 | The final reward is the average of these three values.
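The snippet below is a minimal sketch of this reward, for illustration only - it is not the exact code from detection/validator/reward.py. It assumes binary labels (1 = AI-generated, 0 = human-written), one predicted probability per text, and uses scikit-learn's f1_score and average_precision_score.

```python
# Illustrative sketch of the three-part reward (assumed names; not the validator's exact implementation).
import numpy as np
from sklearn.metrics import average_precision_score, f1_score


def reward_sketch(y_true, y_prob, threshold=0.5):
    """y_true: 1 = AI-generated, 0 = human-written; y_prob: predicted probability of being AI-generated."""
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    y_pred = (y_prob >= threshold).astype(int)

    # F1 score of the thresholded predictions.
    f1 = f1_score(y_true, y_pred)

    # False Positive score: 1 - FP / len(samples), punishing human texts flagged as AI.
    false_positives = int(np.sum((y_true == 0) & (y_pred == 1)))
    fp_score = 1 - false_positives / len(y_true)

    # Average precision summarizes the precision-recall curve over all thresholds.
    ap = average_precision_score(y_true, y_prob)

    return (f1 + fp_score + ap) / 3
```

Under this formulation a miner only approaches the maximum reward when it ranks texts well (AP), classifies them accurately (F1) and rarely flags human-written texts as AI-generated (FP score).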
44 | -------------------------------------------------------------------------------- /docs/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/logo.png -------------------------------------------------------------------------------- /docs/meet_its_ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/meet_its_ai.png -------------------------------------------------------------------------------- /docs/miner_solution.md: -------------------------------------------------------------------------------- 1 | 2 | ## Perplexity approach 3 | 4 | We made a solid baseline solution based on counting [perplexity of fixed-length models](https://huggingface.co/docs/transformers/perplexity). 5 | For counting PPL we use a fresh phi-2 model from microsoft, which has been released at the end of 2023. 6 | We also trained a linear model on the phi-2 outputs, to make probabilities more representative. 7 | 8 | On our local validation with baseline model got overall accuracy about 89%, you can find accuracy per data source below: 9 | 10 | | Data Source | Accuracy | 11 | |---------------------------|----------| 12 | | LLM (gemma:7b) | 0.939 | 13 | | LLM (neural-chat) | 0.856 | 14 | | LLM (zephyr:7b-beta) | 0.964 | 15 | | LLM (vicuna) | 0.981 | 16 | | LLM (mistral) | 0.963 | 17 | | Human-data | 0.841 | -------------------------------------------------------------------------------- /docs/mining.md: -------------------------------------------------------------------------------- 1 | # ⛏️ Mining 2 | 3 | ## FAQ 4 | 5 | We've collected some frequently asked questions in the Discord Channel and made a FAQ page, hope this help you to run your miners easier. We'll be updating it with fresh questions as they appear: 6 | 7 | https://piquant-door-af5.notion.site/FAQ-0de42be01aa948c08cbfe982f2112aa8?pvs=4 8 | 9 | ## System Requirements 10 | 11 | Miners will need enough processing power to inference models. The device the models are inferenced on is recommended to be a GPU (atleast NVIDIA RTX A4000) with minimum 16 GB of VRAM. 12 | 13 | 14 | ## Installation 15 | 16 | 1. Clone the repo 17 | 18 | ```bash 19 | apt update && apt upgrade -y 20 | git clone https://github.com/It-s-AI/llm-detection 21 | ``` 22 | 23 | 2. Setup your python [virtual environment](https://docs.python.org/3/library/venv.html) or [Conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands). 24 | 25 | 3. Install the requirements. From your virtual environment, run 26 | ```shell 27 | cd llm-detection 28 | python -m pip install -e . 29 | ``` 30 | 31 | 4. Download models for LLM classification 32 | ```commandline 33 | wget https://huggingface.co/sergak0/sn32/resolve/main/deberta-large-ls03-ctx1024.pth -O models/deberta-large-ls03-ctx1024.pth 34 | wget https://huggingface.co/sergak0/sn32/resolve/main/deberta-v3-large-hf-weights.zip -O models/deberta-v3-large-hf-weights.zip 35 | apt install zip unzip 36 | unzip models/deberta-v3-large-hf-weights.zip -d models/deberta-v3-large-hf-weights 37 | ``` 38 | 39 | 4. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 
40 | 41 | ```bash 42 | btcli w new_coldkey 43 | btcli w new_hotkey 44 | btcli s register --netuid 32 --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY 45 | ``` 46 | 47 | 5. (Optional) Run a Subtensor instance: 48 | Your node will run better if you are connecting to a local Bittensor chain entrypoint node rather than using Opentensor's. 49 | We recommend running a local node as follows and passing the ```--subtensor.network local``` flag to your running miners/validators. 50 | To install and run a local subtensor node follow the commands below with Docker and Docker-Compose previously installed. 51 | ```bash 52 | git clone https://github.com/opentensor/subtensor.git 53 | cd subtensor 54 | docker compose up --detach 55 | ``` 56 | 57 | ## Running the Miner 58 | 59 | 60 | 61 | > **Note:** Recently, the public RPC endpoint has been under high load, so it's strongly advised that you use your local Subtensor instance! 62 | 63 | 64 | Install PM2 and the jq package on your system. 65 | ```bash 66 | sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm install pm2 -g && pm2 update 67 | ``` 68 | 69 | To start your miner basic command is 70 | 71 | ```bash 72 | pm2 start --name net32-miner --interpreter python3 ./neurons/miner.py -- --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY --neuron.device cuda:0 --axon.port 70000 73 | ``` 74 | 75 | ## Running the Miner on TESTNET 76 | 77 | We have testnet subnet with netuid **87**. There is our validator running with uid 52 and hotkey `5Eo4PQvU4fhGLhk91UKpAaaEH59aHsVsw2jZ6ZhRT12s6JRA`. 78 | 79 | To start miner on testnet you have to run the following command 80 | 81 | ```bash 82 | pm2 start --name net32-miner --interpreter python3 ./neurons/miner.py -- --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY --neuron.device cuda:0 --axon.port 70000 --subtensor.network test --netuid 87 --blacklist.minimum_stake_requirement 0 83 | ``` 84 | 85 | > IMPORTANT: you should set `blacklist.minimum_stake_requirement` argument to 0 so our validator won't get blacklisted 86 | -------------------------------------------------------------------------------- /docs/raid_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/docs/raid_leaderboard.png -------------------------------------------------------------------------------- /docs/validating.md: -------------------------------------------------------------------------------- 1 | # 🧑‍🏫 Validating 2 | 3 | # System Requirements 4 | 5 | Validators will need enough processing power to inference multiple models. It is required to have a GPU (we commend NVIDIA A100) with minimum 80GB of VRAM. 6 | Also you need to have at least 1T of disk space. 7 | 8 | ## Installation 9 | 10 | Make sure that your server provider support systemd (RunPod doesn't support it). 11 | Otherwise ollama service won't be restarting automatically and you'll have to restart it on your own from time to time. 12 | 13 | 1. Clone the repo 14 | 15 | ```bash 16 | apt update && apt upgrade -y 17 | git clone https://github.com/It-s-AI/llm-detection 18 | ``` 19 | 20 | 2. Setup your python [virtual environment](https://docs.python.org/3/library/venv.html) or [Conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-with-commands). 21 | 22 | 3. Install the requirements. 
From your virtual environment, run 23 | ```shell 24 | cd llm-detection 25 | python3 -m pip install -e . 26 | python3 -m pip uninstall mathgenerator -y 27 | python3 -m pip install git+https://github.com/synapse-alpha/mathgenerator.git 28 | ``` 29 | 30 | 4. Make sure you've [created a Wallet](https://docs.bittensor.com/getting-started/wallets) and [registered a hotkey](https://docs.bittensor.com/subnets/register-and-participate). 31 | 32 | ```bash 33 | btcli w new_coldkey 34 | btcli w new_hotkey 35 | btcli s register --netuid 32 --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY 36 | ``` 37 | 38 | ## Install driver 39 | 40 | Install PM2 and the jq package on your system. 41 | ```bash 42 | sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm install pm2 -g && pm2 update 43 | ``` 44 | 45 | Make `run.sh` file executable. 46 | ```bash 47 | chmod +x run.sh 48 | ``` 49 | 50 | So Ollama models can detect GPUs on your system 51 | ```bash 52 | apt update 53 | apt install lshw -y 54 | ``` 55 | 56 | ## Download models 57 | 58 | Install Ollama 59 | ```bash 60 | curl -fsSL https://ollama.com/install.sh | sh 61 | ``` 62 | 63 | Run ollama service in background (make sure that you don't have any running instances of ollama before running this command) 64 | ``` 65 | pm2 start --name ollama "ollama serve" 66 | ``` 67 | 68 | If you want to update your pulled models run this: 69 | ``` 70 | ollama list | tail -n +2 | awk '{print $1}' | while read -r model; do 71 | ollama pull $model 72 | done 73 | ``` 74 | 75 | Install cc_net 76 | ```bash 77 | sudo apt-get install build-essential libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev zip unzip -y 78 | pip install -e . 79 | ``` 80 | 81 | ## Running the Validator 82 | Note (from bittensor docs): the validator needs to serve an Axon with their IP or they may be blacklisted by the firewall of serving peers on the network. 83 | 84 | If you want to properly serve your Axon you need to change --axon.port from 70000 to a real one. 85 | 86 | ```bash 87 | pm2 start run.sh --name llm_detection_validators_autoupdate -- --wallet.name YOUR_COLDKEY --wallet.hotkey YOUR_HOTKEY --axon.port 70000 --neuron.device cuda:0 88 | ``` 89 | 90 | -------------------------------------------------------------------------------- /docs/vision_and_roadmap.md: -------------------------------------------------------------------------------- 1 | # Vision & Roadmap 2 | 3 | At the moment, many subnets have tasks for which they have implemented SOTA models in their miner codes to instantly achieve high quality. For such tasks, implementing better solutions could give miners only basis points of improvement. Almost no room to grow. 4 | 5 | But our subnet is different. AI detection is a hard task to achieve high quality. That's why we aimed not just make "marketplace for inference SOTA models" as other subnets did but rather to create a constantly evolving environment where miners have to get better over time and not just run the same models for months. 6 | 7 | In order to implement such an environment, we need to do the following. 8 | 9 | ## Validators 10 | 11 | Currently, validators use one large dataset with human data and two models (mistral and vicuna) to generate AI texts. What could be done to improve that: 12 | 13 | 0. Use softmax on miners' scores for higher miners motivation 14 | 1. Add more models. By increasing the number and diversity of models, we will improve the overall quality of detection 15 | 2. Add more languages 16 | 3. 
Paraphrasing of AI texts 17 | 4. Make it resilient to tricks and attacks 18 | 5. Various types of text: differentiate articles/comments/posts/etc., in order to improve quality on each distinct type 19 | 6. Save all data that validator generates into cloud to make an open-source dataset in future 20 | 21 | ## Miners 22 | 23 | Generally speaking, improving miners is not our task. Miners should get better themselves. But there are a few things we can do to help them: 24 | 25 | 1. Host testnet validators so miners can start without wasting TAO. 26 | 2. Make leaderboard and local dataset: we will list miners' metrics and allow people who want to start mining to evaluate their solution on a local dataset to compare them with existing ones before going to the mainnet. 27 | 3. Create Kaggle competition to introduce some of the best ML engineers to our subnet and make them run their top solution on-chain. 28 | 4. Despite the fact that solving LLM detection is a miner's problem, we are going to continue our own researches in this field to improve baseline solution and increase overall subnet's quality. 29 | ## Applications 30 | 31 | One of the important tasks for us as subnet owners is to apply the subnet for real usage. Given the relevance of the problem, there is clearly a request for such solutions. That’s what we’re going to do: 32 | 33 | ### Web service 34 | We’ve already developed an MVP version of a website for our subnet, where you can write some texts and then get miners' predictions with probability of this text to be ai-generated. But we’re going to develop a full version of web service, which will provide users even outside bittensor community availability to detect ai-generated texts. 35 | 36 | ### Twitter extension 37 | Today, X/Twitter is among the top 6 social networking apps in the United States. And boasts over 500 million users worldwide. With the rapid growth of Large Language Models like ChatGpt and more and more content on the internet are generated by them. 38 | We’re going to build an extension for twitter, which will mark tweets and comments that you’re reading with ai-generated/human-written tags based on miners predictions from the subnet, so that people can know what content is qualitative and which texts are just auto-generated. 39 | 40 | ### Browser extension 41 | We also found it very useful to have an ability to instantly check whether some peace of text that you’re reading is ai-generated or human-written, so one of the application that we want to develop is a browser extension, with which users can just highlight some text and see a probability of this text to be ai-generated. 42 | 43 | ### API 44 | As mentioned above we’re going to develop several applications based on our subnet, but there are of course many more use cases for llm-detection in particular situations/businesses. So, we are also going to provide an API service that can be used by developers for their own integrations or for making predictions on a big amount of text (for example by AI engineers to clean up their datasets). 45 | 46 | ### Commerce 47 | All of the mentioned above services will have their own subscription plans to commercialize SN32. They will be based on api, which will be run by validators to provide access for miners and on which validators will be able to earn additional money. 48 | 49 | By commercializing our product, we will become less reliant on emissions and start gaining real usage. 
Also, by the time when dynamic tao is introduced and validators' emission becomes zero, our token will already have great utility, and validators will be earning from the mentioned services. 50 | -------------------------------------------------------------------------------- /docs/what_are_subnets.md: -------------------------------------------------------------------------------- 1 | # What is Bittensor? 2 | Bittensor is a network where computers validate the work that other computers contribute to the network - the work what is most valuable to the collective will be rewarded 3 | 4 | Bittensor is a catalyst to the open-source developers and smaller AI research labs now have a financial incentive for fine-tuning open foundational models 5 | 6 | Bittensor is a library of machine intelligence that continuously grows and shares knowledge amongst peers 7 | 8 | # What is a subnet? 9 | 10 | Bittensor is releasing its own language for creating incentive mechanisms. This allows developers to build incentive systems on Bittensor, tapping into our web of intelligence to develop markets of the developer’s choosings 11 | 12 | Subnet 1, an incentive system for machine intelligence production, showcases the enormous potential of markets to procure huge amounts of resources. Releasing user-created subnets is set to create a cambrian explosion of additional resources into the Bittensor ecosystem 13 | 14 | # Why should you care? 15 | 16 | As an open-source developer, you now have the ability to write your own incentive mechanisms without creating an entirely new chain. By tapping into Bittensor’s network of intelligence, you can incentivize AI models from all over the world to perform tasks of your choosing (i.e., image generation, storage, compute access, etc.) - the possibilities are truly endless 17 | 18 | The release of subnets also offers the potential to pull these tools into a shared network, making all the ingredients necessary to create intelligence available within one network, governed by one token 19 | 20 | You get to play a vital role in helping bootstrap what could one day become one of the most powerful networks in the world - and you make money by doing so! 21 | 22 | By incentivizing developers to create their own markets, Bittensor is set to become a one-stop-shop for those seeking all the compute requirements for building unstoppable applications on top of an incentivized infrastructure 23 | 24 | # Deeper dive 25 | Check out the Bittensor about page [here](https://bittensor.com/about) for more details about what the bittensor paradigm is and why subnets are revolutionary technology. 26 | 27 | Also see our [linktree](https://linktr.ee/opentensor) for more information. -------------------------------------------------------------------------------- /min_compute.yml: -------------------------------------------------------------------------------- 1 | # Use this document to specify the minimum compute requirements. 2 | # This document will be used to generate a list of recommended hardware for your subnet. 3 | 4 | # This is intended to give a rough estimate of the minimum requirements 5 | # so that the user can make an informed decision about whether or not 6 | # they want to run a miner or validator on their machine. 
7 | 8 | # NOTE: Specification for miners may be different from validators 9 | 10 | version: '1.0' # update this version key as needed, ideally should match your release version 11 | 12 | compute_spec: 13 | 14 | miner: 15 | 16 | cpu: 17 | min_cores: 4 # Minimum number of CPU cores 18 | min_speed: 2.5 # Minimum speed per core (GHz) 19 | recommended_cores: 8 # Recommended number of CPU cores 20 | recommended_speed: 3.5 # Recommended speed per core (GHz) 21 | architecture: "x86_64" # Architecture type (e.g., x86_64, arm64) 22 | 23 | gpu: 24 | required: True # Does the application require a GPU? 25 | min_vram: 24 # Minimum GPU VRAM (GB) 26 | recommended_vram: 24 # Recommended GPU VRAM (GB) 27 | cuda_cores: 1024 # Minimum number of CUDA cores (if applicable) 28 | min_compute_capability: 6.0 # Minimum CUDA compute capability 29 | recommended_compute_capability: 7.0 # Recommended CUDA compute capability 30 | recommended_gpu: "NVIDIA RTX 4090" # provide a recommended GPU to purchase/rent 31 | 32 | memory: 33 | min_ram: 24 # Minimum RAM (GB) 34 | min_swap: 4 # Minimum swap space (GB) 35 | recommended_swap: 8 # Recommended swap space (GB) 36 | ram_type: "DDR4" # RAM type (e.g., DDR4, DDR3, etc.) 37 | 38 | storage: 39 | min_space: 20 # Minimum free storage space (GB) 40 | recommended_space: 100 # Recommended free storage space (GB) 41 | type: "SSD" # Preferred storage type (e.g., SSD, HDD) 42 | min_iops: 1000 # Minimum I/O operations per second (if applicable) 43 | recommended_iops: 5000 # Recommended I/O operations per second 44 | 45 | os: 46 | name: "Ubuntu" # Name of the preferred operating system(s) 47 | version: 22.04 # Version of the preferred operating system(s) 48 | 49 | validator: 50 | 51 | cpu: 52 | min_cores: 4 # Minimum number of CPU cores 53 | min_speed: 2.5 # Minimum speed per core (GHz) 54 | recommended_cores: 8 # Recommended number of CPU cores 55 | recommended_speed: 3.5 # Recommended speed per core (GHz) 56 | architecture: "x86_64" # Architecture type (e.g., x86_64, arm64) 57 | 58 | gpu: 59 | required: True # Does the application require a GPU? 60 | min_vram: 80 # Minimum GPU VRAM (GB) 61 | recommended_vram: 80 # Recommended GPU VRAM (GB) 62 | cuda_cores: 1024 # Minimum number of CUDA cores (if applicable) 63 | min_compute_capability: 6.0 # Minimum CUDA compute capability 64 | recommended_compute_capability: 7.0 # Recommended CUDA compute capability 65 | recommended_gpu: "NVIDIA A100" # provide a recommended GPU to purchase/rent 66 | 67 | memory: 68 | min_ram: 80 # Minimum RAM (GB) 69 | min_swap: 4 # Minimum swap space (GB) 70 | recommended_swap: 8 # Recommended swap space (GB) 71 | ram_type: "DDR4" # RAM type (e.g., DDR4, DDR3, etc.) 
72 | 73 | storage: 74 | min_space: 1000 # Minimum free storage space (GB) 75 | recommended_space: 1000 # Recommended free storage space (GB) 76 | type: "SSD" # Preferred storage type (e.g., SSD, HDD) 77 | min_iops: 1000 # Minimum I/O operations per second (if applicable) 78 | recommended_iops: 5000 # Recommended I/O operations per second 79 | 80 | os: 81 | name: "Ubuntu" # Name of the preferred operating system(s) 82 | version: 22.04 # Version of the preferred operating system(s) 83 | 84 | network_spec: 85 | bandwidth: 86 | download: 100 # Minimum download bandwidth (Mbps) 87 | upload: 20 # Minimum upload bandwidth (Mbps) 88 | -------------------------------------------------------------------------------- /models/ppl_model.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/models/ppl_model.pk -------------------------------------------------------------------------------- /neurons/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/neurons/__init__.py -------------------------------------------------------------------------------- /neurons/miners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/neurons/miners/__init__.py -------------------------------------------------------------------------------- /neurons/miners/deberta_classifier.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding 4 | from torch.utils.data import Dataset 5 | from tqdm import tqdm 6 | 7 | 8 | class SimpleTestDataset(Dataset): 9 | def __init__(self, strings, tokenizer, max_sequence_length): 10 | self.Strings = strings 11 | self.Tokenizer = tokenizer 12 | self.MaxSequenceLength = max_sequence_length 13 | 14 | def __len__(self): 15 | return len(self.Strings) 16 | 17 | def __getitem__(self, idx): 18 | string = self.Strings[idx].strip() 19 | token_ids = self.Tokenizer(string, max_length=self.MaxSequenceLength, truncation=True).input_ids 20 | 21 | return { 22 | 'input_ids': token_ids, 23 | } 24 | 25 | 26 | def GeneratePredictions(model, tokenizer, test_dataset, device): 27 | data_loader = torch.utils.data.DataLoader( 28 | test_dataset, 29 | batch_size=4, 30 | shuffle=False, 31 | num_workers=1, 32 | collate_fn=DataCollatorWithPadding(tokenizer)) 33 | 34 | all_predictions = [] 35 | with torch.no_grad(): 36 | for batch in data_loader: 37 | token_sequences = batch.input_ids.to(device) 38 | attention_masks = batch.attention_mask.to(device) 39 | 40 | with torch.cuda.amp.autocast(): 41 | raw_predictions = model(token_sequences, attention_masks).logits 42 | 43 | scaled_predictions = raw_predictions.softmax(dim = 1)[:,1] 44 | all_predictions.append(scaled_predictions.cpu().numpy()) 45 | 46 | all_predictions = np.concatenate(all_predictions) 47 | 48 | return all_predictions 49 | 50 | 51 | class DebertaClassifier: 52 | def __init__(self, foundation_model_path, model_path, device): 53 | self.tokenizer = AutoTokenizer.from_pretrained(foundation_model_path) 54 | self.max_length = 1024 55 | self.device = device 56 | 57 | model = 
AutoModelForSequenceClassification.from_pretrained( 58 | foundation_model_path, 59 | state_dict=torch.load(model_path), 60 | attention_probs_dropout_prob=0, 61 | hidden_dropout_prob=0).to(device) 62 | 63 | self.model = model.eval() 64 | 65 | def predict_batch(self, texts): 66 | test_dataset = SimpleTestDataset(texts, self.tokenizer, self.max_length) 67 | return GeneratePredictions(self.model, self.tokenizer, test_dataset, self.device) 68 | 69 | def __call__(self, text): 70 | return self.predict_batch([text])[0] 71 | -------------------------------------------------------------------------------- /neurons/miners/ppl_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from sklearn.linear_model import LogisticRegression 4 | from tqdm import tqdm 5 | import pickle 6 | import numpy as np 7 | import bittensor as bt 8 | 9 | import logging 10 | logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR) 11 | 12 | 13 | class PPLModel: 14 | def __init__(self, device="cuda", model_id="microsoft/phi-2"): 15 | self.device = device 16 | self.model_id = model_id 17 | self.model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device) 18 | self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) 19 | 20 | self.max_length = 512 #self.model.config.n_positions 21 | self.stride = 512 22 | self.logreg = LogisticRegression(class_weight='balanced') 23 | 24 | def __call__(self, text): 25 | ppl = self.getPPL(text) 26 | if ppl is None: 27 | # print('None ppl') 28 | bt.logging.info('Got none ppl on text: {}'.format(text)) 29 | return 0 30 | 31 | features = [(100 - ppl) / 100] 32 | return self.logreg.predict_proba([features])[0][1] 33 | 34 | def predict_batch(self, texts): 35 | preds = [] 36 | for text in texts: 37 | preds.append(self.__call__(text)) 38 | return preds 39 | 40 | def fit(self, X, y): 41 | features = [] 42 | mask = [] 43 | for text in tqdm(X): 44 | ppl = self.getPPL(text) 45 | ppl = (100 - ppl) / 100 if ppl is not None else None 46 | features.append(ppl) 47 | mask.append(ppl is not None) 48 | 49 | features = np.array(features) 50 | mask = np.array(mask) 51 | print("Number of not-none ppl: {}".format(mask.sum())) 52 | 53 | features = features[mask] 54 | y = y[mask] 55 | self.logreg.fit(features.reshape(-1, 1), y) 56 | 57 | def save(self, path): 58 | with open(path, 'wb') as f: 59 | pickle.dump(self.logreg, f) 60 | 61 | def load_pretrained(self, path): 62 | with open(path, 'rb') as f: 63 | self.logreg = pickle.load(f) 64 | 65 | def getPPL(self, text): 66 | text = '.'.join(text.split('.')[:30]) 67 | encodings = self.tokenizer(text, return_tensors="pt") 68 | seq_len = encodings.input_ids.size(1) 69 | 70 | nlls = [] 71 | likelihoods = [] 72 | prev_end_loc = 0 73 | for begin_loc in range(0, seq_len, self.stride): 74 | end_loc = min(begin_loc + self.max_length, seq_len) 75 | trg_len = end_loc - prev_end_loc 76 | input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device) 77 | target_ids = input_ids.clone() 78 | target_ids[:, :-trg_len] = -100 79 | 80 | with torch.no_grad(): 81 | loss = self.model(input_ids, labels=target_ids).loss 82 | neg_log_likelihood = loss * trg_len 83 | likelihoods.append(neg_log_likelihood) 84 | 85 | nlls.append(neg_log_likelihood) 86 | 87 | prev_end_loc = end_loc 88 | if end_loc == seq_len: 89 | break 90 | 91 | if torch.isnan(torch.Tensor(nlls)).any() or len(nlls) == 0: 92 | return None 
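# Each window above contributes loss * trg_len to nlls, so summing them and dividing by end_loc (the total number of tokens processed) gives the average negative log-likelihood per token; the perplexity computed below is exp() of that average, truncated to an int.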
93 | 94 | ppl = int(torch.exp(torch.stack(nlls).sum() / end_loc)) 95 | return ppl 96 | 97 | 98 | if __name__ == '__main__': 99 | model = PPLModel(device='cpu') 100 | model.load_pretrained('neurons/miners/ppl_model.pk') 101 | text = 'Hello world, i am here' 102 | res = model(text) 103 | print(res) 104 | -------------------------------------------------------------------------------- /prompting/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | # Define the version of the template module. 19 | __version__ = "1.1.3" 20 | version_split = __version__.split(".") 21 | __spec_version__ = ( 22 | (10000 * int(version_split[0])) 23 | + (100 * int(version_split[1])) 24 | + (1 * int(version_split[2])) 25 | ) 26 | 27 | # Import all submodules. 28 | from . import tasks 29 | from . import tools 30 | from . import agent 31 | from . import conversation 32 | from . import llm 33 | -------------------------------------------------------------------------------- /prompting/agent.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 
17 | import textwrap 18 | import time 19 | import bittensor as bt 20 | from dataclasses import asdict 21 | from prompting.tasks import Task 22 | from prompting.llm import HuggingFaceLLM 23 | from prompting.cleaners.cleaner import CleanerPipeline 24 | 25 | from prompting.persona import Persona, create_persona 26 | 27 | from transformers import Pipeline 28 | 29 | 30 | class HumanAgent(HuggingFaceLLM): 31 | "Agent that impersonates a human user and makes queries based on its goal." 32 | 33 | @property 34 | def progress(self): 35 | return int(self.task.complete) 36 | 37 | @property 38 | def finished(self): 39 | return self.progress == 1 40 | 41 | system_prompt_template = textwrap.dedent( 42 | """This is a roleplaying game where you are impersonating {mood} human user with a specific persona. As a human, you are using AI assistant to {desc} related to {topic} ({subtopic}) in a {tone} tone. You don't need to greet the assistant or be polite, unless this is part of your persona. The spelling and grammar of your messages should also reflect your persona. 43 | 44 | Your singular focus is to use the assistant to {goal}: {query} 45 | """ 46 | ) 47 | 48 | def __init__( 49 | self, 50 | task: Task, 51 | llm_pipeline: Pipeline, 52 | system_template: str = None, 53 | persona: Persona = None, 54 | begin_conversation=True, 55 | ): 56 | if persona is None: 57 | persona = create_persona() 58 | 59 | self.persona = persona 60 | self.task = task 61 | self.llm_pipeline = llm_pipeline 62 | 63 | if system_template is not None: 64 | self.system_prompt_template = system_template 65 | 66 | self.system_prompt = self.system_prompt_template.format( 67 | mood=self.persona.mood, 68 | tone=self.persona.tone, 69 | **self.task.__state_dict__(), 70 | ) 71 | 72 | super().__init__( 73 | llm_pipeline=llm_pipeline, 74 | system_prompt=self.system_prompt, 75 | max_new_tokens=256, 76 | ) 77 | 78 | if begin_conversation: 79 | bt.logging.debug("🤖 Generating challenge query...") 80 | # initiates the conversation with the miner 81 | self.challenge = self.create_challenge() 82 | 83 | def create_challenge(self) -> str: 84 | """Creates the opening question of the conversation which is based on the task query but dressed in the persona of the user.""" 85 | t0 = time.time() 86 | 87 | cleaner = None 88 | if hasattr(self.task, "cleaning_pipeline"): 89 | cleaner = CleanerPipeline(cleaning_pipeline=self.task.cleaning_pipeline) 90 | 91 | self.challenge = super().query( 92 | message="Ask a question related to your goal", cleaner=cleaner 93 | ) 94 | self.challenge = self.task.format_challenge(self.challenge) 95 | self.challenge_time = time.time() - t0 96 | 97 | return self.challenge 98 | 99 | def __state_dict__(self, full=False): 100 | return { 101 | "challenge": self.challenge, 102 | "challenge_time": self.challenge_time, 103 | **self.task.__state_dict__(full=full), 104 | **asdict(self.persona), 105 | "system_prompt": self.system_prompt, 106 | } 107 | 108 | def __str__(self): 109 | return self.system_prompt 110 | 111 | def __repr__(self): 112 | return str(self) 113 | 114 | def continue_conversation(self, miner_response: str): 115 | # Generates response to miner response 116 | self.query(miner_response) 117 | # Updates current prompt with new state of conversation 118 | # self.prompt = self.get_history_prompt() 119 | 120 | def update_progress( 121 | self, top_reward: float, top_response: str, continue_conversation=False 122 | ): 123 | if top_reward > self.task.reward_threshold: 124 | self.task.complete = True 125 | self.messages.append({"content": 
top_response, "role": "user"}) 126 | 127 | bt.logging.debug("Agent finished its goal") 128 | return 129 | 130 | if continue_conversation: 131 | bt.logging.debug( 132 | "↪ Agent did not finish its goal, continuing conversation..." 133 | ) 134 | self.continue_conversation(miner_response=top_response) 135 | -------------------------------------------------------------------------------- /prompting/cleaners/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/prompting/cleaners/__init__.py -------------------------------------------------------------------------------- /prompting/cleaners/all_cleaners.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import bittensor as bt 3 | import re 4 | 5 | 6 | class BaseCleaner(ABC): 7 | @abstractmethod 8 | def __init__(self, **kwargs): 9 | pass 10 | 11 | @abstractmethod 12 | def apply(self, generation: str) -> str: 13 | pass 14 | 15 | 16 | class RemoveQuotes(BaseCleaner): 17 | def __init__(self, **kwargs) -> None: 18 | pass 19 | 20 | def apply(self, generation: str) -> str: 21 | bt.logging.debug("Pruning unfinished sentence.") 22 | return generation.strip("\"'") 23 | 24 | 25 | class PruneEnding(BaseCleaner): 26 | def __init__(self, **kwargs): 27 | pass 28 | 29 | def apply(self, generation: str) -> str: 30 | punctuation_chars = [".", "?", "!"] 31 | 32 | if not any(char in generation for char in punctuation_chars): 33 | return generation 34 | 35 | if ( 36 | not generation.endswith(".") 37 | and not generation.endswith("?") 38 | and not generation.endswith("!") 39 | ): 40 | index = max(generation.rfind(char) for char in punctuation_chars) 41 | return generation[ 42 | : index + 1 43 | ] # Go to the index of where the punctuation is, and include it (+1) 44 | else: 45 | return generation 46 | 47 | 48 | class RemoveRoles(BaseCleaner): 49 | def __init__(self, **kwargs): 50 | pass 51 | 52 | def capitalize_sentences(self, input_string): 53 | """capitalize the first character after .!?""" 54 | sentences = re.split(r"(?<=[.!?])\s+", input_string) 55 | capitalized_sentences = [sentence.capitalize() for sentence in sentences] 56 | result_string = " ".join(capitalized_sentences) 57 | return result_string 58 | 59 | def apply(self, generation: str) -> str: 60 | roles = [ 61 | "User: ", 62 | "System: ", 63 | "Assistant: ", 64 | "Assistant, ", 65 | "Dear AI, ", 66 | "Dear AI ", 67 | "#Question: ", 68 | ] 69 | for role in roles: 70 | if role in generation: 71 | generation = generation.replace(role, "") 72 | 73 | return self.capitalize_sentences( 74 | input_string=generation 75 | ) # LLMs are good at being formal. Do the same if we remove a prefix. 76 | -------------------------------------------------------------------------------- /prompting/cleaners/cleaner.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | import bittensor as bt 4 | 5 | from prompting.cleaners.all_cleaners import RemoveQuotes, RemoveRoles, PruneEnding 6 | 7 | SUPPORTED_CLEANERS = { 8 | "remove_quotes": RemoveQuotes, 9 | "remove_roles": RemoveRoles, 10 | "prune_ending": PruneEnding, 11 | } 12 | 13 | 14 | class CleanerPipeline: 15 | def __init__(self, cleaning_pipeline: List[Dict]) -> None: 16 | """CleanerPipeline is a pipeline that can be applied to any string to 17 | clean it of unwanted characters, punctuation, etc. 
18 | 19 | cleaning_pipeline (List[Dict]): List of Dicts that define the cleaning pipeline. 20 | Dictionaries MUST have the keyword "name" to be valid. 21 | Example: [{"name": "remove_quotes", "kwargs": {}}, {"name": "prune_ending", "kwargs": {}}] 22 | 23 | """ 24 | self.cleaning_pipeline = cleaning_pipeline 25 | 26 | def apply(self, generation: str) -> str: 27 | """Apply cleaning steps to generation listed in cleaning_pipeline. 28 | 29 | Args: 30 | generation (str): string generated from LLM or otherwise. 31 | Returns: 32 | str: Clean generated string. 33 | """ 34 | try: 35 | for cleaner in self.cleaning_pipeline: 36 | if "name" not in cleaner or cleaner["name"] not in SUPPORTED_CLEANERS: 37 | raise ValueError( 38 | f"Cleaning pipeline step {cleaner} must have a name, or must be in SUPPORTED_CLEANERS." 39 | ) 40 | 41 | func = SUPPORTED_CLEANERS[cleaner["name"]] 42 | 43 | kwargs = cleaner.get("kwargs", {}) 44 | func = func(**kwargs) # instantiate the cleaner with the kwargs 45 | 46 | # apply all the filters for the specific task. 47 | generation = func.apply(generation=generation) 48 | 49 | return generation 50 | 51 | except Exception as E: 52 | bt.logging.error( 53 | f"Failed to apply cleaning pipeline {cleaner['name']}. {E}," 54 | ) 55 | return generation 56 | -------------------------------------------------------------------------------- /prompting/conversation.py: -------------------------------------------------------------------------------- 1 | from prompting.tasks import ( 2 | Task, 3 | DebuggingTask, 4 | QuestionAnsweringTask, 5 | SummarizationTask, 6 | MathTask, 7 | DateQuestionAnsweringTask, 8 | ) 9 | from prompting.tools import ( 10 | WikiDataset, 11 | HFCodingDataset, 12 | MathDataset, 13 | WikiDateDataset, 14 | ) 15 | 16 | from transformers import Pipeline 17 | 18 | 19 | def create_task(llm_pipeline: Pipeline, task_name: str) -> Task: 20 | wiki_based_tasks = ["summarization", "qa"] 21 | coding_based_tasks = ["debugging"] 22 | # TODO Add math and date_qa to this structure 23 | 24 | # TODO: Abstract dataset classes into common dynamic interface 25 | if task_name in wiki_based_tasks: 26 | dataset = WikiDataset() 27 | 28 | elif task_name in coding_based_tasks: 29 | dataset = HFCodingDataset() 30 | 31 | elif task_name == "math": 32 | dataset = MathDataset() 33 | 34 | elif task_name == "date_qa": 35 | dataset = WikiDateDataset() 36 | 37 | if task_name == "summarization": 38 | task = SummarizationTask(llm_pipeline=llm_pipeline, context=dataset.next()) 39 | 40 | elif task_name == "qa": 41 | task = QuestionAnsweringTask(llm_pipeline=llm_pipeline, context=dataset.next()) 42 | 43 | elif task_name == "debugging": 44 | task = DebuggingTask(llm_pipeline=llm_pipeline, context=dataset.next()) 45 | 46 | elif task_name == "math": 47 | task = MathTask(llm_pipeline=llm_pipeline, context=dataset.next()) 48 | 49 | elif task_name == "date_qa": 50 | task = DateQuestionAnsweringTask( 51 | llm_pipeline=llm_pipeline, context=dataset.next() 52 | ) 53 | 54 | else: 55 | raise ValueError(f"Task {task_name} not supported. 
Please choose a valid task") 56 | 57 | return task 58 | -------------------------------------------------------------------------------- /prompting/llm.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import time 19 | 20 | from typing import List, Dict 21 | import bittensor as bt 22 | 23 | from transformers import Pipeline, pipeline 24 | from prompting.mock import MockPipeline 25 | 26 | from prompting.cleaners.cleaner import CleanerPipeline 27 | 28 | 29 | def load_pipeline( 30 | model_id, device=None, torch_dtype=None, mock=False, model_kwargs: dict = None 31 | ): 32 | """Loads the HuggingFace pipeline for the LLM, or a mock pipeline if mock=True""" 33 | 34 | if mock or model_id == "mock": 35 | return MockPipeline(model_id) 36 | 37 | if not device.startswith("cuda"): 38 | bt.logging.warning("Only crazy people run this on CPU. 
It is not recommended.") 39 | 40 | # model_kwargs torch type definition conflicts with pipeline torch_dtype, so we need to differentiate them 41 | if model_kwargs is None: 42 | llm_pipeline = pipeline( 43 | "text-generation", 44 | model=model_id, 45 | device=device, 46 | torch_dtype=torch_dtype, 47 | ) 48 | else: 49 | llm_pipeline = pipeline( 50 | "text-generation", 51 | model=model_id, 52 | device_map=device, 53 | model_kwargs=model_kwargs, 54 | ) 55 | 56 | return llm_pipeline 57 | 58 | 59 | class HuggingFaceLLM: 60 | def __init__( 61 | self, 62 | llm_pipeline: Pipeline, 63 | system_prompt, 64 | max_new_tokens=256, 65 | do_sample=True, 66 | temperature=0.7, 67 | top_k=50, 68 | top_p=0.95, 69 | ): 70 | self.llm_pipeline = llm_pipeline 71 | self.system_prompt = system_prompt 72 | self.kwargs = dict( 73 | do_sample=do_sample, 74 | temperature=temperature, 75 | top_k=top_k, 76 | top_p=top_p, 77 | max_new_tokens=max_new_tokens, 78 | ) 79 | 80 | self.messages = [{"content": self.system_prompt, "role": "system"}] 81 | self.times = [0] 82 | 83 | def query( 84 | self, 85 | message: List[Dict[str, str]], 86 | role: str = "user", 87 | disregard_system_prompt: bool = False, 88 | cleaner: CleanerPipeline = None, 89 | ): 90 | messages = self.messages + [{"content": message, "role": role}] 91 | 92 | if disregard_system_prompt: 93 | messages = messages[1:] 94 | 95 | tbeg = time.time() 96 | response = self.forward(messages=messages) 97 | 98 | if cleaner is not None: 99 | clean_response = cleaner.apply(generation=response) 100 | if clean_response != response: 101 | bt.logging.debug( 102 | f"Response cleaned, chars removed: {len(response) - len(clean_response)}..." 103 | ) 104 | response = clean_response 105 | 106 | self.messages = messages + [{"content": response, "role": "assistant"}] 107 | self.times = self.times + [0, time.time() - tbeg] 108 | 109 | return response 110 | 111 | def __call__(self, messages: List[Dict[str, str]]): 112 | return self.forward(messages=messages) 113 | 114 | def _make_prompt(self, messages: List[Dict[str, str]]): 115 | return self.llm_pipeline.tokenizer.apply_chat_template( 116 | messages, tokenize=False, add_generation_prompt=True 117 | ) 118 | 119 | def forward(self, messages: List[Dict[str, str]], preformat_messages: bool = False): 120 | prompt = self._make_prompt(messages) 121 | outputs = self.llm_pipeline(prompt, **self.kwargs) 122 | response = outputs[0]["generated_text"] 123 | 124 | response = response.replace(prompt, "").strip() 125 | 126 | bt.logging.debug( 127 | f"{self.__class__.__name__} generated the following output:\n{response}" 128 | ) 129 | return response 130 | -------------------------------------------------------------------------------- /prompting/mock.py: -------------------------------------------------------------------------------- 1 | import time 2 | import uuid 3 | import torch 4 | import asyncio 5 | import random 6 | import bittensor as bt 7 | 8 | from typing import List 9 | 10 | 11 | class MockTokenizer: 12 | def __init__(self): 13 | super().__init__() 14 | 15 | self.role_expr = "<|mock-{role}|>" 16 | 17 | def apply_chat_template(self, messages, **kwargs): 18 | prompt = "" 19 | for m in messages: 20 | role = self.role_expr.format(role=m["role"]) 21 | content = m["content"] 22 | prompt += f"<|mock-{role}|> {content}\n" 23 | 24 | return "\n".join(prompt) 25 | 26 | 27 | class MockModel(torch.nn.Module): 28 | def __init__(self, phrase): 29 | super().__init__() 30 | 31 | self.tokenizer = MockTokenizer() 32 | self.phrase = phrase 33 | 34 | def 
__call__(self, messages): 35 | return self.forward(messages) 36 | 37 | def forward(self, messages): 38 | role_tag = self.tokenizer.role_expr.format(role="assistant") 39 | return f"{role_tag} {self.phrase}" 40 | 41 | 42 | class MockPipeline: 43 | @property 44 | def tokenizer(self): 45 | return self.model.tokenizer 46 | 47 | def __init__( 48 | self, 49 | phrase="Mock llm output", 50 | model_kwargs=None, 51 | ): 52 | super().__init__() 53 | 54 | self.model_kwargs = model_kwargs or {} 55 | self.model = MockModel(phrase) 56 | 57 | def __repr__(self): 58 | return f"{self.__class__.__name__}(phrase={self.model.phrase})" 59 | 60 | def __call__(self, messages, **kwargs): 61 | return self.forward(messages, **kwargs) 62 | 63 | def forward(self, messages, **kwargs): 64 | output = self.model(messages) 65 | return self.postprocess(output) 66 | 67 | def postprocess(self, output, **kwargs): 68 | output = output.split(self.model.tokenizer.role_expr.format(role="assistant"))[ 69 | -1 70 | ].strip() 71 | return [{"generated_text": output}] 72 | 73 | def preprocess(self, **kwargs): 74 | pass 75 | 76 | 77 | class MockSubtensor(bt.MockSubtensor): 78 | def __init__(self, netuid, n=16, wallet=None): 79 | 80 | super().__init__() 81 | # reset the underlying subtensor state 82 | self.chain_state = None 83 | self.setup() 84 | 85 | if not self.subnet_exists(netuid): 86 | self.create_subnet(netuid) 87 | 88 | # Register ourself (the validator) as a neuron at uid=0 89 | if wallet is not None: 90 | self.force_register_neuron( 91 | netuid=netuid, 92 | hotkey=wallet.hotkey.ss58_address, 93 | coldkey=wallet.coldkey.ss58_address, 94 | balance=100000, 95 | stake=100000, 96 | ) 97 | 98 | # Register n mock neurons who will be miners 99 | for i in range(1, n + 1): 100 | self.force_register_neuron( 101 | netuid=netuid, 102 | hotkey=f"miner-hotkey-{i}", 103 | coldkey="mock-coldkey", 104 | balance=100000, 105 | stake=100000, 106 | ) 107 | 108 | 109 | class MockMetagraph(bt.metagraph): 110 | 111 | default_ip = "127.0.0.0" 112 | default_port = 8091 113 | 114 | def __init__(self, netuid=1, network="mock", subtensor=None): 115 | super().__init__(netuid=netuid, network=network, sync=False) 116 | 117 | if subtensor is not None: 118 | self.subtensor = subtensor 119 | self.sync(subtensor=subtensor) 120 | 121 | for axon in self.axons: 122 | axon.ip = self.default_ip 123 | axon.port = self.default_port 124 | 125 | 126 | class MockDendrite(bt.dendrite): 127 | """ 128 | Replaces a real bittensor network request with a mock request that just returns some static completion for all axons that are passed and adds some random delay. 
129 | """ 130 | 131 | min_time: float = 0 132 | max_time: float = 1 133 | 134 | def __init__(self, wallet): 135 | super().__init__(wallet) 136 | 137 | async def forward( 138 | self, 139 | axons: List[bt.axon], 140 | synapse: bt.Synapse = bt.Synapse(), 141 | timeout: float = 12, 142 | deserialize: bool = True, 143 | run_async: bool = True, 144 | streaming: bool = False, 145 | ): 146 | 147 | if streaming: 148 | raise NotImplementedError("Streaming not implemented yet.") 149 | 150 | async def query_all_axons(streaming: bool): 151 | """Queries all axons for responses.""" 152 | 153 | async def single_axon_response(i, axon): 154 | """Queries a single axon for a response.""" 155 | 156 | t0 = time.time() 157 | s = synapse.copy() 158 | # Attach some more required data so it looks real 159 | s = self.preprocess_synapse_for_request(axon, s, timeout) 160 | # We just want to mock the response, so we'll just fill in some data 161 | process_time = ( 162 | random.random() * (self.max_time - self.min_time) + self.min_time 163 | ) 164 | await asyncio.sleep(process_time) 165 | if process_time < timeout: 166 | # Update the status code and status message of the dendrite to match the axon 167 | s.completion = f"Mock miner completion {i}" 168 | s.dendrite.status_code = 200 169 | s.dendrite.status_message = "OK" 170 | else: 171 | s.completion = "" 172 | s.dendrite.status_code = 408 173 | s.dendrite.status_message = "Timeout" 174 | 175 | s.dendrite.process_time = str(time.time() - t0) 176 | 177 | # Return the updated synapse object after deserializing if requested 178 | if deserialize: 179 | return s.deserialize() 180 | else: 181 | return s 182 | 183 | return await asyncio.gather( 184 | *( 185 | single_axon_response(i, target_axon) 186 | for i, target_axon in enumerate(axons) 187 | ) 188 | ) 189 | 190 | return await query_all_axons(streaming) 191 | 192 | def __str__(self) -> str: 193 | """ 194 | Returns a string representation of the Dendrite object. 195 | 196 | Returns: 197 | str: The string representation of the Dendrite object in the format "dendrite()". 198 | """ 199 | return "MockDendrite({})".format(self.keypair.ss58_address) 200 | -------------------------------------------------------------------------------- /prompting/persona.py: -------------------------------------------------------------------------------- 1 | import random 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class Persona: 7 | profile: str 8 | mood: str 9 | tone: str 10 | 11 | 12 | def create_persona() -> Persona: 13 | """Defines the persona of the user. This is used to create the system prompt. It dictates the style of the agent's questions and communication.""" 14 | profiles = [ 15 | "student", 16 | "teacher", 17 | "parent", 18 | "hacker", 19 | "programmer", 20 | "scientist", 21 | ] 22 | # profiles = ["16 year old highschool student", ... 23 | 24 | # TODO: more terse, less verbose 25 | mood = [ 26 | "an interested", 27 | "a concerned", 28 | "an impatient", 29 | "a tired", 30 | "a confused", 31 | "an annoyed", 32 | "a curious", 33 | "an upbeat", 34 | "a lazy", 35 | ] 36 | tone = [ 37 | "formal", 38 | "informal", 39 | "indifferent", 40 | "casual", 41 | "rushed", 42 | "polite", 43 | "impolite", 44 | "friendly", 45 | "unfriendly", 46 | "positive", 47 | "negative", 48 | ] 49 | # TODO: we can lower case the human messages, add common grammar and spelling mistakes... 
50 | 51 | return Persona( 52 | profile=random.choice(profiles), 53 | mood=random.choice(mood), 54 | tone=random.choice(tone), 55 | ) 56 | -------------------------------------------------------------------------------- /prompting/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .task import Task 2 | from .debugging import DebuggingTask 3 | from .summarization import SummarizationTask 4 | from .qa import QuestionAnsweringTask 5 | from .date_qa import DateQuestionAnsweringTask 6 | from .generic_instruction import GenericInstructionTask 7 | from .math import MathTask 8 | 9 | 10 | TASKS = { 11 | "qa": QuestionAnsweringTask, 12 | "summarization": SummarizationTask, 13 | "date_qa": DateQuestionAnsweringTask, 14 | "debugging": DebuggingTask, 15 | "math": MathTask, 16 | } 17 | -------------------------------------------------------------------------------- /prompting/tasks/date_qa.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from prompting.tasks import Task 3 | from prompting.cleaners.cleaner import CleanerPipeline 4 | 5 | 6 | SECTION_MESSAGES = {"Births": " was born ", "Deaths": " died ", "Events": " "} 7 | 8 | 9 | @dataclass 10 | class DateQuestionAnsweringTask(Task): 11 | 12 | name = "date-based question answering" 13 | desc = "get help answering a specific date-based question" 14 | goal = "to get the answer to the following date-based question" 15 | reward_definition = [ 16 | dict(name="date", weight=1.0), 17 | ] 18 | penalty_definition = [] 19 | cleaning_pipeline = [ 20 | dict(name="remove_quotes"), 21 | dict(name="remove_roles"), 22 | ] 23 | static_reference = True 24 | static_query = True 25 | 26 | def __init__(self, llm_pipeline, context, create_reference=True): 27 | 28 | self.context = context 29 | 30 | self.query = ( 31 | context.content + SECTION_MESSAGES[context.topic] + "on what exact date?" 32 | ) 33 | self.reference = self.context.title.replace("_", " ") + ", " + context.subtopic 34 | 35 | self.topic = context.title 36 | self.subtopic = context.topic 37 | self.tags = context.tags 38 | -------------------------------------------------------------------------------- /prompting/tasks/debugging.py: -------------------------------------------------------------------------------- 1 | import random 2 | import bittensor as bt 3 | from dataclasses import dataclass 4 | from prompting.tasks import Task 5 | import difflib 6 | 7 | 8 | def corrupt( 9 | code, 10 | n_remove=0, 11 | n_swap=0, 12 | seed=None, 13 | sep=" ", 14 | min_length=1, 15 | max_length=10, 16 | remove_comment_lines=False, 17 | ): 18 | """ 19 | Corrupt a piece of code by removing and/or swapping chunks of it. 20 | TODO: Ignore comments and strings(?) when corrupting the code. 21 | 22 | Args: 23 | code (str): The code to corrupt. 24 | n_remove (int): The number of chunks to remove. 25 | n_swap (int): The number of chunks to swap. 26 | seed (int): The random seed to use. 27 | sep (str): The separator to use when splitting the code into chunks. Recommended values are '', ' ', '\n'. 28 | min_length (int): The minimum length of a chunk. 29 | max_length (int): The maximum length of a chunk. 30 | """ 31 | 32 | # set seed for reproducibility 33 | random.seed(seed) 34 | 35 | assert n_remove + n_swap > 0, "Must specify at least one corruption type." 36 | 37 | def remove(code, n, sep=" ", min_length=1, max_length=10): 38 | """Remove n random chunks from the code. 
Chunks can be characters, words, or lines.""" 39 | 40 | chunks = code.split(sep) if sep else list(code) 41 | 42 | # select n random chunks to remove 43 | indices = random.sample( 44 | [ 45 | i 46 | for i, chunk in enumerate(chunks) 47 | if min_length <= len(chunk) <= max_length 48 | ], 49 | n, 50 | ) 51 | bt.logging.info( 52 | f"Removing the following {len(indices)} chunks: {[chunks[i] for i in indices]} at indices {indices}" 53 | ) 54 | 55 | return sep.join([chunk for i, chunk in enumerate(chunks) if i not in indices]) 56 | 57 | def swap(code, sep=" ", min_length=1, max_length=10): 58 | """Swap two random chunks in the code. Chunks can be characters, words, or lines.""" 59 | chunks = code.split(sep) if sep else list(code) 60 | 61 | # select 2 random chunks to swap 62 | indices = random.sample( 63 | [ 64 | i 65 | for i, chunk in enumerate(chunks) 66 | if min_length <= len(chunk) <= max_length 67 | ], 68 | 2, 69 | ) 70 | 71 | bt.logging.info( 72 | f"Swapping chunk {chunks[indices[0]]!r} at index {indices[0]} with chunk {chunks[indices[1]]!r} at index {indices[1]}" 73 | ) 74 | 75 | chunks[indices[0]], chunks[indices[1]] = ( 76 | chunks[indices[1]], 77 | chunks[indices[0]], 78 | ) 79 | 80 | return sep.join(chunks) 81 | 82 | # Do this at your peril. It doesn't catch multiline comments or strings. 83 | if remove_comment_lines: 84 | code = "\n".join( 85 | [ 86 | line 87 | for line in code.splitlines() 88 | if not line.strip().startswith(("#", "//")) 89 | ] 90 | ) 91 | 92 | # spread n corruptions across the code 93 | for i in range(n_remove): 94 | code = remove(code, n=1, sep=sep, min_length=min_length, max_length=max_length) 95 | for i in range(n_swap): 96 | code = swap(code, sep=sep, min_length=min_length, max_length=max_length) 97 | 98 | return code 99 | 100 | 101 | def diff(query, reference): 102 | """Get the diff between two strings.""" 103 | return "\n".join(difflib.unified_diff(query.splitlines(), reference.splitlines())) 104 | 105 | 106 | @dataclass 107 | class DebuggingTask(Task): 108 | 109 | name = "debugging" 110 | desc = "get help with debugging" 111 | goal = "ask for help fixing broken code."
112 | 113 | reward_definition = [dict(name="diff", weight=1.0)] 114 | 115 | penalty_definition = [] 116 | 117 | static_reference = True 118 | static_query = True 119 | 120 | def __init__(self, llm_pipeline, context, create_reference=True): 121 | 122 | self.context = context 123 | 124 | # No LLM involved in generating the query, we just apply some language-independent corruption to the code 125 | self.query = corrupt( 126 | context.content, 127 | n_remove=random.randint(1, 3), 128 | n_swap=random.randint(0, 2), 129 | sep=random.choices(["", " ", "\n"], weights=[0.3, 0.6, 0.1], k=1)[0], 130 | ) 131 | self.reference = context.content 132 | self.delimiter = "```" 133 | self.topic = context.title 134 | self.subtopic = context.subtopic 135 | self.tags = context.tags 136 | 137 | def format_challenge(self, challenge): 138 | return f"{challenge}\n{self.delimiter}\n{self.query}\n{self.delimiter}" 139 | -------------------------------------------------------------------------------- /prompting/tasks/generic_instruction.py: -------------------------------------------------------------------------------- 1 | import re 2 | import bittensor as bt 3 | from dataclasses import dataclass 4 | from tenacity import retry, stop_after_attempt 5 | from prompting.tasks import Task 6 | from typing import Tuple 7 | 8 | CRITERIA_GENERATION_PROMPT = """\ 9 | We are brainstorming criteria with which to grade a language model on its responses in 10 | diverse situations. 11 | A ‘criteria‘ is some useful, real-world objective, and associated rubric for scores 1-5, that 12 | tests a capability. 13 | 14 | Please brainstorm a new criteria and scoring rubrics. 15 | Be creative and create new but useful criteria that people in different settings or industries 16 | might find practical. 17 | Please format the output as same as the above examples with no extra or surrounding text. 18 | Write [END] after you are done. 19 | New Criteria: 20 | """ 21 | 22 | 23 | INSTRUCTION_GENERATION_PROMPT = """\ 24 | Your job is to generate a new novel problem and a response that is related to the given score 25 | rubric. 26 | The score rubric: 27 | {CRITERIA} 28 | * Problem 29 | - The problem should inherently be related to the score criteria and score rubric given above. 30 | Specifically, the score criteria should be the core attributes required to solve the problem. 31 | - The problem itself should not be too generic or easy to solve. 32 | - If the score rubric is related to logical abilities, generate problems that require math or 33 | coding abilities. 34 | - Try to make the person who might solve the problem not notice the existence of the score 35 | rubric by not explicitly mentioning it, and also provide additional inputs and options if 36 | needed. 37 | - Assume a situation where a user is interacting with an AI model. The user would try to 38 | ask in a first-person point of view, but not using terms like ”I”, ”A User” or ”You” in the 39 | first sentence. 40 | - Do not give a role to the AI, assume that the user is asking a question from his point of 41 | view. 42 | - Do not include any phrase related to AI model in the problem. 43 | * Response 44 | - The response should be a response that would get a score of 5 from the score rubric. 45 | - The response should be as detailed as possible unless the score rubric is related to 46 | conciseness or brevity. It should consist of multiple paragraphs, a list of items, or a 47 | step-by-step reasoning process. 
48 | - The response should look like how a well-prompted GPT-4 would normally answer your 49 | problem. 50 | * Format 51 | - DO NOT WRITE ANY GREETING MESSAGES, just write the problem and response 52 | only. 53 | - In front of the problem, append the phrase ”Problem:” and in front of the response, append 54 | the phrase ”Response:”. 55 | - Write in the order of ”Problem” - ”Response”, where the two items are separated by the 56 | phrase ”[NEXT]”. 57 | - Write [END] after you are done. 58 | Data Generation: 59 | """ 60 | 61 | 62 | @dataclass 63 | class GenericInstructionTask(Task): 64 | reward_definition = [ 65 | dict(name="rouge", ngram="rouge-1", metric="f", weight=1.0), 66 | dict(name="relevance", threshold=None, weight=1.0), 67 | ] 68 | 69 | def __init__(self, llm_pipeline): 70 | super().__init__( 71 | name="generic_instruction", 72 | goal="to get the answer to a instruction", 73 | delimiter="```", 74 | reward_types=[ 75 | "CRITERIA_REWARD", 76 | ], 77 | reward_threshold=0.5, 78 | use_challenge_as_prompt=True, 79 | desc="", 80 | topics={}, 81 | topic="", 82 | subtopic="", 83 | challenge="", 84 | reference="", 85 | criteria="", 86 | ) 87 | 88 | self.criteria = self.create_criteria(llm_pipeline) 89 | instruction, reference = self.create_instruction_and_reference(llm_pipeline) 90 | self.challenge = instruction 91 | self.reference = reference 92 | 93 | def extract_instruction_and_reference_from_text(self, text: str) -> Tuple[str, str]: 94 | # Split the text into problem and response using regular expression 95 | split_text = re.split(r"\nResponse:\n", text) 96 | 97 | # Extract problem and response 98 | problem = split_text[0].strip() 99 | response = split_text[1].strip() 100 | 101 | return problem, response 102 | 103 | def create_criteria(self, llm) -> str: 104 | bt.logging.debug("🎲 Creating a generic criteria-scoring rubric ...") 105 | 106 | # Generate a score rubric with defined criterias 107 | criteria_generation_response = llm(CRITERIA_GENERATION_PROMPT) 108 | return criteria_generation_response 109 | 110 | @retry(stop=stop_after_attempt(5)) 111 | def create_instruction_and_reference(self, llm) -> Tuple[str, str]: 112 | try: 113 | bt.logging.debug("📋 🎯 Creating instruction and referece text...") 114 | 115 | if not self.criteria: 116 | raise ValueError( 117 | "Criteria must be defined before creating a generic instruction." 118 | ) 119 | 120 | # Create generic instruction with the score rubric 121 | instruction_generation_prompt_with_criteria = ( 122 | INSTRUCTION_GENERATION_PROMPT.format(CRITERIA=self.criteria) 123 | ) 124 | instruction_generation_response = llm( 125 | instruction_generation_prompt_with_criteria 126 | ) 127 | 128 | # Extract generic instruction and reference response from the generated text 129 | ( 130 | instruction, 131 | reference, 132 | ) = self.extract_instruction_and_reference_from_text( 133 | instruction_generation_response 134 | ) 135 | 136 | return instruction, reference 137 | except Exception as e: 138 | bt.logging.error( 139 | f"Failed to create instruction and reference text: {e}, retrying..." 
140 | ) 141 | raise e 142 | -------------------------------------------------------------------------------- /prompting/tasks/math.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import bittensor as bt 3 | from dataclasses import dataclass 4 | from prompting.tasks import Task 5 | 6 | 7 | @dataclass 8 | class MathTask(Task): 9 | 10 | name = "math" 11 | desc = "get help solving a math problem" 12 | goal = "to get the answer to the following math question" 13 | 14 | reward_definition = [ 15 | dict(name="float_diff", weight=1.0), 16 | ] 17 | penalty_definition = [] 18 | 19 | static_reference = True 20 | static_query = True 21 | 22 | def __init__(self, llm_pipeline, context, create_reference=True): 23 | 24 | self.context = context 25 | 26 | self.query = ( 27 | "How can I solve the following problem, " 28 | + context.content 29 | + "? Make sure to include the whole problem when you ask your question." 30 | ) 31 | self.reference = context.extra["solution"] 32 | self.topic = context.title 33 | self.subtopic = context.topic 34 | self.tags = context.tags 35 | -------------------------------------------------------------------------------- /prompting/tasks/qa.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from prompting.tasks import Task 3 | 4 | # TODO: introduce criteria for the query and reference answer (length, layout, etc.) and make these arguments 5 | # TODO 6 | 7 | # Used to instruct the LLM to provide a good query when given a context 8 | QUERY_SYSTEM_PROMPT = """\ 9 | You are a question-generating expert, focusing on delivering comprehensive and accurate questions with depth and clarity. The questions you generate should be based on the context that is provided. 10 | You will maintain a neutral tone in your questions. 11 | You will adhere to a word limit of 50 words for each question. 12 | """ 13 | 14 | # Used to obtain the query (which is a question about the context) 15 | QUERY_PROMPT_TEMPLATE = """\ 16 | Ask a specific question about the following context: 17 | 18 | #Context: 19 | {context} 20 | """ 21 | 22 | # Used to instruct the LLM to provide a good answer to the query when given a context 23 | REFERENCE_SYSTEM_PROMPT = """\ 24 | You are a question-answering expert, focusing on delivering comprehensive and accurate responses with depth and clarity. 25 | You will maintain a neutral tone in your explanations. 26 | You will adhere to a word limit of 150 words for each response. Where applicable, include references to credible sources to support your answers. 27 | """ 28 | 29 | # Used to obtain reference answer 30 | REFERENCE_PROMPT_TEMPLATE = """\ 31 | Answer the question you will receive in detail, utilizing the following context. 
32 | 33 | #Context: 34 | {context} 35 | 36 | # Question: 37 | {question} 38 | """ 39 | 40 | 41 | @dataclass 42 | class QuestionAnsweringTask(Task): 43 | 44 | name = "question-answering" 45 | desc = "get help on answering a question" 46 | goal = "to get the answer to the following question" 47 | 48 | reward_definition = [ 49 | dict(name="rouge", ngram="rouge-1", metric="f", weight=0.5), 50 | dict(name="relevance", weight=0.5), 51 | ] 52 | penalty_definition = [ 53 | dict(name="rouge", ngram="rouge-1", metric="f", weight=0.5), 54 | ] 55 | 56 | cleaning_pipeline = [ 57 | dict(name="remove_quotes"), 58 | dict(name="prune_ending"), 59 | dict(name="remove_roles"), 60 | ] 61 | 62 | def __init__(self, llm_pipeline, context, create_reference=True): 63 | 64 | self.context = context 65 | 66 | self.query_system_prompt = QUERY_SYSTEM_PROMPT 67 | self.query_prompt = QUERY_PROMPT_TEMPLATE.format(context=context.content) 68 | self.query = self.generate_query(llm_pipeline) 69 | 70 | self.reference_system_prompt = REFERENCE_SYSTEM_PROMPT 71 | self.reference_prompt = REFERENCE_PROMPT_TEMPLATE.format( 72 | context=context.content, question=self.query 73 | ) 74 | if create_reference: 75 | self.reference = self.generate_reference(llm_pipeline) 76 | 77 | self.topic = context.title 78 | self.subtopic = context.topic 79 | self.tags = context.tags 80 | -------------------------------------------------------------------------------- /prompting/tasks/summarization.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from prompting.tasks import Task 3 | from transformers import Pipeline 4 | 5 | 6 | # TODO: introduce criteria for the query and reference answer (length, layout, etc.) and make these arguments 7 | 8 | # TODO: Also add a query system prompt and a query prompt template 9 | # TODO: Add the option to generate the summary query from the context. e.g. "the childhood of Abraham Lincoln" which is more specific than summarizing the entire article (Abraham Lincoln) 10 | 11 | # Used to instruct the LLM to provide a good answer to the query when given a context 12 | SUMMARIZATION_SYSTEM_PROMPT = """\ 13 | You are a summarization AI assistant. You make excellent and concise summaries that adhere to your given instructions. 14 | You will maintain a neutral tone in your summaries. 15 | You will adhere to a word limit of 250 words for each response. 16 | """ 17 | 18 | # Used to obtain reference answer 19 | REFERENCE_PROMPT_TEMPLATE = """\ 20 | Summarize the following context in a concise and accurate manner: 21 | 22 | ## Context 23 | {context} 24 | """ 25 | 26 | 27 | @dataclass 28 | class SummarizationTask(Task): 29 | 30 | name = "summarization" 31 | desc = "get help with summarization" 32 | goal = "summarize the following topic" 33 | 34 | reward_definition = [ 35 | dict(name="rouge", ngram="rouge-l", metric="f", weight=0.5), 36 | dict(name="relevance", weight=0.5), 37 | ] 38 | penalty_definition = [dict(name="rouge", ngram="rouge-1", metric="f", weight=0.5)] 39 | 40 | # This is where you define cleaning procedures for the generation. 41 | # Can be used when wanting to clean the challenge. 
42 | cleaning_pipeline = [ 43 | dict(name="remove_quotes"), 44 | dict(name="prune_ending"), 45 | dict(name="remove_roles"), 46 | ] 47 | 48 | static_query = True 49 | 50 | def __init__(self, llm_pipeline: Pipeline, context: str, create_reference=True): 51 | 52 | self.context = context 53 | 54 | # Query is just the article title and section name 55 | self.query = context.title + ", " + context.topic 56 | 57 | self.reference_system_prompt = SUMMARIZATION_SYSTEM_PROMPT 58 | self.reference_prompt = REFERENCE_PROMPT_TEMPLATE.format( 59 | context=context.content 60 | ) 61 | if create_reference: 62 | self.reference = self.generate_reference(llm_pipeline) 63 | 64 | self.topic = context.title 65 | self.subtopic = context.topic 66 | self.tags = context.tags 67 | -------------------------------------------------------------------------------- /prompting/tasks/task.py: -------------------------------------------------------------------------------- 1 | import time 2 | import bittensor as bt 3 | from abc import ABC 4 | from dataclasses import dataclass, asdict 5 | from enum import Enum 6 | from typing import List, Union, Dict 7 | from prompting.llm import HuggingFaceLLM 8 | from transformers import Pipeline 9 | from prompting.cleaners.cleaner import CleanerPipeline 10 | 11 | 12 | class TaskEvaluationType(Enum): 13 | REWARD_STACK = "reward" 14 | FILTER_STACK = "filter" 15 | PENALTY_STACK = "penalty" 16 | SIMILARITY_STACK = "similarity" 17 | RELEVANCE_STACK = "relevance" 18 | 19 | 20 | @dataclass 21 | class Task(ABC): 22 | # topics: dict 23 | name: str 24 | desc: str 25 | goal: str 26 | query: str 27 | topic: str 28 | subtopic: str 29 | tags: List[str] 30 | context: dict 31 | reward_definition: List[dict] 32 | penalty_definition: List[dict] = None 33 | reward_threshold: float = 0.0 34 | reference: Union[str, List[str]] = "" 35 | criteria: str = ("",) 36 | delimiter: str = "" 37 | complete: bool = False 38 | static_reference: bool = False 39 | static_query: bool = False 40 | reference_system_prompt = "" 41 | reference_prompt = "" 42 | query_system_prompt = "" 43 | query_prompt = "" 44 | cleaner = None 45 | 46 | def __str__(self): 47 | return f"{self.__class__.__name__}(name={self.name!r}, desc={self.desc!r}, goal={self.goal!r}, query={self.query!r}, reference={self.reference!r}, topic={self.topic!r}, subtopic={self.subtopic!r}, tags={self.tags!r})" 48 | 49 | def __repr__(self): 50 | return str(self) 51 | 52 | def __state_dict__(self, full=False): 53 | state = { 54 | "task": self.name, 55 | "desc": self.desc, 56 | "goal": self.goal, 57 | "query": self.query, # For now we just use the raw query but should add delimiters again 58 | "query_time": getattr(self, "query_time", 0), 59 | "reference": self.reference, 60 | "reference_time": getattr(self, "reference_time", 0), 61 | "topic": self.topic, 62 | "subtopic": self.subtopic, 63 | "context_time": self.context.stats.get("fetch_time", 0.0), 64 | } 65 | if full: 66 | state.update(asdict(self.context)) 67 | 68 | return state 69 | 70 | def generate(self, system: str, prompt: str, llm: Pipeline, clean=True) -> str: 71 | """Uses the llm to generate a response to a prompt""" 72 | 73 | cleaner = ( 74 | CleanerPipeline(cleaning_pipeline=self.cleaning_pipeline) if clean else None 75 | ) 76 | return HuggingFaceLLM(llm, system_prompt=system).query( 77 | message=prompt, cleaner=cleaner 78 | ) 79 | 80 | def generate_reference(self, llm: Pipeline, clean=True) -> str: 81 | """Generates a reference answer to be used for scoring miner completions""" 82 | t0 = time.time() 83 | if 
not self.static_reference: 84 | bt.logging.debug("🤖 Generating reference...") 85 | 86 | self.reference = self.generate( 87 | system=self.reference_system_prompt, 88 | prompt=self.reference_prompt, 89 | llm=llm, 90 | clean=clean, 91 | ) 92 | 93 | self.reference_time = time.time() - t0 94 | return self.reference 95 | 96 | def generate_query(self, llm: Pipeline, clean=True) -> str: 97 | """Generates a query to be used for generating the challenge""" 98 | t0 = time.time() 99 | if not self.static_query: 100 | bt.logging.debug("🤖 Generating query...") 101 | self.query = self.generate( 102 | system=self.query_system_prompt, 103 | prompt=self.query_prompt, 104 | llm=llm, 105 | clean=clean, 106 | ) 107 | 108 | self.query_time = time.time() - t0 109 | return self.query 110 | 111 | def format_challenge(self, challenge) -> str: 112 | """Formats the challenge to be used for the conversation""" 113 | return challenge 114 | -------------------------------------------------------------------------------- /prompting/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import ( 2 | Context, 3 | Dataset, 4 | MockDataset, 5 | HFCodingDataset, 6 | WikiDataset, 7 | StackOverflowDataset, 8 | WikiDateDataset, 9 | MathDataset, 10 | ) 11 | from .selector import Selector 12 | -------------------------------------------------------------------------------- /prompting/tools/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .context import Context 2 | from .base import Dataset 3 | from .code import HFCodingDataset, StackOverflowDataset 4 | from .math import MathDataset 5 | from .mock import MockDataset 6 | from .wiki import WikiDataset, WikiDateDataset 7 | -------------------------------------------------------------------------------- /prompting/tools/datasets/base.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | # Copyright © 2023 Opentensor Foundation 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 11 | # the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 14 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 
18 | 19 | import time 20 | from abc import ABC, abstractmethod 21 | from typing import Dict 22 | import bittensor as bt 23 | 24 | from ..selector import Selector 25 | from .context import Context 26 | from prompting.utils.exceptions import MaxRetryError 27 | 28 | 29 | class Dataset(ABC): 30 | """Base class for datasets.""" 31 | 32 | max_tries: int = 10 33 | 34 | @abstractmethod 35 | def search(self, name): ... 36 | 37 | @abstractmethod 38 | def random(self, name): ... 39 | 40 | @abstractmethod 41 | def get(self, name): ... 42 | 43 | def next( 44 | self, method: str = "random", selector: Selector = Selector(), **kwargs 45 | ) -> Dict: 46 | tries = 1 47 | t0 = time.time() 48 | 49 | while True: 50 | 51 | # TODO: Multithread the get method so that we don't have to suffer nonexistent pages 52 | info = {} 53 | if method == "random": 54 | info = self.random(selector=selector, **kwargs) 55 | elif method == "search": 56 | info = self.search(selector=selector, **kwargs) 57 | elif method == "get": 58 | info = self.get(selector=selector, **kwargs) 59 | else: 60 | raise ValueError(f"Unknown dataset get method {method!r}") 61 | 62 | if info: 63 | break 64 | 65 | bt.logging.debug( 66 | f"Could not find any samples which meet {self.__class__.__name__} requirements after {tries} tries. Retrying... ({self.max_tries - tries} tries remaining.)" 67 | ) 68 | 69 | tries += 1 70 | if tries >= self.max_tries: 71 | raise MaxRetryError( 72 | f"Could not find any samples which meet {self.__class__.__name__} requirements after {tries} tries." 73 | ) 74 | 75 | info["stats"] = { 76 | "creator": self.__class__.__name__, 77 | "fetch_time": time.time() - t0, 78 | "num_tries": tries, 79 | "fetch_method": method, 80 | "next_kwargs": kwargs, 81 | } 82 | return Context(**info) 83 | -------------------------------------------------------------------------------- /prompting/tools/datasets/context.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from dataclasses import dataclass 3 | 4 | 5 | @dataclass 6 | class Context: 7 | 8 | # TODO: Pydantic model 9 | title: str 10 | topic: str 11 | subtopic: str 12 | content: str 13 | internal_links: List[str] 14 | external_links: List[str] 15 | source: str 16 | tags: List[str] = None 17 | extra: dict = None # additional non-essential information 18 | stats: dict = None # retrieval stats such as fetch time, number of tries, etc. 19 | -------------------------------------------------------------------------------- /prompting/tools/datasets/math.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 Yuma Rao 3 | # Copyright © 2023 Opentensor Foundation 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 6 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 8 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 11 | # the Software. 
12 | 13 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 14 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 15 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 16 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 17 | # DEALINGS IN THE SOFTWARE. 18 | 19 | import time 20 | import random 21 | import itertools 22 | import mathgenerator 23 | import bittensor as bt 24 | from sympy.parsing.latex import parse_latex 25 | from typing import Dict, Union, List, Tuple 26 | 27 | 28 | from .base import Dataset 29 | from ..selector import Selector 30 | 31 | 32 | class MathDataset(Dataset): 33 | topics_list = mathgenerator.getGenList() 34 | 35 | def __init__(self, seed=None): 36 | 37 | self.seed = seed 38 | self.rng = random.Random(seed) 39 | 40 | def get( 41 | self, 42 | name: str, 43 | selector: Selector = None, 44 | include: List = None, 45 | exclude: List = None, 46 | **kwargs, 47 | ) -> Dict: 48 | """Get a math problem. 49 | 50 | Args: 51 | name (str): Name of math problem to generate. 52 | selector (Selector, optional): Selector instance to choose a specific problem. Defaults to None. 53 | include (List, optional): _description_. Defaults to None. 54 | exclude (List, optional): _description_. Defaults to None. 55 | 56 | Returns: 57 | Dict: _description_ 58 | """ 59 | bt.logging.debug(f"Getting math problem {name!r}") 60 | info = mathgenerator.generate_context(name, **kwargs) 61 | if info["reward_type"] != "float": 62 | return None 63 | 64 | math_words = [ 65 | "math", 66 | "mathematics", 67 | "mathematical", 68 | "math problem", 69 | "math technique", 70 | ] 71 | external_links = [] 72 | # construct external links from randomly shuffled trigrams containing 2 words from the problem and 1 random math word 73 | # binary_to_decimal -> ['binary to', 'to decimal'] 74 | for bigram in itertools.combinations(info["forward_words"], 2): 75 | words = list(bigram) + [random.choice(math_words)] 76 | # shuffle the words e.g. 
['binary', 'decimal', 'math problem'] -> 'decimal binary math problem' 77 | external_links.append(" ".join(random.sample(words, len(words)))) 78 | 79 | return { 80 | "title": info["topic"], # title of math problem 81 | "topic": info["topic"], # title of problem topic 82 | "subtopic": info["subtopic"], # title of problem subtopic 83 | "content": info["problem"], # problem statement 84 | "internal_links": [info["topic"], info["subtopic"]], # internal links 85 | "external_links": external_links, 86 | "tags": info["forward_words"], 87 | "source": "Mathgenerator", 88 | "extra": {"reward_type": info["reward_type"], "solution": info["solution"]}, 89 | } 90 | 91 | def search( 92 | self, name, selector: Selector, include: List = None, exclude: List = None 93 | ) -> Dict: 94 | raise NotImplementedError( 95 | f"Search is not implemented for {self.__class__.__name__}" 96 | ) 97 | 98 | def random(self, selector: Selector, **kwargs): 99 | """Create a random math problem.""" 100 | return self.get(name=None, selector=selector, **kwargs) 101 | -------------------------------------------------------------------------------- /prompting/tools/datasets/mock.py: -------------------------------------------------------------------------------- 1 | from .base import Dataset 2 | 3 | # from ..selector import Selector 4 | 5 | 6 | class MockDataset(Dataset): 7 | 8 | def get(self, name, exclude=None, selector=None): 9 | return { 10 | "title": name, 11 | "topic": "Physics", 12 | "subtopic": "Quantum_mechanics", 13 | "content": f"{name} is a fraud. All of physics is a lie, the universe is a hologram, buy gold, bye!", 14 | "internal_links": [ 15 | "Quantum_mechanics", 16 | "General_relativity", 17 | "Special_relativity", 18 | "String_theory", 19 | ], 20 | "external_links": ["Einstein", "Bohr", "Feynman", "Hawking"], 21 | "tags": ["fraud", "hologram", "gold"], 22 | "source": "Mockpedia", 23 | "extra": {"solution": "religion"}, 24 | } 25 | 26 | def search(self, name, exclude=None, selector=None): 27 | return self.get(name) 28 | 29 | def random(self, name="Physics", exclude=None, selector=None): 30 | return self.get(name) 31 | -------------------------------------------------------------------------------- /prompting/tools/selector.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | class Selector: 5 | def __init__(self, seed=None): 6 | self.seed = seed 7 | self.rng = random.Random(seed) 8 | 9 | def __call__(self, items, weights=None): 10 | return self.rng.choices(items, weights=weights)[0] 11 | 12 | 13 | class PageRankSelector(Selector): 14 | """Preferentially chooses the items at the top of the list, under the assumption that they are more important.""" 15 | 16 | def __init__(self, seed=None, alpha=0.85): 17 | super().__init__(seed) 18 | self.alpha = alpha 19 | 20 | def __call__(self, items): 21 | weights = [self.alpha**i for i in range(len(items))] 22 | return self.rng.choices(items, weights=weights)[0] 23 | 24 | 25 | class SimilaritySelector(Selector): 26 | """Chooses the item most similar to the query.""" 27 | 28 | def __init__(self, seed=None, similarity_fn=None): 29 | super().__init__(seed) 30 | self.similarity_fn = similarity_fn 31 | 32 | def __call__(self, query, items): 33 | return max(items, key=lambda item: self.similarity_fn(query, item)) 34 | 35 | 36 | class TopSelector(Selector): 37 | """Chooses the top item.""" 38 | 39 | def __init__(self, seed=None): 40 | super().__init__(seed) 41 | 42 | def __call__(self, items): 43 | return items[0] 44 | 45 | 
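# Illustrative note on PageRankSelector above: with the default alpha of 0.85, the
# sampling weights for a four-item list decay as [1.0, 0.85, 0.7225, 0.614125], so
# earlier items are proportionally more likely to be drawn. For example:
#
#   ranked = PageRankSelector(seed=42)
#   likely_top_item = ranked(["first", "second", "third", "fourth"])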
46 | if __name__ == "__main__": 47 | 48 | selector = Selector(seed=42) 49 | items = range(10) 50 | item = selector(items) 51 | 52 | assert item in items, "Selector should return one of the items" 53 | -------------------------------------------------------------------------------- /prompting/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/prompting/utils/__init__.py -------------------------------------------------------------------------------- /prompting/utils/exceptions.py: -------------------------------------------------------------------------------- 1 | class MaxRetryError(Exception): 2 | """Exception raised when the maximum number of retries is exceeded.""" 3 | 4 | def __init__(self, message="Maximum number of retries exceeded"): 5 | self.message = message 6 | super().__init__(self.message) 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bittensor==9.0.2 2 | datasets==2.18.0 3 | langchain_ollama==0.1.3 4 | loguru==0.7.0 5 | numpy==2.0.2 6 | pandas==2.2.3 7 | pydantic==2.10.3 8 | pylatexenc==2.10 9 | Requests==2.31.0 10 | rich==13.7.1 11 | scikit_learn==1.5.2 12 | setuptools==70.0.0 13 | torch==2.4.1 14 | tqdm==4.66.2 15 | transformers==4.36.0 16 | nltk==3.8.1 17 | wandb==0.17.2 18 | hf_transfer 19 | gdown 20 | zstandard==0.22.0 21 | pyspellchecker==0.8.1 22 | symspellpy==6.7.7 23 | typo==0.1.7 24 | scalecodec 25 | fasttext-numpy2==0.10.4 26 | accelerate==1.0.1 27 | bittensor-cli==9.0.2 28 | 29 | bs4 30 | pre-commit==3.3.2 31 | sentencepiece 32 | tenacity 33 | wikipedia 34 | wikipedia_sections -------------------------------------------------------------------------------- /scripts/check_compatibility.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | echo "Please provide a Python version as an argument." 5 | exit 1 6 | fi 7 | 8 | python_version="$1" 9 | all_passed=true 10 | 11 | GREEN='\033[0;32m' 12 | YELLOW='\033[0;33m' 13 | RED='\033[0;31m' 14 | NC='\033[0m' # No Color 15 | 16 | check_compatibility() { 17 | all_supported=0 18 | 19 | while read -r requirement; do 20 | # Skip lines starting with git+ 21 | if [[ "$requirement" == git+* ]]; then 22 | continue 23 | fi 24 | 25 | package_name=$(echo "$requirement" | awk -F'[!=<>]' '{print $1}' | awk -F'[' '{print $1}') # Strip off brackets 26 | echo -n "Checking $package_name... " 27 | 28 | url="https://pypi.org/pypi/$package_name/json" 29 | response=$(curl -s $url) 30 | status_code=$(curl -s -o /dev/null -w "%{http_code}" $url) 31 | 32 | if [ "$status_code" != "200" ]; then 33 | echo -e "${RED}Information not available for $package_name. 
Failure.${NC}" 34 | all_supported=1 35 | continue 36 | fi 37 | 38 | classifiers=$(echo "$response" | jq -r '.info.classifiers[]') 39 | requires_python=$(echo "$response" | jq -r '.info.requires_python') 40 | 41 | base_version="Programming Language :: Python :: ${python_version%%.*}" 42 | specific_version="Programming Language :: Python :: $python_version" 43 | 44 | if echo "$classifiers" | grep -q "$specific_version" || echo "$classifiers" | grep -q "$base_version"; then 45 | echo -e "${GREEN}Supported${NC}" 46 | elif [ "$requires_python" != "null" ]; then 47 | if echo "$requires_python" | grep -Eq "==$python_version|>=$python_version|<=$python_version"; then 48 | echo -e "${GREEN}Supported${NC}" 49 | else 50 | echo -e "${RED}Not compatible with Python $python_version due to constraint $requires_python.${NC}" 51 | all_supported=1 52 | fi 53 | else 54 | echo -e "${YELLOW}Warning: Specific version not listed, assuming compatibility${NC}" 55 | fi 56 | done < requirements.txt 57 | 58 | return $all_supported 59 | } 60 | 61 | echo "Checking compatibility for Python $python_version..." 62 | check_compatibility 63 | if [ $? -eq 0 ]; then 64 | echo -e "${GREEN}All requirements are compatible with Python $python_version.${NC}" 65 | else 66 | echo -e "${RED}Some requirements are NOT compatible with Python $python_version.${NC}" 67 | all_passed=false 68 | fi 69 | 70 | echo "" 71 | if $all_passed; then 72 | echo -e "${GREEN}All tests passed.${NC}" 73 | else 74 | echo -e "${RED}Not all tests passed.${NC}" 75 | exit 1 76 | fi 77 | -------------------------------------------------------------------------------- /scripts/check_requirements_changes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if requirements files have changed in the last commit 4 | if git diff --name-only HEAD~1 | grep -E 'requirements.txt'; then 5 | echo "Requirements files have changed. Running compatibility checks..." 6 | echo 'export REQUIREMENTS_CHANGED="true"' >> $BASH_ENV 7 | else 8 | echo "Requirements files have not changed. Skipping compatibility checks..." 9 | echo 'export REQUIREMENTS_CHANGED="false"' >> $BASH_ENV 10 | fi 11 | -------------------------------------------------------------------------------- /scripts/install_staging.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Section 1: Build/Install 4 | # This section is for first-time setup and installations. 5 | 6 | install_dependencies() { 7 | # Function to install packages on macOS 8 | install_mac() { 9 | which brew > /dev/null 10 | if [ $? -ne 0 ]; then 11 | echo "Installing Homebrew..." 12 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 13 | fi 14 | echo "Updating Homebrew packages..." 15 | brew update 16 | echo "Installing required packages..." 17 | brew install make llvm curl libssl protobuf tmux 18 | } 19 | 20 | # Function to install packages on Ubuntu/Debian 21 | install_ubuntu() { 22 | echo "Updating system packages..." 23 | sudo apt update 24 | echo "Installing required packages..." 25 | sudo apt install --assume-yes make build-essential git clang curl libssl-dev llvm libudev-dev protobuf-compiler tmux 26 | } 27 | 28 | # Detect OS and call the appropriate function 29 | if [[ "$OSTYPE" == "darwin"* ]]; then 30 | install_mac 31 | elif [[ "$OSTYPE" == "linux-gnu"* ]]; then 32 | install_ubuntu 33 | else 34 | echo "Unsupported operating system."
35 | exit 1 36 | fi 37 | 38 | # Install rust and cargo 39 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 40 | 41 | # Update your shell's source to include Cargo's path 42 | source "$HOME/.cargo/env" 43 | } 44 | 45 | # Call install_dependencies only if it's the first time running the script 46 | if [ ! -f ".dependencies_installed" ]; then 47 | install_dependencies 48 | touch .dependencies_installed 49 | fi 50 | 51 | 52 | # Section 2: Test/Run 53 | # This section is for running and testing the setup. 54 | 55 | # Create a coldkey for the owner role 56 | wallet=${1:-owner} 57 | 58 | # Logic for setting up and running the environment 59 | setup_environment() { 60 | # Clone subtensor and enter the directory 61 | if [ ! -d "subtensor" ]; then 62 | git clone https://github.com/opentensor/subtensor.git 63 | fi 64 | cd subtensor 65 | git pull 66 | 67 | # Update to the nightly version of rust 68 | ./scripts/init.sh 69 | 70 | cd ../bittensor-subnet-template 71 | 72 | # Install the bittensor-subnet-template python package 73 | python -m pip install -e . 74 | 75 | # Create and set up wallets 76 | # This section can be skipped if wallets are already set up 77 | if [ ! -f ".wallets_setup" ]; then 78 | btcli wallet new_coldkey --wallet.name $wallet --no_password --no_prompt 79 | btcli wallet new_coldkey --wallet.name miner --no_password --no_prompt 80 | btcli wallet new_hotkey --wallet.name miner --wallet.hotkey default --no_prompt 81 | btcli wallet new_coldkey --wallet.name validator --no_password --no_prompt 82 | btcli wallet new_hotkey --wallet.name validator --wallet.hotkey default --no_prompt 83 | touch .wallets_setup 84 | fi 85 | 86 | } 87 | 88 | # Call setup_environment every time 89 | setup_environment 90 | 91 | ## Setup localnet 92 | # assumes we are in the bittensor-subnet-template/ directory 93 | # Initialize your local subtensor chain in development mode. This command will set up and run a local subtensor network. 
94 | cd ../subtensor 95 | 96 | # Start a new tmux session and create a new pane, but do not switch to it 97 | echo "FEATURES='pow-faucet runtime-benchmarks' BT_DEFAULT_TOKEN_WALLET=$(cat ~/.bittensor/wallets/$wallet/coldkeypub.txt | grep -oP '"ss58Address": "\K[^"]+') bash scripts/localnet.sh" >> setup_and_run.sh 98 | chmod +x setup_and_run.sh 99 | tmux new-session -d -s localnet -n 'localnet' 100 | tmux send-keys -t localnet 'bash ../subtensor/setup_and_run.sh' C-m 101 | 102 | # Notify the user 103 | echo ">> localnet.sh is running in a detached tmux session named 'localnet'" 104 | echo ">> You can attach to this session with: tmux attach-session -t localnet" 105 | 106 | # Register a subnet (this needs to be run each time we start a new local chain) 107 | btcli subnet create --wallet.name $wallet --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 108 | 109 | # Transfer tokens to miner and validator coldkeys 110 | export BT_MINER_TOKEN_WALLET=$(cat ~/.bittensor/wallets/miner/coldkeypub.txt | grep -oP '"ss58Address": "\K[^"]+') 111 | export BT_VALIDATOR_TOKEN_WALLET=$(cat ~/.bittensor/wallets/validator/coldkeypub.txt | grep -oP '"ss58Address": "\K[^"]+') 112 | 113 | btcli wallet transfer --subtensor.network ws://127.0.0.1:9946 --wallet.name $wallet --dest $BT_MINER_TOKEN_WALLET --amount 1000 --no_prompt 114 | btcli wallet transfer --subtensor.network ws://127.0.0.1:9946 --wallet.name $wallet --dest $BT_VALIDATOR_TOKEN_WALLET --amount 10000 --no_prompt 115 | 116 | # Register wallet hotkeys to subnet 117 | btcli subnet register --wallet.name miner --netuid 1 --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 118 | btcli subnet register --wallet.name validator --netuid 1 --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 119 | 120 | # Add stake to the validator 121 | btcli stake add --wallet.name validator --wallet.hotkey default --subtensor.chain_endpoint ws://127.0.0.1:9946 --amount 10000 --no_prompt 122 | 123 | # Ensure both the miner and validator keys are successfully registered. 
124 | btcli subnet list --subtensor.chain_endpoint ws://127.0.0.1:9946 125 | btcli wallet overview --wallet.name validator --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 126 | btcli wallet overview --wallet.name miner --subtensor.chain_endpoint ws://127.0.0.1:9946 --no_prompt 127 | 128 | cd ../bittensor-subnet-template 129 | 130 | 131 | # Check if inside a tmux session 132 | if [ -z "$TMUX" ]; then 133 | # Start a new tmux session and run the miner in the first pane 134 | tmux new-session -d -s bittensor -n 'miner' 'python neurons/miner.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name miner --wallet.hotkey default --logging.debug' 135 | 136 | # Split the window and run the validator in the new pane 137 | tmux split-window -h -t bittensor:miner 'python neurons/validator.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name validator --wallet.hotkey default --logging.debug' 138 | 139 | # Attach to the new tmux session 140 | tmux attach-session -t bittensor 141 | else 142 | # If already in a tmux session, create two panes in the current window 143 | tmux split-window -h 'python neurons/miner.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name miner --wallet.hotkey default --logging.debug' 144 | tmux split-window -v -t 0 'python neurons/validator.py --netuid 1 --subtensor.chain_endpoint ws://127.0.0.1:9946 --wallet.name validator --wallet.hotkey default --logging.debug' 145 | fi 146 | -------------------------------------------------------------------------------- /scripts/start_validator.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script runs a validator process and automatically updates it when a new version is released. 3 | Command-line arguments will be forwarded to the validator (`neurons/validator.py`), so you can pass 4 | them like this: 5 | python3 scripts/start_validator.py --wallet.name=my-wallet 6 | Auto-updates are enabled by default and will make sure that the latest version is always running 7 | by pulling the latest version from git and upgrading python packages. This is done periodically. 8 | Local changes may prevent the update, but they will be preserved. 9 | 10 | The script will use the same virtual environment as the one used to run it. If you want to run 11 | the validator within a virtual environment, run this auto-update script from that virtual environment. 12 | 13 | Pm2 is required for this script. This script will start a pm2 process using the name provided by 14 | the --pm2_name argument. 15 | """ 16 | import argparse 17 | import logging 18 | import subprocess 19 | import sys 20 | import time 21 | from datetime import timedelta 22 | from shlex import split 23 | from typing import List 24 | from pathlib import Path 25 | 26 | log = logging.getLogger(__name__) 27 | UPDATES_CHECK_TIME = timedelta(minutes=15) 28 | 29 | ROOT_DIR = Path(__file__).parent.parent 30 | 31 | def get_version() -> str: 32 | """Extract the version as the current git commit hash (shortened to 8 characters)""" 33 | result = subprocess.run( 34 | split("git rev-parse HEAD"), 35 | check=True, 36 | capture_output=True, 37 | cwd=ROOT_DIR, 38 | ) 39 | commit = result.stdout.decode().strip() 40 | assert len(commit) == 40, f"Invalid commit hash: {commit}" 41 | return commit[:8] 42 | 43 | 44 | def start_validator_process(pm2_name: str, args: List[str]) -> subprocess.Popen: 45 | """ 46 | Spawn a new python process running neurons.validator.
47 | `sys.executable` ensures that the same python interpreter is used as the one 48 | used to run this auto-updater. 49 | """ 50 | assert sys.executable, "Failed to get python executable" 51 | 52 | log.info("Starting validator process with pm2, name: %s", pm2_name) 53 | process = subprocess.Popen( 54 | ( 55 | "pm2", 56 | "start", 57 | sys.executable, 58 | "--name", 59 | pm2_name, 60 | "--", 61 | "-m", 62 | "neurons.validator", 63 | *args, 64 | ), 65 | cwd=ROOT_DIR, 66 | ) 67 | process.pm2_name = pm2_name 68 | 69 | return process 70 | 71 | 72 | def stop_validator_process(process: subprocess.Popen) -> None: 73 | """Stop the validator process""" 74 | subprocess.run( 75 | ("pm2", "delete", process.pm2_name), cwd=ROOT_DIR, check=True 76 | ) 77 | 78 | 79 | def pull_latest_version() -> None: 80 | """ 81 | Pull the latest version from git. 82 | This uses `git pull --rebase`, so if any changes were made to the local repository, 83 | this will try to apply them on top of origin's changes. This is intentional, as we 84 | don't want to overwrite any local changes. However, if there are any conflicts, 85 | this will abort the rebase and return to the original state. 86 | Conflicts are expected to happen rarely since the validator is expected 87 | to be used as-is. 88 | """ 89 | try: 90 | subprocess.run( 91 | split("git pull --rebase --autostash"), check=True, cwd=ROOT_DIR 92 | ) 93 | except subprocess.CalledProcessError as exc: 94 | log.error("Failed to pull, reverting: %s", exc) 95 | subprocess.run(split("git rebase --abort"), check=True, cwd=ROOT_DIR) 96 | 97 | 98 | def upgrade_packages() -> None: 99 | """ 100 | Upgrade python packages by running `pip install -e .`, which reinstalls the package with the dependencies pinned in `requirements.txt`. 101 | Notice: this won't work if some package in `requirements.txt` is downgraded. 102 | Ignored as this is unlikely to happen. 103 | """ 104 | 105 | log.info("Upgrading packages") 106 | try: 107 | subprocess.run( 108 | split(f"{sys.executable} -m pip install -e ."), 109 | check=True, 110 | cwd=ROOT_DIR, 111 | ) 112 | 113 | except subprocess.CalledProcessError as exc: 114 | log.error("Failed to upgrade packages, proceeding anyway. %s", exc) 115 | 116 | 117 | def main(pm2_name: str, args: List[str]) -> None: 118 | """ 119 | Run the validator process and automatically update it when a new version is released. 120 | This will check for updates every `UPDATES_CHECK_TIME` and update the validator 121 | if a new version is available. The update is performed as a simple `git pull --rebase`.
122 | """ 123 | 124 | validator = start_validator_process(pm2_name, args) 125 | current_version = latest_version = get_version() 126 | log.info("Current version: %s", current_version) 127 | 128 | try: 129 | while True: 130 | pull_latest_version() 131 | latest_version = get_version() 132 | log.info("Latest version: %s", latest_version) 133 | 134 | if latest_version != current_version: 135 | log.info( 136 | "Upgraded to latest version: %s -> %s", 137 | current_version, 138 | latest_version, 139 | ) 140 | upgrade_packages() 141 | 142 | stop_validator_process(validator) 143 | validator = start_validator_process(pm2_name, args) 144 | current_version = latest_version 145 | 146 | time.sleep(UPDATES_CHECK_TIME.total_seconds()) 147 | 148 | finally: 149 | stop_validator_process(validator) 150 | 151 | 152 | if __name__ == "__main__": 153 | logging.basicConfig( 154 | level=logging.INFO, 155 | format="%(asctime)s %(levelname)s %(message)s", 156 | handlers=[logging.StreamHandler(sys.stdout)], 157 | ) 158 | 159 | parser = argparse.ArgumentParser( 160 | description="Automatically update and restart the validator process when a new version is released.", 161 | epilog="Example usage: python start_validator.py --pm2_name 'net9vali' --wallet_name 'wallet1' --wallet_hotkey 'key123'", 162 | ) 163 | 164 | parser.add_argument( 165 | "--pm2_name", default="net9vali", help="Name of the PM2 process." 166 | ) 167 | 168 | flags, extra_args = parser.parse_known_args() 169 | 170 | main(flags.pm2_name, extra_args) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 
17 | 18 | import re 19 | import os 20 | import codecs 21 | import pathlib 22 | from os import path 23 | from io import open 24 | from setuptools import setup, find_packages 25 | from pkg_resources import parse_requirements 26 | 27 | import subprocess 28 | 29 | 30 | def run_command(command): 31 | try: 32 | subprocess.run(command, check=True, shell=True) 33 | print(f"Successfully executed: {command}") 34 | except subprocess.CalledProcessError as e: 35 | print(f"Error executing command: {command}") 36 | print(f"Error details: {e}") 37 | 38 | 39 | def read_requirements(path): 40 | with open(path, "r") as f: 41 | requirements = f.read().splitlines() 42 | processed_requirements = [] 43 | 44 | for req in requirements: 45 | # For git or other VCS links 46 | if req.startswith("git+") or "@" in req: 47 | pkg_name = re.search(r"(#egg=)([\w\-_]+)", req) 48 | if pkg_name: 49 | processed_requirements.append(pkg_name.group(2)) 50 | else: 51 | # You may decide to raise an exception here, 52 | # if you want to ensure every VCS link has an #egg= at the end 53 | continue 54 | else: 55 | processed_requirements.append(req) 56 | return processed_requirements 57 | 58 | 59 | requirements = read_requirements("requirements.txt") 60 | here = path.abspath(path.dirname(__file__)) 61 | 62 | with open(path.join(here, "README.md"), encoding="utf-8") as f: 63 | long_description = f.read() 64 | 65 | # loading version from setup.py 66 | with codecs.open( 67 | os.path.join(here, "detection/__init__.py"), encoding="utf-8" 68 | ) as init_file: 69 | version_match = re.search( 70 | r"^__version__ = ['\"]([^'\"]*)['\"]", init_file.read(), re.M 71 | ) 72 | version_string = version_match.group(1) 73 | 74 | commands = [ 75 | "cd cc_net && make install", 76 | "cd cc_net && make install", 77 | "cd cc_net && pip uninstall cc_net", 78 | "cd cc_net && pip install -e .", 79 | "cd cc_net && make lang=en dl_lm", 80 | ] 81 | 82 | print('Setting up cc_net') 83 | for cmd in commands: 84 | run_command(cmd) 85 | 86 | setup( 87 | name="detection", 88 | version=version_string, 89 | description="Bittensor LLM Generated Content Detection", 90 | long_description=long_description, 91 | long_description_content_type="text/markdown", 92 | url="https://github.com/It-s-AI/llm-detection", 93 | author="Sergey Volnov & Nikita Dilman", 94 | packages=find_packages(), 95 | include_package_data=True, 96 | author_email="dalmannikita@gmail.com", 97 | license="MIT", 98 | python_requires=">=3.8", 99 | install_requires=requirements, 100 | classifiers=[ 101 | "Development Status :: 3 - Alpha", 102 | "Intended Audience :: Developers", 103 | "Topic :: Software Development :: Build Tools", 104 | # Pick your license as you wish 105 | "License :: OSI Approved :: MIT License", 106 | "Programming Language :: Python :: 3 :: Only", 107 | "Programming Language :: Python :: 3.8", 108 | "Programming Language :: Python :: 3.9", 109 | "Programming Language :: Python :: 3.10", 110 | "Topic :: Scientific/Engineering", 111 | "Topic :: Scientific/Engineering :: Mathematics", 112 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 113 | "Topic :: Software Development", 114 | "Topic :: Software Development :: Libraries", 115 | "Topic :: Software Development :: Libraries :: Python Modules", 116 | ], 117 | ) 118 | 119 | 120 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/It-s-AI/llm-detection/35f973b80e0811d02e18aa4ac5e5f3424a182adb/tests/__init__.py -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2023 Opentensor Foundation 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | from typing import Union 19 | from bittensor import ( 20 | Balance, 21 | NeuronInfo, 22 | AxonInfo, 23 | PrometheusInfo, 24 | __ss58_format__, 25 | ) 26 | from bittensor.mock.wallet_mock import MockWallet as _MockWallet 27 | from bittensor.mock.wallet_mock import get_mock_coldkey as _get_mock_coldkey 28 | from bittensor.mock.wallet_mock import get_mock_hotkey as _get_mock_hotkey 29 | from bittensor.mock.wallet_mock import get_mock_keypair as _get_mock_keypair 30 | from bittensor.mock.wallet_mock import get_mock_wallet as _get_mock_wallet 31 | 32 | from rich.console import Console 33 | from rich.text import Text 34 | 35 | 36 | def __mock_wallet_factory__(*args, **kwargs) -> _MockWallet: 37 | """Returns a mock wallet object.""" 38 | 39 | mock_wallet = _get_mock_wallet() 40 | 41 | return mock_wallet 42 | 43 | 44 | class CLOSE_IN_VALUE: 45 | value: Union[float, int, Balance] 46 | tolerance: Union[float, int, Balance] 47 | 48 | def __init__( 49 | self, 50 | value: Union[float, int, Balance], 51 | tolerance: Union[float, int, Balance] = 0.0, 52 | ) -> None: 53 | self.value = value 54 | self.tolerance = tolerance 55 | 56 | def __eq__(self, __o: Union[float, int, Balance]) -> bool: 57 | # True if __o \in [value - tolerance, value + tolerance] 58 | # or if value \in [__o - tolerance, __o + tolerance] 59 | return ( 60 | (self.value - self.tolerance) <= __o 61 | and __o <= (self.value + self.tolerance) 62 | ) or ( 63 | (__o - self.tolerance) <= self.value 64 | and self.value <= (__o + self.tolerance) 65 | ) 66 | 67 | 68 | def get_mock_neuron(**kwargs) -> NeuronInfo: 69 | """ 70 | Returns a mock neuron with the given kwargs overriding the default values. 
71 | """ 72 | 73 | mock_neuron_d = dict( 74 | { 75 | "netuid": -1, # mock netuid 76 | "axon_info": AxonInfo( 77 | block=0, 78 | version=1, 79 | ip=0, 80 | port=0, 81 | ip_type=0, 82 | protocol=0, 83 | placeholder1=0, 84 | placeholder2=0, 85 | ), 86 | "prometheus_info": PrometheusInfo( 87 | block=0, version=1, ip=0, port=0, ip_type=0 88 | ), 89 | "validator_permit": True, 90 | "uid": 1, 91 | "hotkey": "some_hotkey", 92 | "coldkey": "some_coldkey", 93 | "active": 0, 94 | "last_update": 0, 95 | "stake": {"some_coldkey": 1e12}, 96 | "total_stake": 1e12, 97 | "rank": 0.0, 98 | "trust": 0.0, 99 | "consensus": 0.0, 100 | "validator_trust": 0.0, 101 | "incentive": 0.0, 102 | "dividends": 0.0, 103 | "emission": 0.0, 104 | "bonds": [], 105 | "weights": [], 106 | "stake_dict": {}, 107 | "pruning_score": 0.0, 108 | "is_null": False, 109 | } 110 | ) 111 | 112 | mock_neuron_d.update(kwargs) # update with kwargs 113 | 114 | if kwargs.get("stake") is None and kwargs.get("coldkey") is not None: 115 | mock_neuron_d["stake"] = {kwargs.get("coldkey"): 1e12} 116 | 117 | if kwargs.get("total_stake") is None: 118 | mock_neuron_d["total_stake"] = sum(mock_neuron_d["stake"].values()) 119 | 120 | mock_neuron = NeuronInfo._neuron_dict_to_namespace(mock_neuron_d) 121 | 122 | return mock_neuron 123 | 124 | 125 | def get_mock_neuron_by_uid(uid: int, **kwargs) -> NeuronInfo: 126 | return get_mock_neuron( 127 | uid=uid, 128 | hotkey=_get_mock_hotkey(uid), 129 | coldkey=_get_mock_coldkey(uid), 130 | **kwargs 131 | ) 132 | 133 | 134 | class MockStatus: 135 | def __enter__(self): 136 | return self 137 | 138 | def __exit__(self, exc_type, exc_value, traceback): 139 | pass 140 | 141 | def start(self): 142 | pass 143 | 144 | def stop(self): 145 | pass 146 | 147 | def update(self, *args, **kwargs): 148 | MockConsole().print(*args, **kwargs) 149 | 150 | 151 | class MockConsole: 152 | """ 153 | Mocks the console object for status and print. 154 | Captures the last print output as a string. 155 | """ 156 | 157 | captured_print = None 158 | 159 | def status(self, *args, **kwargs): 160 | return MockStatus() 161 | 162 | def print(self, *args, **kwargs): 163 | console = Console( 164 | width=1000, no_color=True, markup=False 165 | ) # set width to 1000 to avoid truncation 166 | console.begin_capture() 167 | console.print(*args, **kwargs) 168 | self.captured_print = console.end_capture() 169 | 170 | def clear(self, *args, **kwargs): 171 | pass 172 | 173 | @staticmethod 174 | def remove_rich_syntax(text: str) -> str: 175 | """ 176 | Removes rich syntax from the given text. 177 | Removes markup and ansi syntax. 
178 | """ 179 | output_no_syntax = Text.from_ansi(Text.from_markup(text).plain).plain 180 | 181 | return output_no_syntax 182 | -------------------------------------------------------------------------------- /tests/test_template_validator.py: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # Copyright © 2024 It's AI# Copyright © 2023 Opentensor Foundation 3 | 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 5 | # documentation files (the “Software”), to deal in the Software without restriction, including without limitation 6 | # the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, 7 | # and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 8 | 9 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of 10 | # the Software. 11 | 12 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 13 | # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 14 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 16 | # DEALINGS IN THE SOFTWARE. 17 | 18 | import sys 19 | import torch 20 | import unittest 21 | import bittensor as bt 22 | 23 | from neurons.validator import Neuron as Validator 24 | 25 | from detection.protocol import Dummy 26 | from detection.utils.uids import get_random_uids 27 | from detection.validator.reward import get_rewards 28 | from detection.base.validator import BaseValidatorNeuron 29 | 30 | 31 | class TemplateValidatorNeuronTestCase(unittest.TestCase): 32 | """ 33 | This class contains unit tests for the RewardEvent classes. 34 | 35 | The tests cover different scenarios where completions may or may not be successful and the reward events are checked that they don't contain missing values. 36 | The `reward` attribute of all RewardEvents is expected to be a float, and the `is_filter_model` attribute is expected to be a boolean. 37 | """ 38 | 39 | def setUp(self): 40 | sys.argv = sys.argv[0] + ["--config", "tests/configs/validator.json"] 41 | 42 | config = BaseValidatorNeuron.config() 43 | config.wallet._mock = True 44 | config.metagraph._mock = True 45 | config.subtensor._mock = True 46 | self.neuron = Validator(config) 47 | self.miner_uids = get_random_uids(self, k=10) 48 | 49 | def test_run_single_step(self): 50 | # TODO: Test a single step 51 | pass 52 | 53 | def test_sync_error_if_not_registered(self): 54 | # TODO: Test that the validator throws an error if it is not registered on metagraph 55 | pass 56 | 57 | def test_forward(self): 58 | # TODO: Test that the forward function returns the correct value 59 | pass 60 | 61 | def test_dummy_responses(self): 62 | # TODO: Test that the dummy responses are correctly constructed 63 | 64 | responses = self.neuron.dendrite.query( 65 | # Send the query to miners in the network. 66 | axons=[ 67 | self.neuron.metagraph.axons[uid] for uid in self.miner_uids 68 | ], 69 | # Construct a dummy query. 70 | synapse=Dummy(dummy_input=self.neuron.step), 71 | # All responses have the deserialize function called on them before returning. 
72 | deserialize=True, 73 | ) 74 | 75 | for i, response in enumerate(responses): 76 | self.assertEqual(response, self.neuron.step * 2) 77 | 78 | def test_reward(self): 79 | # TODO: Test that the reward function returns the correct value 80 | responses = self.neuron.dendrite.query( 81 | # Send the query to miners in the network. 82 | axons=[self.neuron.metagraph.axons[uid] for uid in self.miner_uids], 83 | # Construct a dummy query. 84 | synapse=Dummy(dummy_input=self.neuron.step), 85 | # All responses have the deserialize function called on them before returning. 86 | deserialize=True, 87 | ) 88 | 89 | rewards = get_rewards(self.neuron, responses) 90 | expected_rewards = torch.FloatTensor([1.0] * len(responses)) 91 | self.assertTrue(torch.equal(rewards, expected_rewards)) 92 | 93 | def test_reward_with_nan(self): 94 | # TODO: Test that NaN rewards are correctly sanitized 95 | # TODO: Test that a bt.logging.warning is thrown when a NaN reward is sanitized 96 | responses = self.neuron.dendrite.query( 97 | # Send the query to miners in the network. 98 | axons=[self.neuron.metagraph.axons[uid] for uid in self.miner_uids], 99 | # Construct a dummy query. 100 | synapse=Dummy(dummy_input=self.neuron.step), 101 | # All responses have the deserialize function called on them before returning. 102 | deserialize=True, 103 | ) 104 | 105 | rewards = get_rewards(self.neuron, responses) 106 | expected_rewards = rewards.clone() 107 | # Add NaN values to rewards 108 | rewards[0] = float("nan") 109 | 110 | with self.assertLogs(bt.logging, level="WARNING") as cm: 111 | self.neuron.update_scores(rewards, self.miner_uids) 112 | --------------------------------------------------------------------------------