├── .coveragerc ├── .gitattributes ├── docs └── logo.jpg ├── lpz_examples ├── trtllm │ ├── prevent_hallucination_logits_processor.py │ ├── last_phrase_logits_processor.py │ ├── max_time_logits_processor.py │ ├── gen_length_logits_processor.py │ ├── cite_prompt_logits_processor.py │ ├── multiple_choice_logits_processor.py │ ├── trigger_phrase_logits_processor.py │ ├── utils.py │ └── README.md ├── vllm │ ├── utils.py │ ├── vllm_serve.ipynb │ ├── multiple_choice_logits_processor.ipynb │ ├── cite_prompt_logits_processor.ipynb │ └── prevent_hallucination_logits_processor.ipynb └── transformers │ ├── utils.py │ ├── multiple_choice_logits_processor.ipynb │ ├── cite_prompt_logits_processor.ipynb │ └── force_last_phrase_logits_processor.ipynb ├── pyproject.toml ├── tests ├── test_utils.py ├── transformers │ ├── test_prevent_hallucination.py │ ├── test_max_time.py │ ├── test_generation_length.py │ ├── test_last_phrase.py │ ├── test_cite_prompt.py │ ├── test_multiple_choice.py │ └── test_trigger_phrase.py └── conftest.py ├── .github └── workflows │ └── python-app.yml ├── logits_processor_zoo ├── trtllm │ ├── __init__.py │ ├── last_phrase.py │ ├── max_time.py │ ├── generation_length.py │ ├── cite_prompt.py │ ├── prevent_hallucination.py │ ├── multiple_choice.py │ └── trigger_phrase.py ├── vllm │ ├── __init__.py │ ├── last_phrase.py │ ├── cite_prompt.py │ ├── generation_length.py │ ├── prevent_hallucination.py │ ├── max_time.py │ ├── multiple_choice.py │ └── trigger_phrase.py ├── transformers │ ├── __init__.py │ ├── base.py │ ├── last_phrase.py │ ├── max_time.py │ ├── cite_prompt.py │ ├── generation_length.py │ ├── prevent_hallucination.py │ ├── multiple_choice.py │ └── trigger_phrase.py └── utils.py ├── .gitignore ├── README.md ├── CONTRIBUTING.md └── LICENSE /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | /tmp/* 4 | tests/* -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py linguist-language=python 2 | *.ipynb linguist-documentation -------------------------------------------------------------------------------- /docs/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/logits-processor-zoo/HEAD/docs/logo.jpg -------------------------------------------------------------------------------- /lpz_examples/trtllm/prevent_hallucination_logits_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from logits_processor_zoo.trtllm import PreventHallucinationLogitsProcessor 3 | from utils import TRTLLMTester, get_parser 4 | 5 | 6 | if __name__ == "__main__": 7 | args = get_parser() 8 | 9 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 10 | llm_tester = TRTLLMTester(args.model_name) 11 | 12 | lp = PreventHallucinationLogitsProcessor(tokenizer, minp=0.25, tolerate=1) 13 | llm_tester.run([args.prompt], logits_processor=lp) 14 | -------------------------------------------------------------------------------- /lpz_examples/trtllm/last_phrase_logits_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from logits_processor_zoo.trtllm import ForceLastPhraseLogitsProcessor 3 | from utils import TRTLLMTester, get_parser 4 | 5 | 6 | if __name__ == 
"__main__": 7 | args = get_parser() 8 | 9 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 10 | llm_tester = TRTLLMTester(args.model_name) 11 | 12 | phrase = "\n\nThanks for trying our application! If you have more questions about" 13 | lp = ForceLastPhraseLogitsProcessor(phrase, tokenizer) 14 | 15 | llm_tester.run([args.prompt], logits_processor=lp) 16 | -------------------------------------------------------------------------------- /lpz_examples/trtllm/max_time_logits_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from logits_processor_zoo.trtllm import MaxTimeLogitsProcessor 3 | from utils import TRTLLMTester, get_parser 4 | 5 | 6 | if __name__ == "__main__": 7 | args = get_parser() 8 | 9 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 10 | llm_tester = TRTLLMTester(args.model_name) 11 | 12 | lp = MaxTimeLogitsProcessor(tokenizer, max_time=100, complete_sentences=True) 13 | llm_tester.run([args.prompt], logits_processor=lp) 14 | 15 | lp = MaxTimeLogitsProcessor(tokenizer, max_time=1.0, complete_sentences=True) 16 | llm_tester.run([args.prompt], logits_processor=lp) 17 | -------------------------------------------------------------------------------- /lpz_examples/trtllm/gen_length_logits_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from logits_processor_zoo.trtllm import GenLengthLogitsProcessor 3 | from utils import TRTLLMTester, get_parser 4 | 5 | 6 | if __name__ == "__main__": 7 | args = get_parser() 8 | 9 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 10 | llm_tester = TRTLLMTester(args.model_name) 11 | 12 | lp = GenLengthLogitsProcessor(tokenizer, boost_factor=1.0, complete_sentences=True) 13 | llm_tester.run([args.prompt], logits_processor=lp) 14 | 15 | lp = GenLengthLogitsProcessor(tokenizer, boost_factor=-1.0, p=0, complete_sentences=True) 16 | llm_tester.run([args.prompt], logits_processor=lp) 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "logits-processor-zoo" 3 | version = "0.2.1" 4 | 5 | description = "A collection of LogitsProcessors to customize and enhance LLM behavior for specific tasks." 
6 | authors = ["Ahmet Erdem", "Ivan Sorokin", "Maximilian Jeblick", "Darragh Hanley", "David Austin"] 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.10" 11 | torch = "*" 12 | transformers = ">=4.41.2" 13 | accelerate = ">=0.26.1" 14 | vllm = { version = ">=0.5.0.post1", optional = true } 15 | 16 | [tool.poetry.extras] 17 | vllm = ["vllm"] 18 | 19 | 20 | [build-system] 21 | requires = ["poetry-core"] 22 | build-backend = "poetry.core.masonry.api" 23 | 24 | [tool.flake8] 25 | max-line-length = 120 26 | -------------------------------------------------------------------------------- /lpz_examples/trtllm/cite_prompt_logits_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from logits_processor_zoo.trtllm import CiteFromPromptLogitsProcessor 3 | from utils import TRTLLMTester, get_parser 4 | 5 | 6 | if __name__ == "__main__": 7 | args = get_parser() 8 | 9 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 10 | llm_tester = TRTLLMTester(args.model_name) 11 | 12 | lp = CiteFromPromptLogitsProcessor(tokenizer, boost_factor=1.0, boost_eos=False, conditional_boost_factor=3.0) 13 | llm_tester.run([args.prompt], logits_processor=lp) 14 | 15 | lp = CiteFromPromptLogitsProcessor(tokenizer, boost_factor=-1.0, boost_eos=False, conditional_boost_factor=-1.0) 16 | llm_tester.run([args.prompt], logits_processor=lp) 17 | -------------------------------------------------------------------------------- /lpz_examples/trtllm/multiple_choice_logits_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from logits_processor_zoo.trtllm import MultipleChoiceLogitsProcessor 3 | from utils import TRTLLMTester, get_parser 4 | 5 | 6 | if __name__ == "__main__": 7 | args = get_parser() 8 | 9 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 10 | llm_tester = TRTLLMTester(args.model_name) 11 | 12 | lp = MultipleChoiceLogitsProcessor(tokenizer, choices=["0", "1", "2", "3"]) 13 | llm_tester.run([args.prompt], logits_processor=lp, max_tokens=1) 14 | 15 | lp = MultipleChoiceLogitsProcessor(tokenizer, choices=["0", "1", "2", "3"], delimiter=".", boost_first_words=2.0) 16 | llm_tester.run([args.prompt], logits_processor=lp, max_tokens=1) 17 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from logits_processor_zoo.utils import text_to_token, get_new_line_tokens, enforce_tokens 2 | import torch 3 | 4 | 5 | def test_text_to_token(llm_runner): 6 | assert text_to_token(llm_runner.tokenizer, ",", last=False) == 1919 7 | assert text_to_token(llm_runner.tokenizer, "apple, orange,", last=True) == 29892 8 | assert text_to_token(llm_runner.tokenizer, "apple, orange\n", last=True) == 13 9 | 10 | try: 11 | token = text_to_token(llm_runner.tokenizer, "apple, orange,", last=False) 12 | except Exception: 13 | token = -1 14 | 15 | assert token == -1 16 | 17 | 18 | def test_get_new_line_tokens(llm_runner): 19 | assert get_new_line_tokens(llm_runner.tokenizer) == {13} 20 | 21 | 22 | def test_enforce_tokens(): 23 | scores = torch.FloatTensor([0.1, -0.4, -0.2, -0.6, 1.1]) 24 | tokens = [1, 2] 25 | 26 | scores = enforce_tokens(scores, tokens) 27 | _, top2_tokens = torch.topk(scores, k=2) 28 | assert torch.equal(top2_tokens, torch.tensor([2, 1])) 29 | 
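A quick aside on the `enforce_tokens` helper exercised above (its definition appears in `/logits_processor_zoo/utils.py` further down): it shifts the allowed tokens up by the gap between the global maximum score and the smallest allowed score, then floors everything else, so the allowed tokens always win greedy decoding while keeping their relative order. A minimal standalone sketch of that arithmetic, assuming only that the package is installed:

```python
import torch

from logits_processor_zoo.utils import enforce_tokens

scores = torch.FloatTensor([0.1, -0.4, -0.2, -0.6, 1.1])
out = enforce_tokens(scores.clone(), [1, 2])  # clone: enforce_tokens mutates in place

# gap = max(scores) - min(scores[tokens]) = 1.1 - (-0.4) = 1.5 is added to the
# allowed tokens; every other position is floored at min(scores) = -0.6.
assert torch.allclose(out, torch.tensor([-0.6, 1.1, 1.3, -0.6, -0.6]))
assert out.argmax().item() == 2  # greedy decoding can now only pick token 1 or 2
```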
-------------------------------------------------------------------------------- /lpz_examples/trtllm/trigger_phrase_logits_processor.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer 2 | from logits_processor_zoo.trtllm import TriggerPhraseLogitsProcessor 3 | from utils import TRTLLMTester, get_parser 4 | 5 | 6 | if __name__ == "__main__": 7 | args = get_parser() 8 | 9 | tokenizer = AutoTokenizer.from_pretrained(args.model_name) 10 | llm_tester = TRTLLMTester(args.model_name) 11 | 12 | lp = TriggerPhraseLogitsProcessor( 13 | tokenizer, "...Wait, let me think more.", " function", trigger_count=2, trigger_after=False 14 | ) 15 | llm_tester.run([args.prompt], logits_processor=lp) 16 | 17 | lp = TriggerPhraseLogitsProcessor(tokenizer, "\n```python", " function", trigger_count=1, trigger_after=True) 18 | llm_tester.run([args.prompt], logits_processor=lp) 19 | 20 | lp = TriggerPhraseLogitsProcessor( 21 | tokenizer, " only a few seconds left...", trigger_time=2, trigger_count=1, trigger_after=True 22 | ) 23 | llm_tester.run([args.prompt], logits_processor=lp) 24 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install poetry flake8 Flake8-pyproject pytest pytest-cov 30 | pip install -e . 31 | - name: Lint with flake8 32 | run: | 33 | poetry run flake8 34 | - name: Test with pytest 35 | run: | 36 | python -m pytest tests/ --cov --cov-config=.coveragerc -------------------------------------------------------------------------------- /tests/transformers/test_prevent_hallucination.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | from logits_processor_zoo.transformers import PreventHallucinationLogitsProcessor 19 | 20 | 21 | def test_prevent_hallucination_logits_processor(llm_runner): 22 | example_prompts = ["Please describe what macaques are.", "Tell me a story about a kid lost in forest."] 23 | 24 | logits_processors = [PreventHallucinationLogitsProcessor(llm_runner.tokenizer, batch_size=2, minp=0.99, tolerate=2)] 25 | processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors) 26 | 27 | assert all(["I don't know" in out for out in processed_gen_output]) 28 | -------------------------------------------------------------------------------- /logits_processor_zoo/trtllm/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from .generation_length import GenLengthLogitsProcessor 19 | from .last_phrase import ForceLastPhraseLogitsProcessor 20 | from .cite_prompt import CiteFromPromptLogitsProcessor 21 | from .multiple_choice import MultipleChoiceLogitsProcessor 22 | from .prevent_hallucination import PreventHallucinationLogitsProcessor 23 | from .trigger_phrase import TriggerPhraseLogitsProcessor 24 | from .max_time import MaxTimeLogitsProcessor 25 | 26 | __all__ = ['GenLengthLogitsProcessor', 'ForceLastPhraseLogitsProcessor', 'CiteFromPromptLogitsProcessor', 27 | 'MultipleChoiceLogitsProcessor', 'PreventHallucinationLogitsProcessor', 'TriggerPhraseLogitsProcessor', 28 | 'MaxTimeLogitsProcessor'] 29 | -------------------------------------------------------------------------------- /logits_processor_zoo/vllm/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | 18 | from .generation_length import GenLengthLogitsProcessor 19 | from .cite_prompt import CiteFromPromptLogitsProcessor 20 | from .last_phrase import ForceLastPhraseLogitsProcessor 21 | from .multiple_choice import MultipleChoiceLogitsProcessor 22 | from .trigger_phrase import TriggerPhraseLogitsProcessor 23 | from .prevent_hallucination import PreventHallucinationLogitsProcessor 24 | from .max_time import MaxTimeLogitsProcessor 25 | 26 | __all__ = ['GenLengthLogitsProcessor', 'CiteFromPromptLogitsProcessor', 'ForceLastPhraseLogitsProcessor', 27 | 'MultipleChoiceLogitsProcessor', 'TriggerPhraseLogitsProcessor', 'PreventHallucinationLogitsProcessor', 28 | 'MaxTimeLogitsProcessor'] 29 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from .generation_length import GenLengthLogitsProcessor 19 | from .cite_prompt import CiteFromPromptLogitsProcessor 20 | from .last_phrase import ForceLastPhraseLogitsProcessor 21 | from .multiple_choice import MultipleChoiceLogitsProcessor 22 | from .trigger_phrase import TriggerPhraseLogitsProcessor 23 | from .prevent_hallucination import PreventHallucinationLogitsProcessor 24 | from .max_time import MaxTimeLogitsProcessor 25 | 26 | __all__ = ['GenLengthLogitsProcessor', 'CiteFromPromptLogitsProcessor', 'ForceLastPhraseLogitsProcessor', 27 | 'MultipleChoiceLogitsProcessor', 'TriggerPhraseLogitsProcessor', 'PreventHallucinationLogitsProcessor', 28 | 'MaxTimeLogitsProcessor'] 29 | -------------------------------------------------------------------------------- /tests/transformers/test_max_time.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | import time 19 | from logits_processor_zoo.transformers import MaxTimeLogitsProcessor 20 | 21 | 22 | def test_max_time_logits_processor(llm_runner): 23 |     """Test that generation is cut off once the specified wall-clock time budget is exceeded.""" 24 |     example_prompts = [ 25 |         "Hello, how are you?", 26 |         "What is the capital of France?", 27 |         "What is the capital of Germany?", 28 |     ] 29 | 30 |     max_time = 2 31 |     tolerance = 1 32 |     start_time = time.time() 33 | 34 |     logits_processors = [MaxTimeLogitsProcessor(llm_runner.tokenizer, max_time=max_time, complete_sentences=False)] 35 |     outs = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=1000) 36 |     end_time = time.time() 37 |     elapsed_time = end_time - start_time 38 |     print(outs) 39 |     assert elapsed_time <= max_time + tolerance 40 | -------------------------------------------------------------------------------- /tests/transformers/test_generation_length.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from logits_processor_zoo.transformers import GenLengthLogitsProcessor 19 | 20 | 21 | def test_gen_length_logits_processor(llm_runner): 22 |     example_prompts = [ 23 |         "Please describe what macaques are.", 24 |         "Tell me a story about a kid lost in forest." 25 |     ] 26 | 27 |     default_gen_output = llm_runner.generate_response(example_prompts) 28 | 29 |     logits_processors = [GenLengthLogitsProcessor(llm_runner.tokenizer, boost_factor=1.0)] 30 |     processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors) 31 | 32 |     assert all(len(p1) > len(p2) for p1, p2 in zip(default_gen_output, processed_gen_output)) 33 | 34 |     processed_gen_output_repeat = llm_runner.generate_response(example_prompts, logits_processors) 35 |     assert all(p1 == p2 for p1, p2 in zip(processed_gen_output, processed_gen_output_repeat)) 36 | -------------------------------------------------------------------------------- /tests/transformers/test_last_phrase.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from logits_processor_zoo.transformers import ForceLastPhraseLogitsProcessor, GenLengthLogitsProcessor 19 | 20 | 21 | def test_force_last_phrase_logits_processor(llm_runner): 22 |     example_prompts = [ 23 |         "Please describe what macaques are.", 24 |         "Tell me a story about a kid lost in forest." 25 |     ] 26 | 27 |     phrase = "This is a test phrase." 28 | 29 |     logits_processors = [GenLengthLogitsProcessor(llm_runner.tokenizer, boost_factor=1.0), 30 |                          ForceLastPhraseLogitsProcessor(phrase, llm_runner.tokenizer, batch_size=len(example_prompts))] 31 |     processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=100) 32 | 33 |     assert all((phrase in out) for out in processed_gen_output) 34 | 35 |     processed_gen_output_repeat = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=100) 36 |     assert all(p1 == p2 for p1, p2 in zip(processed_gen_output, processed_gen_output_repeat)) 37 | -------------------------------------------------------------------------------- /lpz_examples/vllm/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import vllm 3 | 4 | # vLLM V1 does not currently accept logits processor so we need to disable it 5 | # https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#deprecated-features 6 | os.environ["VLLM_USE_V1"] = "0" 7 | 8 | 9 | class vLLMRunner: 10 |     def __init__(self, model_name="Qwen/Qwen2.5-1.5B-Instruct"): 11 |         self.model = vllm.LLM( 12 |             model_name, 13 |             trust_remote_code=True, 14 |             dtype="half", 15 |             enforce_eager=True 16 |         ) 17 |         self.tokenizer = self.model.get_tokenizer() 18 | 19 |     def generate_response(self, prompts, logits_processor_list=None, max_tokens=1000): 20 |         if logits_processor_list is None: 21 |             logits_processor_list = [] 22 | 23 |         prompts_with_template = [] 24 |         for prompt in prompts: 25 |             messages = [ 26 |                 { 27 |                     "role": "user", 28 |                     "content": prompt 29 |                 } 30 |             ] 31 |             text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 32 |             prompts_with_template.append(text) 33 | 34 |         gen_output = self.model.generate( 35 |             prompts_with_template, 36 |             vllm.SamplingParams( 37 |                 n=1, 38 |                 temperature=0, 39 |                 seed=0, 40 |                 skip_special_tokens=True, 41 |                 max_tokens=max_tokens, 42 |                 logits_processors=logits_processor_list 43 |             ), 44 |             use_tqdm=False 45 |         ) 46 | 47 |         for prompt, out in zip(prompts, gen_output): 48 |             out = out.outputs[0].text 49 |             print(f"Prompt: {prompt}") 50 |             print(out) 51 |             print("-----END-----") 52 |             print() 53 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import torch 19 | 20 | 21 | class BaseLogitsProcessor: 22 | def __init__(self): 23 | self.prompt_token_ids = None 24 | self.prev_token_ids = None 25 | 26 | def _reset(self): 27 | pass 28 | 29 | def _check_new_generation(self, input_ids: torch.LongTensor): 30 | first_time = self.prompt_token_ids is None 31 | if first_time: 32 | self._reset() 33 | self.prompt_token_ids = input_ids 34 | else: 35 | same_gen = False 36 | if input_ids.shape[1] > 1: 37 | same_gen = torch.equal(input_ids[:, :-1], self.prev_token_ids) 38 | 39 | if not same_gen: 40 | self._reset() 41 | self.prompt_token_ids = input_ids 42 | 43 | self.prev_token_ids = input_ids 44 | 45 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 46 | return scores 47 | 48 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 49 | self._check_new_generation(input_ids) 50 | scores = self._process(input_ids, scores) 51 | return scores 52 | -------------------------------------------------------------------------------- /lpz_examples/transformers/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessorList 3 | 4 | 5 | class LLMRunner: 6 | def __init__(self, model_name="Qwen/Qwen2.5-1.5B-Instruct"): 7 | self.tokenizer = AutoTokenizer.from_pretrained(model_name) 8 | self.tokenizer.padding_side = "left" 9 | 10 | self.model = AutoModelForCausalLM.from_pretrained( 11 | model_name, 12 | torch_dtype=torch.float16, 13 | device_map="auto", 14 | ) 15 | 16 | def generate_response(self, prompts, logits_processor_list=None, max_tokens=1000): 17 | if logits_processor_list is None: 18 | logits_processor_list = [] 19 | 20 | prompts_with_template = [] 21 | for prompt in prompts: 22 | messages = [ 23 | { 24 | "role": "user", 25 | "content": prompt 26 | } 27 | ] 28 | text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 29 | prompts_with_template.append(text) 30 | 31 | input_ids = self.tokenizer(prompts_with_template, return_tensors='pt', padding=True)["input_ids"] 32 | out_ids = self.model.generate(input_ids.cuda(), max_new_tokens=max_tokens, min_new_tokens=1, do_sample=False, 33 | logits_processor=LogitsProcessorList(logits_processor_list), 34 | temperature=None, top_p=None, top_k=None) 35 | gen_output = self.tokenizer.batch_decode(out_ids[:, input_ids.shape[1]:], skip_special_tokens=True, 36 | clean_up_tokenization_spaces=False) 37 | for prompt, out in zip(prompts, gen_output): 38 | print(f"Prompt: {prompt}") 39 | print() 40 | print(f"LLM response:\n{out.strip()}") 41 | print("-----END-----") 42 | print() 43 | -------------------------------------------------------------------------------- /tests/transformers/test_cite_prompt.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from logits_processor_zoo.transformers import CiteFromPromptLogitsProcessor 19 | 20 | 21 | def test_cite_from_prompt_logits_processor(llm_runner): 22 | example_prompts = [ 23 | "Please describe what macaques are.", 24 | "Tell me a story about a kid lost in forest." 25 | ] 26 | 27 | default_gen_output = llm_runner.generate_response(example_prompts, max_new_tokens=10) 28 | 29 | logits_processors = [CiteFromPromptLogitsProcessor(llm_runner.tokenizer, boost_factor=50.0, 30 | conditional_boost_factor=50.0)] 31 | processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=10) 32 | 33 | for prompt, default_out, processed_out in zip(example_prompts, default_gen_output, processed_gen_output): 34 | prompt_tokens = set(prompt.split()) 35 | default_out_tokens = set(default_out.split()) 36 | processed_out_tokens = set(processed_out.split()) 37 | 38 | default_shared_tokens = prompt_tokens.intersection(default_out_tokens) 39 | processed_shared_tokens = prompt_tokens.intersection(processed_out_tokens) 40 | 41 | assert len(processed_shared_tokens) > len(default_shared_tokens) 42 | -------------------------------------------------------------------------------- /lpz_examples/trtllm/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | from tensorrt_llm.sampling_params import SamplingParams, LogitsProcessor 4 | 5 | 6 | class TRTLLMTester: 7 | def __init__(self, model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"): 8 | # Temporarily attempt to import the torch backend until it becomes default 9 | try: 10 | from tensorrt_llm._torch import LLM 11 | except ImportError: 12 | from tensorrt_llm import LLM 13 | 14 | self.llm = LLM(model=model_name) 15 | 16 | def run(self, prompts: List[str], max_tokens: int = 256, logits_processor: LogitsProcessor = None): 17 | sparams = {"top_k": 1, "max_tokens": max_tokens, "temperature": 0.001} 18 | if logits_processor: 19 | sparams["logits_processor"] = logits_processor 20 | 21 | prompts_with_template = [] 22 | for prompt in prompts: 23 | messages = [ 24 | { 25 | "role": "user", 26 | "content": prompt 27 | } 28 | ] 29 | text = self.llm.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 30 | prompts_with_template.append(text) 31 | 32 | gens = self.llm.generate(prompts_with_template, SamplingParams(**sparams)) 33 | for prompt, gen in zip(prompts, gens): 34 | print(prompt) 35 | print(gen.outputs[0].text) 36 | 37 | 38 | def get_parser(): 39 | parser = argparse.ArgumentParser(description="Logits Processor Example") 40 | parser.add_argument("--model_name", 41 | "-m", 42 | type=str, 43 | default="Qwen/Qwen2.5-1.5B-Instruct", 44 | help="Directory or HF link containing model") 45 | parser.add_argument("--prompt", 46 | "-p", 47 | type=str, 48 | default="Please give me information about macaques:", 49 | help="Prompt to test") 50 | 51 | return parser.parse_args() 52 | -------------------------------------------------------------------------------- 
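Before moving on to the tests: for readers adapting `TRTLLMTester` to a custom processor, a minimal sketch of the TensorRT-LLM callback interface may help. It mirrors the `__call__` signature and stream handling used by the shipped processors in `/logits_processor_zoo/trtllm/` (see `last_phrase.py` below); the class name and temperature value here are illustrative only, not part of the library:

```python
from typing import List, Optional

import torch
from tensorrt_llm.sampling_params import LogitsProcessor


class ScaleLogitsProcessor(LogitsProcessor):
    """Hypothetical example: sharpen the distribution by dividing logits by a temperature."""

    def __init__(self, temperature: float = 0.7):
        self.temperature = temperature

    def __call__(self, req_id: int, logits: torch.Tensor,
                 token_ids: List[List[int]], stream_ptr: Optional[int],
                 client_id: Optional[int]) -> None:
        # logits arrives with shape (1, beam_width, vocab_size) and must be
        # modified in-place, on the generation stream when one is provided.
        stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
        with torch.cuda.stream(stream):
            logits /= self.temperature
```

It would plug in the same way as the shipped processors, e.g. `TRTLLMTester(args.model_name).run([args.prompt], logits_processor=ScaleLogitsProcessor())`.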
/tests/transformers/test_multiple_choice.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from logits_processor_zoo.transformers import MultipleChoiceLogitsProcessor 19 | 20 | 21 | def test_multiple_choice_logits_processor(llm_runner): 22 |     example_prompts = [ 23 |         """ 24 | I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone? 25 | a) Camera 26 | b) Screen resolution 27 | c) Operating System 28 | d) Battery 29 | 30 | Answer: 31 | """, 32 | 33 |         """ 34 | Which user review doesn't belong to a summer dress? 35 | a) Looks good 36 | b) Keeps warm 37 | c) Too long 38 | d) Liked the color 39 | 40 | Answer: 41 | """ 42 |     ] 43 | 44 |     choices = ["a", "b", "c", "d"] 45 |     logits_processors = [MultipleChoiceLogitsProcessor(llm_runner.tokenizer, choices=choices, delimiter=")")] 46 |     processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=1) 47 | 48 |     assert all((out in choices) for out in processed_gen_output) 49 | 50 |     example_prompts = [prompt.replace("a)", "1.").replace("b)", "2.").replace("c)", "3.").replace("d)", "4.") 51 |                        for prompt in example_prompts] 52 | 53 |     choices = ["1", "2", "3", "4"] 54 |     logits_processors = [MultipleChoiceLogitsProcessor(llm_runner.tokenizer, choices=choices, delimiter=".", 55 |                                                        boost_first_words=1.0)] 56 |     processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=1) 57 | 58 |     assert all((out in choices) for out in processed_gen_output) 59 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | 18 | import pytest 19 | import torch 20 | from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessorList 21 | 22 | 23 | class LLMRunner: 24 |     def __init__(self, model_name='google/gemma-1.1-2b-it'): 25 |         self.tokenizer = AutoTokenizer.from_pretrained(model_name) 26 |         self.tokenizer.pad_token = self.tokenizer.eos_token 27 |         self.tokenizer.padding_side = "left" 28 | 29 |         self.model = AutoModelForCausalLM.from_pretrained( 30 |             model_name, 31 |             torch_dtype=torch.float16, 32 |             device_map="auto", 33 |             trust_remote_code=True 34 |         ) 35 | 36 |     def generate_response(self, prompts, logits_processor_list=None, max_new_tokens=1000): 37 |         if logits_processor_list is None: 38 |             logits_processor_list = [] 39 | 40 |         input_ids = self.tokenizer(prompts, return_tensors='pt', padding=True)["input_ids"] 41 | 42 |         out_ids = self.model.generate(input_ids.to(self.model.device), 43 |                                       max_new_tokens=max_new_tokens, min_new_tokens=1, 44 |                                       logits_processor=LogitsProcessorList(logits_processor_list) 45 |                                       ) 46 | 47 |         gen_output = self.tokenizer.batch_decode(out_ids, skip_special_tokens=True, 48 |                                                  clean_up_tokenization_spaces=False) 49 | 50 |         return [out[len(prompt):].strip() for prompt, out in zip(prompts, gen_output)] 51 | 52 | 53 | @pytest.fixture(scope='session') 54 | def llm_runner(): 55 |     return LLMRunner(model_name="MaxJeblick/llama2-0b-unit-test") 56 | -------------------------------------------------------------------------------- /tests/transformers/test_trigger_phrase.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from logits_processor_zoo.transformers import TriggerPhraseLogitsProcessor 19 | 20 | 21 | def test_trigger_phrase_token_based_triggering(llm_runner): 22 |     """Test that the phrase is triggered when the specified token is generated.""" 23 |     example_prompts = ["Query: "] 24 | 25 |     trigger_token = "fig" 26 |     phrase = "This is a triggered phrase." 27 | 28 |     logits_processors = [ 29 |         TriggerPhraseLogitsProcessor( 30 |             llm_runner.tokenizer, 31 |             batch_size=len(example_prompts), 32 |             phrase=phrase, 33 |             trigger_token_phrase=trigger_token, 34 |             trigger_after=True, 35 |         ) 36 |     ] 37 | 38 |     processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=1000) 39 |     assert phrase in processed_gen_output[0] 40 | 41 | 42 | def test_trigger_phrase_time_based_triggering(llm_runner): 43 |     """Test that the phrase is triggered after the specified generation time has elapsed.""" 44 |     example_prompts = [ 45 |         "Generate a python function to calculate fibonacci numbers.", 46 |         "Simple python function to calculate fibonacci numbers.", 47 |     ] 48 | 49 |     trigger_time = 2 50 |     phrase = "This is a triggered phrase."
51 | 52 | logits_processors = [ 53 | TriggerPhraseLogitsProcessor( 54 | llm_runner.tokenizer, 55 | batch_size=len(example_prompts), 56 | phrase=phrase, 57 | trigger_time=trigger_time, 58 | trigger_after=True, 59 | ) 60 | ] 61 | 62 | processed_gen_output = llm_runner.generate_response(example_prompts, logits_processors, max_new_tokens=1000) 63 | assert phrase in processed_gen_output[0] 64 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/last_phrase.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from transformers import PreTrainedTokenizer 19 | import torch 20 | from logits_processor_zoo.transformers.base import BaseLogitsProcessor 21 | from logits_processor_zoo.utils import enforce_tokens 22 | 23 | 24 | class ForceLastPhraseLogitsProcessor(BaseLogitsProcessor): 25 | """ 26 | A logits processor which forces LLMs to use the given phrase before they finalize their answers. 27 | Most common use cases can be providing references, thanking user with context etc. 28 | WARNING: Create a new object before every model.generate call to reset iterators. 29 | 30 | Parameters 31 | ---------- 32 | phrase (str): The phrase to be generated by LLM before the end of its speech. 33 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 34 | batch_size (int): Number of prompts in the batch. 35 | """ 36 | def __init__(self, phrase: str, tokenizer: PreTrainedTokenizer, batch_size: int): 37 | super().__init__() 38 | self.eos_token_id = tokenizer.eos_token_id 39 | self.phrase_tokens = tokenizer.encode(phrase, add_special_tokens=False) 40 | self.batch_size = batch_size 41 | 42 | def _reset(self): 43 | self.iterators = torch.zeros(self.batch_size, dtype=torch.int32) 44 | 45 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 46 | for i in range(scores.shape[0]): 47 | it = self.iterators[i].item() 48 | if scores[i, :].argmax() == self.eos_token_id and it == 0: 49 | scores[i] = enforce_tokens(scores[i], [self.phrase_tokens[it]]) 50 | self.iterators[i] += 1 51 | elif len(self.phrase_tokens) > it > 0: 52 | scores[i] = enforce_tokens(scores[i], [self.phrase_tokens[it]]) 53 | self.iterators[i] += 1 54 | 55 | return scores 56 | -------------------------------------------------------------------------------- /logits_processor_zoo/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from transformers import PreTrainedTokenizer 19 | from typing import List, Union 20 | import torch 21 | 22 | 23 | def text_to_token(tokenizer: PreTrainedTokenizer, text: str, last: bool): 24 | tokens = tokenizer.encode(text, add_special_tokens=False) 25 | 26 | # We allow 2 tokens to account for the BOS or prefix token 27 | max_token_count = 1 28 | bos_token_added = getattr(tokenizer, 'bos_token', None) and getattr(tokenizer, 'bos_token_id', None) in tokens 29 | prefix_token_added = getattr(tokenizer, 'add_prefix_space', None) is not False 30 | if bos_token_added or prefix_token_added: 31 | max_token_count = 2 32 | 33 | if not last and len(tokens) > max_token_count: 34 | raise Exception(f"Can't convert {text} to token. It has {len(tokens)} tokens.") 35 | 36 | return tokens[-1] 37 | 38 | 39 | def get_new_line_tokens(tokenizer: PreTrainedTokenizer): 40 | new_line_tokens = [token for token in tokenizer.get_vocab().values() 41 | if tokenizer.decode(token).endswith("\n")] 42 | 43 | return set(new_line_tokens) 44 | 45 | 46 | def enforce_tokens(scores: torch.Tensor, tokens: List[int]): 47 | choice_scores = scores[tokens].clone() 48 | gap = scores.max() - choice_scores.min() 49 | choice_scores += gap 50 | scores.fill_(scores.min()) 51 | scores[tokens] = choice_scores 52 | return scores 53 | 54 | 55 | class SentenceChecker: 56 | def __init__(self, tokenizer: PreTrainedTokenizer): 57 | self.full_stop_token = text_to_token(tokenizer, "It is a sentence.", last=True) 58 | self.new_line_token = text_to_token(tokenizer, "It is a new line\n", last=True) 59 | 60 | def _check_sentence_end(self, input_ids: Union[List[int], torch.Tensor]): 61 | if isinstance(input_ids, list) or isinstance(input_ids, tuple): # vllm input 62 | return (input_ids[-1] == self.full_stop_token) | (input_ids[-1] == self.new_line_token) 63 | else: 64 | return (input_ids[:, -1] == self.full_stop_token) | (input_ids[:, -1] == self.new_line_token) 65 | -------------------------------------------------------------------------------- /logits_processor_zoo/trtllm/last_phrase.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | from typing import List, Optional 19 | from transformers import PreTrainedTokenizer 20 | import torch 21 | from logits_processor_zoo.utils import enforce_tokens 22 | from tensorrt_llm.sampling_params import LogitsProcessor 23 | 24 | 25 | class ForceLastPhraseLogitsProcessor(LogitsProcessor): 26 | """ 27 | A logits processor which forces LLMs to use the given phrase before they finalize their answers. 28 | Most common use cases can be providing references, thanking user with context etc. 29 | 30 | Parameters 31 | ---------- 32 | phrase (str): The phrase to be generated by LLM before the end of its speech. 33 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 34 | """ 35 | def __init__(self, phrase: str, tokenizer: PreTrainedTokenizer): 36 | self.eos_token_id = tokenizer.eos_token_id 37 | self.phrase_tokens = tokenizer.encode(phrase, add_special_tokens=False) 38 | self.iterators = None 39 | 40 | def _init_before_gen(self, beam_width): 41 | self.iterators = torch.zeros(beam_width, dtype=torch.int32) 42 | 43 | def __call__(self, req_id: int, logits: torch.Tensor, 44 | token_ids: List[List[int]], stream_ptr: Optional[int], 45 | client_id: Optional[int]) -> None: 46 | beam_width = len(token_ids) 47 | if self.iterators is None: 48 | self._init_before_gen(beam_width) 49 | 50 | stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr) 51 | 52 | with torch.cuda.stream(stream): 53 | for i in range(beam_width): # iterate over beams 54 | current_index = self.iterators[i].item() 55 | if logits[0, i].argmax() == self.eos_token_id and current_index == 0: 56 | enforce_tokens(logits[0, i], [self.phrase_tokens[current_index]]) 57 | self.iterators[i] += 1 58 | elif len(self.phrase_tokens) > current_index > 0: 59 | enforce_tokens(logits[0, i], [self.phrase_tokens[current_index]]) 60 | self.iterators[i] += 1 61 | -------------------------------------------------------------------------------- /logits_processor_zoo/vllm/last_phrase.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from transformers import PreTrainedTokenizer, AutoTokenizer 19 | from typing import List, Union 20 | import torch 21 | from logits_processor_zoo.utils import enforce_tokens 22 | 23 | 24 | class ForceLastPhraseLogitsProcessor: 25 | """ 26 | A logits processor which forces LLMs to use the given phrase before they finalize their answers. 27 | Most common use cases can be providing references, thanking user with context etc. 28 | 29 | Parameters 30 | ---------- 31 | phrase (str): The phrase to be generated by LLM before the end of its speech. 32 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 
33 | """ 34 | def __init__(self, phrase: str, tokenizer: Union[PreTrainedTokenizer, str]): 35 | self.tokenizer = tokenizer 36 | if isinstance(self.tokenizer, str): 37 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer) 38 | 39 | self.eos_token_id = self.tokenizer.eos_token_id 40 | self.phrase_tokens = self.tokenizer.encode(phrase, add_special_tokens=False) 41 | self._reset() 42 | self.phrase = phrase 43 | 44 | # Mutable logits processor gets cloned for each prompt in a batch in order to prevent updating the same object 45 | # https://github.com/vllm-project/vllm/blob/19dcc02a72e3ed52e3bf95aae44ea1f40ce42ea0/vllm/sampling_params.py#L537-L550 46 | def clone(self): 47 | return ForceLastPhraseLogitsProcessor(self.phrase, self.tokenizer) 48 | 49 | def _reset(self): 50 | self.index = 0 51 | 52 | def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int], scores: torch.Tensor) -> torch.Tensor: 53 | if not past_token_ids: # new generation 54 | self._reset() 55 | 56 | if scores.argmax() == self.eos_token_id and self.index == 0: 57 | scores = enforce_tokens(scores, [self.phrase_tokens[self.index]]) 58 | self.index += 1 59 | elif len(self.phrase_tokens) > self.index > 0: 60 | scores = enforce_tokens(scores, [self.phrase_tokens[self.index]]) 61 | self.index += 1 62 | 63 | return scores 64 | -------------------------------------------------------------------------------- /logits_processor_zoo/vllm/cite_prompt.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import List, Union 19 | import torch 20 | from transformers import PreTrainedTokenizer, AutoTokenizer 21 | 22 | 23 | class CiteFromPromptLogitsProcessor: 24 | """ 25 | A logits processor which boosts or diminishes the likelihood of tokens present in the prompt (and optionally 26 | EOS token) to encourage the model to generate tokens similar to those seen in the prompt or vice versa. 27 | 28 | Parameters 29 | ---------- 30 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 31 | boost_factor (float): A factor to boost the likelihood of the tokens from the prompt. 32 | Negative values are used for the opposite effect. 33 | boost_eos (bool, optional): If True, boosts EOS token too. 34 | conditional_boost_factor (float, optional): A factor to boost the likelihood of the tokens based on previous token. 
35 | """ 36 | def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], boost_factor: float = 1.0, boost_eos: bool = True, 37 | conditional_boost_factor: float = 0.0): 38 | self.tokenizer = tokenizer 39 | if isinstance(self.tokenizer, str): 40 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer) 41 | 42 | self.boost_factor = boost_factor 43 | self.eos_token_id = self.tokenizer.eos_token_id 44 | self.boost_eos = boost_eos 45 | self.conditional_boost_factor = conditional_boost_factor 46 | 47 | def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int], scores: torch.Tensor) -> torch.Tensor: 48 | tokens = set(prompt_tokens_ids) 49 | if self.boost_eos: 50 | tokens.add(self.eos_token_id) 51 | 52 | tokens = [t for t in tokens if t < scores.shape[0]] 53 | scores[tokens] += self.boost_factor 54 | 55 | if (self.conditional_boost_factor != 0) and (len(past_token_ids) > 0): 56 | tokens = set() 57 | last_token = past_token_ids[-1] 58 | for i in range(len(prompt_tokens_ids) - 1): 59 | if (prompt_tokens_ids[i] == last_token) and (prompt_tokens_ids[i + 1] < scores.shape[0]): 60 | tokens.add(prompt_tokens_ids[i + 1]) 61 | scores[list(tokens)] += self.conditional_boost_factor 62 | 63 | return scores 64 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/max_time.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import time 19 | import torch 20 | from transformers import PreTrainedTokenizer 21 | from logits_processor_zoo.transformers.base import BaseLogitsProcessor 22 | from logits_processor_zoo.utils import text_to_token, enforce_tokens, SentenceChecker 23 | 24 | 25 | class MaxTimeLogitsProcessor(BaseLogitsProcessor, SentenceChecker): 26 | """ 27 | A logits processor that enforces the end-of-sentence (EOS) token after a specified maximum time passes. 28 | Useful for controlling generation time and ensuring responses complete within time constraints. 29 | 30 | Parameters 31 | ---------- 32 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 33 | max_time (float): Maximum time (wall-clock time) in seconds after which the EOS token must be enforced. 34 | complete_sentences (bool, optional): If True, enforces EOS token only when the last token is a full stop 35 | or a new line. Default is False. 36 | boost_token_str (str, optional): A string to be tokenized and used instead of EOS. 
37 | 38 | """ 39 | 40 | def __init__( 41 | self, 42 | tokenizer: PreTrainedTokenizer, 43 | max_time: float, 44 | complete_sentences: bool = False, 45 | boost_token_str: str = None, 46 | ): 47 | BaseLogitsProcessor.__init__(self) 48 | SentenceChecker.__init__(self, tokenizer) 49 | self.boost_token = tokenizer.eos_token_id 50 | if boost_token_str is not None: 51 | self.boost_token = text_to_token(tokenizer, boost_token_str, last=False) 52 | self.max_time = max_time 53 | self.complete_sentences = complete_sentences 54 | 55 | def _reset(self): 56 | self.start_time = time.time() 57 | 58 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 59 | elapsed_time = time.time() - self.start_time 60 | token_count = input_ids.shape[1] - self.prompt_token_ids.shape[1] 61 | 62 | enabled = (input_ids[:, -token_count:] == self.boost_token).sum(dim=1) == 0 63 | if self.complete_sentences: 64 | enabled = enabled & self._check_sentence_end(input_ids) 65 | 66 | if elapsed_time > self.max_time: 67 | for i in range(scores.shape[0]): 68 | if enabled[i]: 69 | scores[i] = enforce_tokens(scores[i], [self.boost_token]) 70 | return scores 71 | 72 | return scores 73 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/cite_prompt.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import torch 19 | from transformers import PreTrainedTokenizer 20 | from logits_processor_zoo.transformers.base import BaseLogitsProcessor 21 | 22 | 23 | class CiteFromPromptLogitsProcessor(BaseLogitsProcessor): 24 | """ 25 | A logits processor which boosts or diminishes the likelihood of tokens present in the prompt (and optionally 26 | EOS token) to encourage the model to generate tokens similar to those seen in the prompt or vice versa. 27 | WARNING: Create a new object before every model.generate call since every batch has different prompts. 28 | 29 | Parameters 30 | ---------- 31 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 32 | boost_factor (float): A factor to boost the likelihood of the tokens from the prompt. 33 | Negative values are used for the opposite effect. 34 | boost_eos (bool, optional): If True, boosts EOS token too. 35 | conditional_boost_factor (float, optional): A factor to boost the likelihood of the tokens based on previous token. 
36 | """ 37 | def __init__(self, tokenizer: PreTrainedTokenizer, boost_factor: float = 1.0, boost_eos: bool = True, 38 | conditional_boost_factor: float = 0.0): 39 | super().__init__() 40 | self.boost_factor = boost_factor 41 | self.eos_token_id = tokenizer.eos_token_id 42 | self.boost_eos = boost_eos 43 | self.conditional_boost_factor = conditional_boost_factor 44 | 45 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 46 | voc_size = scores.shape[1] 47 | for i in range(scores.shape[0]): 48 | tokens = set(self.prompt_token_ids[i]) 49 | if self.boost_eos: 50 | tokens.add(self.eos_token_id) 51 | 52 | tokens = [t for t in tokens if t < voc_size] 53 | scores[i, tokens] += self.boost_factor 54 | 55 | if (self.conditional_boost_factor != 0) and (input_ids.shape[1] > self.prompt_token_ids.shape[1]): 56 | tokens = set() 57 | last_token = input_ids[i][-1] 58 | for j in range(len(self.prompt_token_ids[i]) - 1): 59 | if (self.prompt_token_ids[i, j] == last_token) and (self.prompt_token_ids[i, j + 1] < voc_size): 60 | tokens.add(self.prompt_token_ids[i, j + 1]) 61 | scores[i, list(tokens)] += self.conditional_boost_factor 62 | 63 | return scores 64 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/generation_length.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import torch 19 | from transformers import PreTrainedTokenizer 20 | from logits_processor_zoo.utils import text_to_token, SentenceChecker 21 | from logits_processor_zoo.transformers.base import BaseLogitsProcessor 22 | 23 | 24 | class GenLengthLogitsProcessor(BaseLogitsProcessor, SentenceChecker): 25 | """ 26 | A logits processor that adjusts the likelihood of the end-of-sequence (EOS) token 27 | based on the length of the generated sequence, encouraging or discouraging shorter answers. 28 | WARNING: Create a new object before every model.generate call since token_count is accumulated. 29 | 30 | Parameters 31 | ---------- 32 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 33 | boost_factor (float): A factor to boost the likelihood of the EOS token as the sequence length increases. 34 | Suggested value range is [-1.0, 1.0]. Negative values are used for the opposite effect. 35 | p (int, optional): The power to which the token count is raised when computing the boost value. Default is 2. 36 | complete_sentences (bool, optional): If True, boosts EOS token likelihood only when the last token is a full stop 37 | or a new line. Default is False. 38 | boost_token_str (str, optional): A string to be tokenized and used instead of EOS. Especially useful for . 
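Examples
--------
The boost applied to the EOS logit grows as boost_factor * (token_count / 10) ** p.
A minimal sketch (the model name and prompt are illustrative assumptions); per the
warning above, create a new processor for every model.generate call:

>>> from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> model = AutoModelForCausalLM.from_pretrained("gpt2")
>>> # boost_factor > 0 nudges generation toward stopping earlier
>>> lp = GenLengthLogitsProcessor(tokenizer, boost_factor=0.5, p=2, complete_sentences=True)
>>> inputs = tokenizer("Tell me about the sea.", return_tensors="pt")
>>> out = model.generate(**inputs, max_new_tokens=128,
...                      logits_processor=LogitsProcessorList([lp]))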
39 | """ 40 | def __init__(self, tokenizer: PreTrainedTokenizer, boost_factor: float, 41 | p: int = 2, complete_sentences: bool = False, boost_token_str: str = None): 42 | BaseLogitsProcessor.__init__(self) 43 | SentenceChecker.__init__(self, tokenizer) 44 | self.boost_token = tokenizer.eos_token_id 45 | if boost_token_str is not None: 46 | self.boost_token = text_to_token(tokenizer, boost_token_str, last=False) 47 | self.boost_factor = boost_factor 48 | self.p = p 49 | self.complete_sentences = complete_sentences 50 | 51 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 52 | token_count = input_ids.shape[1] - self.prompt_token_ids.shape[1] 53 | 54 | boost_val = self.boost_factor * (token_count ** self.p) / (10 ** self.p) 55 | 56 | enabled = (input_ids[:, -token_count:] == self.boost_token).sum(dim=1) == 0 57 | if self.complete_sentences: 58 | enabled = enabled & self._check_sentence_end(input_ids) 59 | 60 | scores[:, self.boost_token] += enabled * boost_val 61 | 62 | return scores 63 | -------------------------------------------------------------------------------- /lpz_examples/trtllm/README.md: -------------------------------------------------------------------------------- 1 | # Test TensorRT-LLM logits processors 2 | 3 | ## Quick Start 4 | 5 | It's recommended to use [TensorRT-LLM release containers](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) (>= 0.20.0) that has TensorRT-LLM pre-installed. 6 | Alternatively, please follow [this documentation](https://nvidia.github.io/TensorRT-LLM/installation/linux.html) to install it in [NGC PyTorch containers](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch/tags) (>=25.04). 7 | 8 | ## lpz_examples 9 | 10 | ### GenLengthLogitsProcessor 11 | A logits processor that adjusts the likelihood of the end-of-sequence (EOS) token based on the length of the generated sequence, encouraging or discouraging shorter answers. 12 | ``` 13 | python lpz_examples/trtllm/gen_length_logits_processor.py 14 | ``` 15 | 16 | ### CiteFromPromptLogitsProcessor 17 | A logits processor which boosts or diminishes the likelihood of tokens present in the prompt (and optionally EOS token) to encourage the model to generate tokens similar to those seen in the prompt or vice versa. 18 | ``` 19 | python lpz_examples/trtllm/cite_prompt_logits_processor.py -p "Retrieved information: 20 | Pokémon is a Japanese media franchise consisting of video games, animated series and films, a trading card game, and other related media. 21 | The franchise takes place in a shared universe in which humans co-exist with creatures known as Pokémon, a large variety of species endowed with special powers. 22 | The franchise's target audience is children aged 5 to 12, but it is known to attract people of all ages. 23 | 24 | Can you shortly describe what Pokémon is?" 25 | ``` 26 | 27 | ### ForceLastPhraseLogitsProcessor 28 | A logits processor which forces LLMs to use the given phrase before they finalize their answers. Most common use cases can be providing references, thanking user with context etc. 29 | ``` 30 | python lpz_examples/trtllm/last_phrase_logits_processor.py 31 | ``` 32 | 33 | ### MultipleChoiceLogitsProcessor 34 | A logits processor to answer multiple choice questions with one of the choices. 35 | ``` 36 | python lpz_examples/trtllm/multiple_choice_logits_processor.py -p "I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone? 37 | 0. 
Camera 38 | 1. Screen resolution 39 | 2. Operating System 40 | 3. Battery" 41 | ``` 42 | 43 | ### TriggerPhraseLogitsProcessor 44 | A logits processor which triggers phrases when it encounters a given token. 45 | ``` 46 | python lpz_examples/trtllm/trigger_phrase_logits_processor.py -p "Generate a python function to calculate nth fibonacci number. Make it recursive. Keep thinking short." 47 | ``` 48 | 49 | ### PreventHallucinationLogitsProcessor 50 | A logits processor that mitigates hallucinated model outputs by enforcing a predefined fallback phrase when token confidence falls below a specified threshold. 51 | ``` 52 | python lpz_examples/trtllm/prevent_hallucination_logits_processor.py -p "What are Nobel Prizes? Name the winners in 1977" 53 | ``` 54 | 55 | ### MaxTimeLogitsProcessor 56 | A logits processor that enforces the end-of-sentence (EOS) token after a specified maximum time passes, optionally waiting for a new line or a full stop. Useful for controlling generation time and ensuring responses complete within time constraints. 57 | ``` 58 | python lpz_examples/trtllm/max_time_logits_processor.py 59 | ``` 60 | -------------------------------------------------------------------------------- /logits_processor_zoo/vllm/generation_length.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import List, Union 19 | import torch 20 | from transformers import PreTrainedTokenizer, AutoTokenizer 21 | from logits_processor_zoo.utils import text_to_token, SentenceChecker 22 | 23 | 24 | class GenLengthLogitsProcessor(SentenceChecker): 25 | """ 26 | A logits processor that adjusts the likelihood of the end-of-sequence (EOS) token 27 | based on the length of the generated sequence, encouraging or discouraging shorter answers. 28 | 29 | Parameters 30 | ---------- 31 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 32 | boost_factor (float): A factor to boost the likelihood of the EOS token as the sequence length increases. 33 | Suggested value range is [-1.0, 1.0]. Negative values are used for the opposite effect. 34 | p (int, optional): The power to which the token count is raised when computing the boost value. Default is 2. 35 | complete_sentences (bool, optional): If True, boosts EOS token likelihood only when the last token is a full stop 36 | or a new line. Default is False. 37 | boost_token_str (str, optional): A string to be tokenized and used instead of EOS. Especially useful for . 
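Examples
--------
A minimal vLLM sketch, mirroring the README usage (the model name and prompt are
illustrative assumptions):

>>> import vllm
>>> llm = vllm.LLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> # boost_factor < 0 discourages early stopping, encouraging longer answers
>>> lp = GenLengthLogitsProcessor(llm.get_tokenizer(), boost_factor=-0.2, p=1)
>>> params = vllm.SamplingParams(temperature=0, max_tokens=128, logits_processors=[lp])
>>> outputs = llm.generate(["Tell me about the sea."], params)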
38 | """ 39 | def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], boost_factor: float, 40 | p: int = 2, complete_sentences: bool = False, boost_token_str: str = None): 41 | self.tokenizer = tokenizer 42 | if isinstance(self.tokenizer, str): 43 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer) 44 | SentenceChecker.__init__(self, self.tokenizer) 45 | 46 | self.boost_token = self.tokenizer.eos_token_id 47 | self.boost_token_str = boost_token_str 48 | if boost_token_str is not None: 49 | self.boost_token = text_to_token(self.tokenizer, boost_token_str, last=False) 50 | self.boost_factor = boost_factor 51 | self.p = p 52 | self.complete_sentences = complete_sentences 53 | 54 | def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int], scores: torch.Tensor) -> torch.Tensor: 55 | if self.boost_token in past_token_ids: # do not boost repeatedly 56 | return scores 57 | 58 | gen_length = len(past_token_ids) 59 | 60 | boost_val = 0 61 | if not (self.boost_token in past_token_ids): 62 | boost_val = self.boost_factor * (gen_length ** self.p) / (10 ** self.p) 63 | 64 | if self.complete_sentences and gen_length > 0: 65 | enabled = self._check_sentence_end(past_token_ids) 66 | scores[self.boost_token] += enabled * boost_val 67 | else: 68 | scores[self.boost_token] += boost_val 69 | 70 | return scores 71 | -------------------------------------------------------------------------------- /logits_processor_zoo/trtllm/max_time.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import List, Optional 19 | import time 20 | from transformers import PreTrainedTokenizer 21 | import torch 22 | from tensorrt_llm.sampling_params import LogitsProcessor 23 | from logits_processor_zoo.utils import text_to_token, enforce_tokens, SentenceChecker 24 | 25 | 26 | class MaxTimeLogitsProcessor(LogitsProcessor, SentenceChecker): 27 | """ 28 | A logits processor that enforces the end-of-sentence (EOS) token after a specified maximum time passes. 29 | Useful for controlling generation time and ensuring responses complete within time constraints. 30 | 31 | Parameters 32 | ---------- 33 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 34 | max_time (float): Maximum time (wall-clock time) in seconds after which the EOS token must be enforced. 35 | complete_sentences (bool, optional): If True, enforces EOS token only when the last token is a full stop 36 | or a new line. Default is False. 37 | boost_token_str (str, optional): A string to be tokenized and used instead of EOS. 
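Examples
--------
A minimal TensorRT-LLM sketch (the model name, prompt, and SamplingParams wiring are
illustrative assumptions; see lpz_examples/trtllm/max_time_logits_processor.py for the
runnable example):

>>> from transformers import AutoTokenizer
>>> from tensorrt_llm import LLM, SamplingParams
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
>>> llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
>>> lp = MaxTimeLogitsProcessor(tokenizer, max_time=5.0, complete_sentences=True)
>>> outputs = llm.generate(["Explain transformers."],
...                        SamplingParams(max_tokens=256, logits_processor=lp))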
38 | """ 39 | 40 | def __init__( 41 | self, 42 | tokenizer: PreTrainedTokenizer, 43 | max_time: float, 44 | complete_sentences: bool = False, 45 | boost_token_str: str = None, 46 | ): 47 | SentenceChecker.__init__(self, tokenizer) 48 | self.tokenizer = tokenizer 49 | self.boost_token = self.tokenizer.eos_token_id 50 | self.boost_token_str = boost_token_str 51 | if boost_token_str is not None: 52 | self.boost_token = text_to_token(self.tokenizer, boost_token_str, last=False) 53 | self.complete_sentences = complete_sentences 54 | self.token_count = 0 55 | self.max_time = max_time 56 | self.start_time = time.time() 57 | 58 | def __call__( 59 | self, 60 | req_id: int, 61 | logits: torch.Tensor, 62 | token_ids: List[List[int]], 63 | stream_ptr: Optional[int], 64 | client_id: Optional[int], 65 | ) -> None: 66 | 67 | elapsed_time = time.time() - self.start_time 68 | time_exceeded = elapsed_time > self.max_time 69 | 70 | stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr) 71 | 72 | with torch.cuda.stream(stream): 73 | ids = torch.LongTensor(token_ids).to(logits.device, non_blocking=True) 74 | 75 | enabled = True 76 | if self.complete_sentences: 77 | enabled = self._check_sentence_end(ids) 78 | 79 | if time_exceeded and enabled: 80 | # enforce the EOS token 81 | for i in range(logits.shape[1]): 82 | enforce_tokens(logits[0, i], [self.boost_token]) 83 | 84 | self.token_count += 1 85 | -------------------------------------------------------------------------------- /logits_processor_zoo/trtllm/generation_length.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import List, Optional 19 | from transformers import PreTrainedTokenizer 20 | import torch 21 | from tensorrt_llm.sampling_params import LogitsProcessor 22 | from logits_processor_zoo.utils import text_to_token, SentenceChecker 23 | 24 | 25 | class GenLengthLogitsProcessor(LogitsProcessor, SentenceChecker): 26 | """ 27 | A logits processor that adjusts the likelihood of the end-of-sequence (EOS) token 28 | based on the length of the generated sequence, encouraging or discouraging shorter answers. 29 | 30 | Parameters 31 | ---------- 32 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 33 | boost_factor (float): A factor to boost the likelihood of the EOS token as the sequence length increases. 34 | Suggested value range is [-1.0, 1.0]. Negative values are used for the opposite effect. 35 | p (int, optional): The power to which the token count is raised when computing the boost value. Default is 2. 36 | complete_sentences (bool, optional): If True, boosts EOS token likelihood only when the last token is a full stop 37 | or a new line. Default is False. 
38 | boost_token_str (str, optional): A string to be tokenized and used instead of EOS. Especially useful for . 39 | """ 40 | def __init__(self, tokenizer: PreTrainedTokenizer, boost_factor: float, p: int = 2, 41 | complete_sentences: bool = False, boost_token_str: str = None): 42 | SentenceChecker.__init__(self, tokenizer) 43 | self.tokenizer = tokenizer 44 | self.boost_token = self.tokenizer.eos_token_id 45 | self.boost_token_str = boost_token_str 46 | if boost_token_str is not None: 47 | self.boost_token = text_to_token(self.tokenizer, boost_token_str, last=False) 48 | self.boost_factor = boost_factor 49 | self.p = p 50 | self.complete_sentences = complete_sentences 51 | self.token_count = 0 52 | 53 | def __call__(self, req_id: int, logits: torch.Tensor, 54 | token_ids: List[List[int]], stream_ptr: Optional[int], 55 | client_id: Optional[int]) -> None: 56 | 57 | boost_val = self.boost_factor * (self.token_count ** self.p) / (10 ** self.p) 58 | 59 | stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr) 60 | 61 | with torch.cuda.stream(stream): 62 | ids = torch.LongTensor(token_ids).to(logits.device, non_blocking=True) 63 | 64 | if self.complete_sentences: 65 | enabled = self._check_sentence_end(ids) 66 | logits[:, :, self.boost_token] += enabled * boost_val 67 | else: 68 | logits[:, :, self.boost_token] += boost_val 69 | 70 | self.token_count += 1 71 | -------------------------------------------------------------------------------- /logits_processor_zoo/vllm/prevent_hallucination.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import List 19 | import torch 20 | from transformers import PreTrainedTokenizer 21 | from logits_processor_zoo.utils import enforce_tokens 22 | 23 | 24 | class PreventHallucinationLogitsProcessor: 25 | """ 26 | A logits processor that mitigates hallucinated model outputs by enforcing a predefined fallback phrase 27 | when token confidence falls below a specified threshold. 28 | 29 | This processor monitors token probabilities during generation. If the model produces a number of 30 | low-confidence tokens (below `minp`) exceeding `tolerate`, it begins injecting a fallback phrase 31 | token-by-token to gracefully indicate uncertainty. 32 | 33 | Parameters 34 | ---------- 35 | tokenizer : PreTrainedTokenizer 36 | The tokenizer used by the language model. It is used to tokenize the fallback phrase. 37 | minp : float, optional (default=0.4) 38 | The minimum probability threshold. Tokens with max probability below this are considered low-confidence. 39 | tolerate : int, optional (default=1) 40 | The number of consecutive low-confidence tokens tolerated before triggering the fallback phrase. 
41 | phrase : str, optional (default="...I don't know actually.\\n") 42 | The phrase that will be inserted when hallucination is detected. It will be tokenized and injected 43 | sequentially into the generation. 44 | """ 45 | def __init__(self, tokenizer: PreTrainedTokenizer, minp: float = 0.4, tolerate: int = 1, 46 | phrase: str = "...I don't know actually.\n"): 47 | self.phrase = phrase 48 | self.eos_token_id = tokenizer.eos_token_id 49 | self.phrase_tokens = tokenizer.encode(self.phrase, add_special_tokens=False) 50 | self._reset() 51 | self.tokenizer = tokenizer 52 | self.minp = minp 53 | self.tolerate = tolerate 54 | 55 | def clone(self): 56 | return PreventHallucinationLogitsProcessor(self.tokenizer, self.minp, self.tolerate, self.phrase) 57 | 58 | def _reset(self): 59 | self.index = 0 60 | self.minp_count = 0 61 | 62 | def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int], scores: torch.Tensor) -> torch.Tensor: 63 | if not past_token_ids: # new generation 64 | self._reset() 65 | 66 | if scores.softmax(dim=-1).amax() < self.minp: 67 | self.minp_count += 1 68 | 69 | if self.minp_count > self.tolerate and self.index == 0: 70 | scores = enforce_tokens(scores, [self.phrase_tokens[self.index]]) 71 | self.index += 1 72 | elif len(self.phrase_tokens) > self.index > 0: 73 | scores = enforce_tokens(scores, [self.phrase_tokens[self.index]]) 74 | self.index += 1 75 | elif self.index == len(self.phrase_tokens): 76 | self._reset() 77 | 78 | return scores 79 | -------------------------------------------------------------------------------- /logits_processor_zoo/trtllm/cite_prompt.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from typing import List, Optional 19 | import torch 20 | from transformers import PreTrainedTokenizer 21 | from tensorrt_llm.sampling_params import LogitsProcessor 22 | 23 | 24 | class CiteFromPromptLogitsProcessor(LogitsProcessor): 25 | """ 26 | A logits processor which boosts or diminishes the likelihood of tokens present in the prompt (and optionally 27 | EOS token) to encourage the model to generate tokens similar to those seen in the prompt or vice versa. 28 | 29 | Parameters 30 | ---------- 31 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 32 | boost_factor (float): A factor to boost the likelihood of the tokens from the prompt. 33 | Negative values are used for the opposite effect. 34 | boost_eos (bool, optional): If True, boosts EOS token too. 35 | conditional_boost_factor (float, optional): A factor to boost the likelihood of the tokens based on previous token. 
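Examples
--------
A minimal TensorRT-LLM sketch (the model name, prompt, and SamplingParams wiring are
illustrative assumptions; see lpz_examples/trtllm/cite_prompt_logits_processor.py for
the runnable example):

>>> from transformers import AutoTokenizer
>>> from tensorrt_llm import LLM, SamplingParams
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
>>> llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
>>> lp = CiteFromPromptLogitsProcessor(tokenizer, boost_factor=2.0)
>>> outputs = llm.generate(["Retrieved information: ... Can you describe it shortly?"],
...                        SamplingParams(max_tokens=64, logits_processor=lp))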
36 | """ 37 | def __init__(self, tokenizer: PreTrainedTokenizer, boost_factor: float = 1.0, boost_eos: bool = True, 38 | conditional_boost_factor: float = 0.0): 39 | self.tokenizer = tokenizer 40 | self.boost_factor = boost_factor 41 | self.eos_token_id = self.tokenizer.eos_token_id 42 | self.boost_eos = boost_eos 43 | self.conditional_boost_factor = conditional_boost_factor 44 | self.prompt_token_ids = None 45 | 46 | def _init_before_gen(self, token_ids): 47 | self.prompt_token_ids = list(token_ids[0]) # take first beam since all beams have the same prompt 48 | 49 | def __call__(self, req_id: int, logits: torch.Tensor, 50 | token_ids: List[List[int]], stream_ptr: Optional[int], 51 | client_id: Optional[int]) -> None: 52 | if self.prompt_token_ids is None: 53 | self._init_before_gen(token_ids) 54 | 55 | tokens = set(self.prompt_token_ids) 56 | if self.boost_eos: 57 | tokens.add(self.eos_token_id) 58 | 59 | tokens = [t for t in tokens if t < logits.shape[-1]] 60 | 61 | stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr) 62 | 63 | with torch.cuda.stream(stream): 64 | logits[:, :, tokens] += self.boost_factor 65 | 66 | if self.conditional_boost_factor != 0: 67 | 68 | for i in range(len(token_ids)): # iterate over beams 69 | tokens = set() 70 | for prompt_token_idx in range(len(self.prompt_token_ids) - 1): 71 | in_vocab = self.prompt_token_ids[prompt_token_idx + 1] < logits.shape[-1] 72 | last_token = self.prompt_token_ids[prompt_token_idx] == token_ids[i][-1] 73 | if last_token and in_vocab: 74 | tokens.add(self.prompt_token_ids[prompt_token_idx + 1]) 75 | logits[:, i, list(tokens)] += self.conditional_boost_factor 76 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/prevent_hallucination.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import torch 19 | from transformers import PreTrainedTokenizer 20 | from logits_processor_zoo.transformers.base import BaseLogitsProcessor 21 | from logits_processor_zoo.utils import enforce_tokens 22 | 23 | 24 | class PreventHallucinationLogitsProcessor(BaseLogitsProcessor): 25 | """ 26 | A logits processor that mitigates hallucinated model outputs by enforcing a predefined fallback phrase 27 | when token confidence falls below a specified threshold. 28 | 29 | This processor monitors token probabilities during generation. If the model produces a number of 30 | low-confidence tokens (below `minp`) exceeding `tolerate`, it begins injecting a fallback phrase 31 | token-by-token to gracefully indicate uncertainty. 32 | 33 | Parameters 34 | ---------- 35 | tokenizer : PreTrainedTokenizer 36 | The tokenizer used by the language model. It is used to tokenize the fallback phrase. 
37 | batch_size (int): 38 | Number of prompts in the batch. 39 | minp : float, optional (default=0.4) 40 | The minimum probability threshold. Tokens with max probability below this are considered low-confidence. 41 | tolerate : int, optional (default=1) 42 | The number of consecutive low-confidence tokens tolerated before triggering the fallback phrase. 43 | phrase : str, optional (default="...I don't know actually.\\n") 44 | The phrase that will be inserted when hallucination is detected. It will be tokenized and injected 45 | sequentially into the generation. 46 | """ 47 | def __init__(self, tokenizer: PreTrainedTokenizer, batch_size: int, minp: float = 0.4, tolerate: int = 1, 48 | phrase: str = "...I don't know actually.\n"): 49 | super().__init__() 50 | self.phrase = phrase 51 | self.eos_token_id = tokenizer.eos_token_id 52 | self.phrase_tokens = tokenizer.encode(self.phrase, add_special_tokens=False) 53 | self.tokenizer = tokenizer 54 | self.minp = minp 55 | self.tolerate = tolerate 56 | self.batch_size = batch_size 57 | 58 | def _reset(self): 59 | self.iterators = torch.zeros(self.batch_size, dtype=torch.int32) 60 | self.minp_count = torch.zeros(self.batch_size, dtype=torch.int32) 61 | 62 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 63 | for i in range(scores.shape[0]): 64 | it = self.iterators[i].item() 65 | if scores[i].softmax(dim=-1).amax() < self.minp: 66 | self.minp_count[i] += 1 67 | 68 | if self.minp_count[i] > self.tolerate and it == 0: 69 | scores[i] = enforce_tokens(scores[i], [self.phrase_tokens[it]]) 70 | self.iterators[i] += 1 71 | elif len(self.phrase_tokens) > it > 0: 72 | scores[i] = enforce_tokens(scores[i], [self.phrase_tokens[it]]) 73 | self.iterators[i] += 1 74 | elif it == len(self.phrase_tokens): 75 | self.iterators[i] = 0 76 | self.minp_count[i] = 0 77 | 78 | return scores 79 | -------------------------------------------------------------------------------- /logits_processor_zoo/vllm/max_time.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import time 19 | from typing import List 20 | import torch 21 | from transformers import PreTrainedTokenizer, AutoTokenizer 22 | from logits_processor_zoo.utils import text_to_token, enforce_tokens, SentenceChecker 23 | 24 | 25 | class MaxTimeLogitsProcessor(SentenceChecker): 26 | """ 27 | A logits processor that enforces the end-of-sentence (EOS) token after a specified maximum time passes. 28 | Useful for controlling generation time and ensuring responses complete within time constraints. 29 | 30 | Parameters 31 | ---------- 32 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 
33 | max_time (float): Maximum time (wall-clock time) in seconds after which the EOS token must be enforced. 34 | complete_sentences (bool, optional): If True, enforces EOS token only when the last token is a full stop 35 | or a new line. Default is False. 36 | boost_token_str (str, optional): A string to be tokenized and used instead of EOS. 37 | 38 | """ 39 | 40 | def __init__( 41 | self, 42 | tokenizer: PreTrainedTokenizer, 43 | max_time: float, 44 | complete_sentences: bool = False, 45 | boost_token_str: str = None, 46 | ): 47 | self.tokenizer = tokenizer 48 | if isinstance(self.tokenizer, str): 49 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer) 50 | SentenceChecker.__init__(self, self.tokenizer) 51 | 52 | self.boost_token = self.tokenizer.eos_token_id 53 | self.boost_token_str = boost_token_str 54 | if boost_token_str is not None: 55 | self.boost_token = text_to_token(self.tokenizer, boost_token_str, last=False) 56 | self.complete_sentences = complete_sentences 57 | self.max_time = max_time 58 | self._reset() 59 | 60 | # Mutable logits processor gets cloned for each prompt in a batch in order to prevent updating the same object 61 | # https://github.com/vllm-project/vllm/blob/19dcc02a72e3ed52e3bf95aae44ea1f40ce42ea0/vllm/sampling_params.py#L537-L550 62 | def clone(self): 63 | return MaxTimeLogitsProcessor( 64 | self.tokenizer, 65 | self.max_time, 66 | self.complete_sentences, 67 | self.boost_token_str, 68 | ) 69 | 70 | def _reset(self): 71 | self.start_time = time.time() 72 | 73 | def __call__( 74 | self, 75 | prompt_tokens_ids: List[int], 76 | past_token_ids: List[int], 77 | scores: torch.Tensor, 78 | ) -> torch.Tensor: 79 | if self.boost_token in past_token_ids: # do not force repeatedly 80 | return scores 81 | 82 | elapsed_time = time.time() - self.start_time 83 | time_exceeded = elapsed_time > self.max_time 84 | gen_length = len(past_token_ids) 85 | 86 | enabled = True 87 | if self.complete_sentences and gen_length > 0: 88 | enabled = self._check_sentence_end(past_token_ids) 89 | 90 | if time_exceeded and enabled: 91 | scores = enforce_tokens(scores, [self.boost_token]) 92 | 93 | return scores 94 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/#use-with-ide 112 | .pdm.toml 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | .idea/ -------------------------------------------------------------------------------- /logits_processor_zoo/vllm/multiple_choice.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from transformers import PreTrainedTokenizer, AutoTokenizer 19 | from typing import List, Union 20 | import torch 21 | from logits_processor_zoo.utils import text_to_token, get_new_line_tokens, enforce_tokens 22 | 23 | 24 | class MultipleChoiceLogitsProcessor: 25 | """ 26 | A logits processor to answer multiple choice questions with one of the choices. 27 | A multiple choice question is like: 28 | I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone? 29 | 0. Camera 30 | 1. Screen resolution 31 | 2. Operating System 32 | 3. Battery 33 | The goal is to make LLM generate "3" as an answer. 34 | 35 | 36 | Parameters 37 | ---------- 38 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 39 | choices (List[str]): List of one character answers like A, B, C, D. 40 | delimiter (str): One character delimiter that comes after the choices like 1. or 2-. 41 | boost_first_words (float): Nonzero values add choices' first tokens' logits to boost performance. 42 | Especially useful for the models which have difficulty associating the choice with its text. 
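Examples
--------
A minimal vLLM sketch (the model name and question are illustrative assumptions):

>>> import vllm
>>> llm = vllm.LLM("Qwen/Qwen2.5-0.5B-Instruct")
>>> mclp = MultipleChoiceLogitsProcessor(llm.get_tokenizer(), choices=["1", "2"], delimiter=".")
>>> prompt = "Which one is a fruit?\n1. Apple\n2. Chair\nAnswer:"
>>> params = vllm.SamplingParams(temperature=0, max_tokens=1, logits_processors=[mclp])
>>> outputs = llm.generate([prompt], params)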
43 | """ 44 | def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], choices: List[str] = None, 45 | delimiter: str = ".", boost_first_words: float = 0.0): 46 | self.tokenizer = tokenizer 47 | if isinstance(self.tokenizer, str): 48 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer) 49 | 50 | self.choices = choices 51 | self.delimiter = delimiter 52 | if choices is None: 53 | choices = ["1", "2", "3", "4"] 54 | 55 | self.new_line_token = get_new_line_tokens(self.tokenizer) 56 | self.delimiter_token = text_to_token(self.tokenizer, delimiter, last=False) 57 | self.choice_tokens = [text_to_token(self.tokenizer, choice, last=False) for choice in choices] 58 | self.boost_first_words = boost_first_words 59 | 60 | def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int], scores: torch.Tensor) -> torch.Tensor: 61 | 62 | if self.boost_first_words: 63 | choice = 0 64 | 65 | first_tokens = [] 66 | for i in range(len(prompt_tokens_ids) - 3): 67 | # A choice is like "\nA) hair dryer", where first token is "hair" 68 | choice_starts = ( 69 | (prompt_tokens_ids[i] in self.new_line_token) and 70 | (prompt_tokens_ids[i + 1] == self.choice_tokens[choice]) and 71 | (prompt_tokens_ids[i + 2] == self.delimiter_token) 72 | ) 73 | 74 | if choice_starts: 75 | first_tokens.append(prompt_tokens_ids[i + 3]) 76 | choice += 1 77 | 78 | if choice >= len(self.choice_tokens): 79 | break 80 | 81 | scores[self.choice_tokens[:len(first_tokens)]] += self.boost_first_words * scores[first_tokens] 82 | 83 | scores = enforce_tokens(scores, self.choice_tokens) 84 | return scores 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/logits-processor-zoo.svg)](https://badge.fury.io/py/logits-processor-zoo) 2 | [![License: MIT](https://img.shields.io/badge/License-Apache2.0-yellow.svg)](https://opensource.org/licenses/Apache2.0) 3 | 4 |

5 | ![logits-processor-zoo logo](docs/logo.jpg)
6 | 

7 | 8 | # logits-processor-zoo 9 | 10 | Struggling to get LLMs to follow your instructions? LogitsProcessorZoo offers a zoo of tools to use LLMs for specific tasks, beyond just grammar enforcement! 11 | 12 | ## Installation 13 | 14 | ```bash 15 | pip install logits-processor-zoo 16 | ``` 17 | 18 | ## Supported Frameworks 19 | * transformers 20 | * vLLM 21 | * TensorRT-LLM (>=0.20.0) 22 | 23 | ## Usage 24 | 25 | ```python 26 | import vllm 27 | from logits_processor_zoo.vllm import GenLengthLogitsProcessor, CiteFromPromptLogitsProcessor, ForceLastPhraseLogitsProcessor 28 | 29 | model = vllm.LLM( 30 | model_name, 31 | trust_remote_code=True, 32 | dtype="half", 33 | enforce_eager=True 34 | ) 35 | tokenizer = model.get_tokenizer() 36 | 37 | logits_processors = [ 38 | CiteFromPromptLogitsProcessor(tokenizer, boost_factor=2.0), 39 | GenLengthLogitsProcessor(tokenizer, boost_factor=-0.2, p=1), 40 | ForceLastPhraseLogitsProcessor("\n\nReferences:\n", tokenizer) 41 | ] 42 | 43 | 44 | gen_output = model.generate( 45 | prompts, 46 | vllm.SamplingParams( 47 | n=1, 48 | temperature=0, 49 | seed=0, 50 | skip_special_tokens=True, 51 | max_tokens=64, 52 | logits_processors=logits_processors 53 | ), 54 | use_tqdm=False 55 | ) 56 | ``` 57 | 58 | 59 | For the detailed examples in each framework, please have a look at **lpz_examples** directory. 60 | 61 | ## Available Logits Processors 62 | 63 | ### GenLengthLogitsProcessor 64 | A logits processor that adjusts the likelihood of the end-of-sequence (EOS) token based on the length of the generated sequence, encouraging or discouraging shorter answers. 65 | 66 | ### CiteFromPromptLogitsProcessor 67 | A logits processor which boosts or diminishes the likelihood of tokens present in the prompt (and optionally EOS token) to encourage the model to generate tokens similar to those seen in the prompt or vice versa. 68 | 69 | ### ForceLastPhraseLogitsProcessor 70 | A logits processor which forces LLMs to use the given phrase before they finalize their answers. Most common use cases can be providing references, thanking user with context etc. 71 | 72 | ### MultipleChoiceLogitsProcessor 73 | A logits processor to answer multiple choice questions with one of the choices. A multiple choice question is like: 74 | ``` 75 | I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone? 76 | 0. Camera 77 | 1. Screen resolution 78 | 2. Operating System 79 | 3. Battery 80 | ``` 81 | The goal is to make LLM generate "3" as an answer. 82 | 83 | ### TriggerPhraseLogitsProcessor 84 | A logits processor which triggers phrases when it encounters a given token or after a specified time. 85 | One common use case is to force writing python code just after thinking: 86 | ```python 87 | trigger_python = TriggerPhraseLogitsProcessor(phrase="\n```python", trigger_token_phrase="", 88 | tokenizer=tokenizer, trigger_count=1, trigger_after=True) 89 | ``` 90 | ### PreventHallucinationLogitsProcessor 91 | A logits processor that mitigates hallucinated model outputs by enforcing a predefined fallback phrase when token confidence falls below a specified threshold. 92 | 93 | ### MaxTimeLogitsProcessor 94 | A logits processor that enforces the end-of-sentence (EOS) token after a specified maximum time passes, optionally waiting for a new line or a full stop. Useful for controlling generation time and ensuring responses complete within time constraints. 
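
A minimal transformers sketch of the same idea (the model name and prompt are illustrative, and the import path follows the package layout above):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
from logits_processor_zoo.transformers import MaxTimeLogitsProcessor

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Force the EOS token once 5 seconds of wall-clock time have passed,
# waiting for a sentence boundary before stopping.
lp = MaxTimeLogitsProcessor(tokenizer, max_time=5.0, complete_sentences=True)

inputs = tokenizer("Explain logits processors briefly.", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=256,
                        logits_processor=LogitsProcessorList([lp]))
print(tokenizer.decode(output[0], skip_special_tokens=True))
```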
-------------------------------------------------------------------------------- /logits_processor_zoo/transformers/multiple_choice.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | from transformers import PreTrainedTokenizer 19 | from typing import List 20 | import torch 21 | from logits_processor_zoo.utils import text_to_token, get_new_line_tokens, enforce_tokens 22 | from logits_processor_zoo.transformers.base import BaseLogitsProcessor 23 | 24 | 25 | class MultipleChoiceLogitsProcessor(BaseLogitsProcessor): 26 | """ 27 | A logits processor to answer multiple choice questions with one of the choices. 28 | A multiple choice question is like: 29 | I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone? 30 | 0. Camera 31 | 1. Screen resolution 32 | 2. Operating System 33 | 3. Battery 34 | The goal is to make LLM generate "3" as an answer. 35 | 36 | 37 | Parameters 38 | ---------- 39 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 40 | choices (List[str]): List of one character answers like A, B, C, D. 41 | delimiter (str): One character delimiter that comes after the choices like 1. or 2-. 42 | boost_first_words (float): Nonzero values add choices' first tokens' logits to boost performance. 43 | Especially useful for the models which have difficulty associating the choice with its text. 
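Examples
--------
A minimal sketch (the model name and question are illustrative assumptions):

>>> from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList
>>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
>>> model = AutoModelForCausalLM.from_pretrained("gpt2")
>>> mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["1", "2"], delimiter=".")
>>> question = "Which one is a fruit?\n1. Apple\n2. Chair\nAnswer:"
>>> inputs = tokenizer(question, return_tensors="pt")
>>> out = model.generate(**inputs, max_new_tokens=1,
...                      logits_processor=LogitsProcessorList([mclp]))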
44 | """ 45 | def __init__(self, tokenizer: PreTrainedTokenizer, choices: List[str] = None, delimiter: str = ".", 46 | boost_first_words: float = 0.0): 47 | super().__init__() 48 | if choices is None: 49 | choices = ["1", "2", "3", "4"] 50 | 51 | self.new_line_tokens = get_new_line_tokens(tokenizer) 52 | self.delimiter_token = text_to_token(tokenizer, delimiter, last=False) 53 | self.choice_tokens = [text_to_token(tokenizer, choice, last=False) for choice in choices] 54 | self.boost_first_words = boost_first_words 55 | self.very_large_number = 999 56 | 57 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 58 | for row_ind in range(self.prompt_token_ids.shape[0]): 59 | if self.boost_first_words: 60 | choice = 0 61 | 62 | first_tokens = [] 63 | for i in range(len(self.prompt_token_ids[row_ind]) - 3): 64 | # A choice is like "\nA) hair dryer", where first token is "hair" 65 | choice_starts = ( 66 | (self.prompt_token_ids[row_ind, i].item() in self.new_line_tokens) and 67 | (self.prompt_token_ids[row_ind, i + 1] == self.choice_tokens[choice]) and 68 | (self.prompt_token_ids[row_ind, i + 2] == self.delimiter_token) 69 | ) 70 | 71 | if choice_starts: 72 | first_tokens.append(self.prompt_token_ids[row_ind, i + 3]) 73 | choice += 1 74 | 75 | if choice >= len(self.choice_tokens): 76 | break 77 | 78 | boost = self.boost_first_words * scores[row_ind, first_tokens] 79 | scores[row_ind, self.choice_tokens[:len(first_tokens)]] += boost 80 | 81 | for i in range(scores.shape[0]): 82 | scores[i] = enforce_tokens(scores[i], self.choice_tokens) 83 | 84 | return scores 85 | -------------------------------------------------------------------------------- /logits_processor_zoo/transformers/trigger_phrase.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | import time 19 | from typing import Optional 20 | from transformers import PreTrainedTokenizer 21 | import torch 22 | from logits_processor_zoo.utils import text_to_token, enforce_tokens 23 | from logits_processor_zoo.transformers.base import BaseLogitsProcessor 24 | 25 | 26 | class TriggerPhraseLogitsProcessor(BaseLogitsProcessor): 27 | """ 28 | A logits processor which triggers phrases when it encounters a given token. 29 | 30 | Parameters 31 | ---------- 32 | tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM. 33 | batch_size (int): The batch size. 34 | phrase (str): The phrase to be generated by LLM when it encounters the trigger token. 35 | trigger_token_phrase (str): (Optional) One token phrase in string to trigger phrases. 36 | trigger_time (float): (Optional) Time (wall-clock time) in seconds after which the phrase will be triggered. 37 | trigger_count (int): How many times the phrase will be triggered. 
38 | trigger_after (bool): Whether the phrase is written after the trigger token or instead of the trigger token. 39 | """ 40 | 41 | def __init__(self, tokenizer: PreTrainedTokenizer, batch_size: int, phrase: str, 42 | trigger_token_phrase: Optional[str] = None, trigger_time: Optional[float] = None, 43 | trigger_count: int = 1, trigger_after: bool = False): 44 | 45 | assert ( 46 | trigger_token_phrase is not None or trigger_time is not None 47 | ), "Either trigger_token_phrase or trigger_time must be provided" 48 | 49 | super().__init__() 50 | 51 | self.trigger_token = None 52 | if trigger_token_phrase is not None: 53 | self.trigger_token = text_to_token(tokenizer, trigger_token_phrase, last=False) 54 | 55 | self.phrase_tokens = tokenizer.encode(phrase, add_special_tokens=False) 56 | self.trigger_after = trigger_after 57 | self.batch_size = batch_size 58 | self.initial_trigger_count = trigger_count 59 | self.trigger_time = trigger_time or float("inf") 60 | 61 | def _reset(self): 62 | self.iterators = -torch.ones(self.batch_size, dtype=torch.int32) 63 | self.trigger_count = self.initial_trigger_count * torch.ones(self.batch_size, dtype=torch.int32) 64 | self.start_time = time.time() 65 | 66 | def _process(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.Tensor: 67 | for i in range(scores.shape[0]): 68 | if self.trigger_count[i] <= 0: 69 | continue 70 | 71 | it = self.iterators[i].item() 72 | 73 | time_over = time.time() - self.start_time > self.trigger_time 74 | if (scores[i, :].argmax() == self.trigger_token or time_over) and it == -1: 75 | self.iterators[i] = 0 76 | if not self.trigger_after: 77 | scores[i] = enforce_tokens(scores[i], [self.phrase_tokens[0]]) 78 | self.iterators[i] += 1 79 | elif len(self.phrase_tokens) > it >= 0: 80 | scores[i] = enforce_tokens(scores[i], [self.phrase_tokens[it]]) 81 | self.iterators[i] += 1 82 | 83 | if len(self.phrase_tokens) == self.iterators[i].item(): # phrase completed, reset for next trigger 84 | self.iterators[i] = -1 85 | self.trigger_count[i] -= 1 86 | self.start_time = time.time() 87 | 88 | return scores 89 | -------------------------------------------------------------------------------- /logits_processor_zoo/trtllm/prevent_hallucination.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | # SPDX-License-Identifier: Apache-2.0 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | from typing import List, Optional 19 | import torch 20 | from transformers import PreTrainedTokenizer 21 | from logits_processor_zoo.utils import enforce_tokens 22 | from tensorrt_llm.sampling_params import LogitsProcessor 23 | 24 | 25 | class PreventHallucinationLogitsProcessor(LogitsProcessor): 26 | """ 27 | A logits processor that mitigates hallucinated model outputs by enforcing a predefined fallback phrase 28 | when token confidence falls below a specified threshold. 29 | 30 | This processor monitors token probabilities during generation. If the model produces a number of 31 | low-confidence tokens (below `minp`) exceeding `tolerate`, it begins injecting a fallback phrase 32 | token-by-token to gracefully indicate uncertainty. 33 | 34 | Parameters 35 | ---------- 36 | tokenizer : PreTrainedTokenizer 37 | The tokenizer used by the language model. It is used to tokenize the fallback phrase. 38 | minp : float, optional (default=0.4) 39 | The minimum probability threshold. Tokens with max probability below this are considered low-confidence. 40 | tolerate : int, optional (default=1) 41 | The number of consecutive low-confidence tokens tolerated before triggering the fallback phrase. 42 | phrase : str, optional (default="...I don't know actually.\\n") 43 | The phrase that will be inserted when hallucination is detected. It will be tokenized and injected 44 | sequentially into the generation. 45 | """ 46 | def __init__(self, tokenizer: PreTrainedTokenizer, minp: float = 0.4, tolerate: int = 1, 47 | phrase: str = "...I don't know actually.\n"): 48 | self.phrase = phrase 49 | self.eos_token_id = tokenizer.eos_token_id 50 | self.phrase_tokens = tokenizer.encode(self.phrase, add_special_tokens=False) 51 | self.tokenizer = tokenizer 52 | self.minp = minp 53 | self.tolerate = tolerate 54 | self.iterators = None 55 | self.minp_counts = None 56 | 57 | def _init_before_gen(self, beam_width): 58 | self.iterators = torch.zeros(beam_width, dtype=torch.int32) 59 | self.minp_counts = torch.zeros(beam_width, dtype=torch.int32) 60 | 61 | def __call__(self, req_id: int, logits: torch.Tensor, 62 | token_ids: List[List[int]], stream_ptr: Optional[int], 63 | client_id: Optional[int]) -> None: 64 | beam_width = len(token_ids) 65 | if self.iterators is None: 66 | self._init_before_gen(beam_width) 67 | 68 | beam_width = len(token_ids) 69 | stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr) 70 | 71 | with torch.cuda.stream(stream): 72 | for i in range(beam_width): # iterate over beams 73 | current_index = self.iterators[i].item() 74 | 75 | if logits[0, i, :].softmax(dim=-1).amax() < self.minp: 76 | self.minp_counts[i] += 1 77 | 78 | if self.minp_counts[i] > self.tolerate and current_index == 0: 79 | enforce_tokens(logits[0, i], [self.phrase_tokens[current_index]]) 80 | self.iterators[i] += 1 81 | elif len(self.phrase_tokens) > current_index > 0: 82 | enforce_tokens(logits[0, i], [self.phrase_tokens[current_index]]) 83 | self.iterators[i] += 1 84 | elif current_index == len(self.phrase_tokens): 85 | self.iterators[i] = 0 86 | self.minp_counts[i] = 0 87 | -------------------------------------------------------------------------------- /logits_processor_zoo/trtllm/multiple_choice.py: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
45 |     """
46 |     def __init__(self, tokenizer: PreTrainedTokenizer, minp: float = 0.4, tolerate: int = 1,
47 |                  phrase: str = "...I don't know actually.\n"):
48 |         self.phrase = phrase
49 |         self.eos_token_id = tokenizer.eos_token_id
50 |         self.phrase_tokens = tokenizer.encode(self.phrase, add_special_tokens=False)
51 |         self.tokenizer = tokenizer
52 |         self.minp = minp
53 |         self.tolerate = tolerate
54 |         self.iterators = None
55 |         self.minp_counts = None
56 | 
57 |     def _init_before_gen(self, beam_width):
58 |         self.iterators = torch.zeros(beam_width, dtype=torch.int32)
59 |         self.minp_counts = torch.zeros(beam_width, dtype=torch.int32)
60 | 
61 |     def __call__(self, req_id: int, logits: torch.Tensor,
62 |                  token_ids: List[List[int]], stream_ptr: Optional[int],
63 |                  client_id: Optional[int]) -> None:
64 |         beam_width = len(token_ids)
65 |         if self.iterators is None:
66 |             self._init_before_gen(beam_width)
67 | 
68 |         stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
69 | 
70 |         with torch.cuda.stream(stream):
71 |             for i in range(beam_width):  # iterate over beams
72 |                 current_index = self.iterators[i].item()
73 | 
74 |                 if logits[0, i, :].softmax(dim=-1).amax() < self.minp:
75 |                     self.minp_counts[i] += 1
76 | 
77 |                 if self.minp_counts[i] > self.tolerate and current_index == 0:
78 |                     enforce_tokens(logits[0, i], [self.phrase_tokens[current_index]])
79 |                     self.iterators[i] += 1
80 |                 elif len(self.phrase_tokens) > current_index > 0:
81 |                     enforce_tokens(logits[0, i], [self.phrase_tokens[current_index]])
82 |                     self.iterators[i] += 1
83 |                 elif current_index == len(self.phrase_tokens):
84 |                     self.iterators[i] = 0
85 |                     self.minp_counts[i] = 0
--------------------------------------------------------------------------------
/logits_processor_zoo/trtllm/multiple_choice.py:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | from transformers import PreTrainedTokenizer
19 | from typing import List, Optional
20 | import torch
21 | from logits_processor_zoo.utils import text_to_token, get_new_line_tokens, enforce_tokens
22 | from tensorrt_llm.sampling_params import LogitsProcessor
23 | 
24 | 
25 | class MultipleChoiceLogitsProcessor(LogitsProcessor):
26 |     """
27 |     A logits processor to answer multiple choice questions with one of the choices.
28 |     A multiple choice question is like:
29 |     I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone?
30 |     0. Camera
31 |     1. Screen resolution
32 |     2. Operating System
33 |     3. Battery
34 |     The goal is to make the LLM generate "3" as an answer.
35 | 
36 | 
37 |     Parameters
38 |     ----------
39 |     tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM.
40 |     choices (List[str]): List of one-character answer labels like A, B, C, D.
41 |     delimiter (str): One-character delimiter that follows each choice label, e.g. "." in "1." or "-" in "2-".
42 |     boost_first_words (float): Nonzero values add each choice's first-token logit to its label's logit.
43 |                                Especially useful for models which have difficulty associating a choice label with its text.
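    Example
    -------
    A minimal usage sketch (illustrative only; the TRT-LLM ``LLM``/``SamplingParams`` API
    and the model name are assumptions, mirroring ``lpz_examples/trtllm``)::

        from transformers import AutoTokenizer
        from tensorrt_llm import LLM, SamplingParams

        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
        llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")

        prompt = "Which one is a color?\\n1. Dog\\n2. Blue\\n3. Chair\\n4. Run\\n"
        mclp = MultipleChoiceLogitsProcessor(tokenizer, choices=["1", "2", "3", "4"],
                                             delimiter=".", boost_first_words=1.0)
        # max_tokens=1 is enough: the processor restricts the logits to the choice tokens.
        outputs = llm.generate([prompt], SamplingParams(max_tokens=1, logits_processor=mclp))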
44 | """ 45 | def __init__(self, tokenizer: PreTrainedTokenizer, choices: List[str] = None, 46 | delimiter: str = ".", boost_first_words: float = 0.0): 47 | if choices is None: 48 | choices = ["1", "2", "3", "4"] 49 | 50 | self.new_line_tokens = get_new_line_tokens(tokenizer) 51 | self.delimiter_token = text_to_token(tokenizer, delimiter, last=False) 52 | self.choice_tokens = [text_to_token(tokenizer, choice, last=False) for choice in choices] 53 | self.boost_first_words = boost_first_words 54 | self.first_tokens = list() 55 | 56 | def _init_choice_first_words(self, prompt_token_ids): 57 | choice = 0 58 | 59 | first_tokens = [] 60 | for i in range(len(prompt_token_ids) - 3): 61 | # A choice is like "\nA) hair dryer", where first token is "hair" 62 | choice_starts = ( 63 | (prompt_token_ids[i] in self.new_line_tokens) and 64 | (prompt_token_ids[i + 1] == self.choice_tokens[choice]) and 65 | (prompt_token_ids[i + 2] == self.delimiter_token) 66 | ) 67 | 68 | if choice_starts: 69 | first_tokens.append(prompt_token_ids[i + 3]) 70 | choice += 1 71 | 72 | if choice >= len(self.choice_tokens): 73 | break 74 | return first_tokens 75 | 76 | def __call__(self, req_id: int, logits: torch.Tensor, 77 | token_ids: List[List[int]], stream_ptr: Optional[int], 78 | client_id: Optional[int]) -> None: 79 | 80 | if len(self.first_tokens) == 0 and self.boost_first_words: 81 | prompt_token_ids = list(token_ids[0]) # take first beam since all beams have the same prompt 82 | self.first_tokens = self._init_choice_first_words(prompt_token_ids) 83 | 84 | beam_width = len(token_ids) 85 | stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr) 86 | 87 | with torch.cuda.stream(stream): 88 | if len(self.first_tokens) > 0: 89 | boost = self.boost_first_words * logits[0, :, self.first_tokens] 90 | logits[0, :, self.choice_tokens[:len(self.first_tokens)]] += boost 91 | for i in range(beam_width): # iterate over beams 92 | enforce_tokens(logits[0, i], self.choice_tokens) 93 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to logits-processor-zoo 2 | 3 | Contributions to logits-processor-zoo fall into the following categories: 4 | 5 | 1. To report a bug, request a new feature, or report a problem with documentation, please file an 6 | issue describing the problem or new feature 7 | in detail. The team evaluates and triages issues, and schedules them for a release. If you 8 | believe the issue needs priority attention, please comment on the issue to notify the team. 9 | 2. To propose and implement a new feature, please file a new feature request. Describe the intended feature and 10 | discuss the design and implementation with the team and community. Once the team agrees that the 11 | plan looks good, go ahead and implement it, using the [code contributions](#code-contributions) 12 | guide below. 13 | 3. To implement a feature or bug fix for an existing issue, please follow the [code 14 | contributions](#code-contributions) guide below. If you need more context on a particular issue, 15 | please ask in a comment. 16 | 17 | ## Code contributions 18 | 19 | ### Your first issue 20 | 21 | 1. Find an issue to work on. The best way is to look for the 22 | good first issue or help wanted labels. 23 | 2. Comment on the issue stating that you are going to work on it. 24 | 3. 
Create a fork of the repository and check out a branch with a name that
25 |    describes your planned work. For example, `fix-documentation`.
26 | 4. Write code to address the issue or implement the feature.
27 | 5. Add unit tests and unit benchmarks.
28 | 6. Create your Pull Request. To run continuous integration (CI) tests without requesting review, open a draft pull request.
29 | 7. Verify that CI passes all status checks. Fix if needed.
30 | 8. Wait for other developers to review your code and update code as needed.
31 | 9. Once reviewed and approved, a developer will merge your pull request.
32 | 
33 | If you are unsure about anything, don't hesitate to comment on issues and ask for clarification!
34 | 
35 | ### Seasoned developers
36 | 
37 | Look at the unassigned issues, and find an issue to which you are comfortable contributing. Start
38 | with _Step 3_ above, commenting on the issue to let others know you are working on it. If you have
39 | any questions related to the implementation of the issue, ask them in the issue instead of the PR.
40 | 
41 | #### Signing Your Work
42 | 
43 | * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
44 | 
45 | * Any contribution which contains commits that are not Signed-Off will not be accepted.
46 | 
47 | * To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
48 |   ```bash
49 |   $ git commit -s -m "Add cool feature."
50 |   ```
51 |   This will append the following to your commit message:
52 |   ```
53 |   Signed-off-by: Your Name <your@email.com>
54 |   ```
55 | 
56 | * Full text of the DCO:
57 | 
58 |   ```
59 |   Developer Certificate of Origin
60 |   Version 1.1
61 | 
62 |   Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
63 |   1 Letterman Drive
64 |   Suite D4700
65 |   San Francisco, CA, 94129
66 | 
67 |   Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
68 |   ```
69 | 
70 |   ```
71 |   Developer's Certificate of Origin 1.1
72 | 
73 |   By making a contribution to this project, I certify that:
74 | 
75 |   (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
76 | 
77 |   (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
78 | 
79 |   (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
80 | 
81 |   (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
82 |   ```
83 | 
84 | 
--------------------------------------------------------------------------------
/logits_processor_zoo/trtllm/trigger_phrase.py:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | from typing import List, Optional
19 | import time
20 | from transformers import PreTrainedTokenizer
21 | import torch
22 | from logits_processor_zoo.utils import enforce_tokens, text_to_token
23 | from tensorrt_llm.sampling_params import LogitsProcessor
24 | 
25 | 
26 | class TriggerPhraseLogitsProcessor(LogitsProcessor):
27 |     """
28 |     A logits processor which makes the LLM generate a given phrase when a trigger token is encountered or after a given amount of time has elapsed.
29 | 
30 |     Parameters
31 |     ----------
32 |     tokenizer (PreTrainedTokenizer): The tokenizer used by the LLM.
33 |     phrase (str): The phrase to be generated by the LLM when the trigger fires.
34 |     trigger_token_phrase (str): (Optional) A string that encodes to a single token; the phrase is triggered when that token becomes the most likely next token.
35 |     trigger_time (float): (Optional) Wall-clock time in seconds after which the phrase will be triggered.
36 |     trigger_count (int): How many times the phrase will be triggered.
37 |     trigger_after (bool): Whether the phrase is written after the trigger token or instead of the trigger token.
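    Example
    -------
    A minimal usage sketch (illustrative only; the TRT-LLM ``LLM``/``SamplingParams`` API,
    the model name, and the trigger settings are assumptions, mirroring ``lpz_examples/trtllm``)::

        from transformers import AutoTokenizer
        from tensorrt_llm import LLM, SamplingParams

        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
        llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")

        # After 5 seconds of generation, steer the model into wrapping up.
        lp = TriggerPhraseLogitsProcessor(tokenizer, phrase="\\n\\nTo summarize,",
                                          trigger_time=5.0, trigger_count=1)
        outputs = llm.generate(["Explain quantization in detail."],
                               SamplingParams(max_tokens=512, logits_processor=lp))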
38 |     """
39 | 
40 |     def __init__(self, tokenizer: PreTrainedTokenizer, phrase: str, trigger_token_phrase: Optional[str] = None,
41 |                  trigger_time: Optional[float] = None, trigger_count: int = 1, trigger_after: bool = False):
42 |         assert (
43 |             trigger_token_phrase is not None or trigger_time is not None
44 |         ), "Either trigger_token_phrase or trigger_time must be provided"
45 |         self.tokenizer = tokenizer
46 |         self.trigger_token = None
47 |         if trigger_token_phrase is not None:
48 |             self.trigger_token = text_to_token(self.tokenizer, trigger_token_phrase, last=False)
49 | 
50 |         self.trigger_time = trigger_time or float("inf")
51 |         self.phrase_tokens = self.tokenizer.encode(phrase, add_special_tokens=False)
52 |         self.initial_trigger_count = trigger_count
53 |         self.trigger_after = trigger_after
54 |         self.iterators = None
55 |         self.trigger_counts = None
56 |         self.start_time = time.time()
57 | 
58 |     def _init_before_gen(self, beam_width):
59 |         self.iterators = -torch.ones(beam_width, dtype=torch.int32)
60 |         self.trigger_counts = self.initial_trigger_count * torch.ones(beam_width, dtype=torch.int32)
61 | 
62 |     def __call__(self, req_id: int, logits: torch.Tensor,
63 |                  token_ids: List[List[int]], stream_ptr: Optional[int],
64 |                  client_id: Optional[int]) -> None:
65 |         beam_width = len(token_ids)
66 |         if self.iterators is None:
67 |             self._init_before_gen(beam_width)
68 | 
69 |         stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
70 | 
71 |         with torch.cuda.stream(stream):
72 |             for i in range(beam_width):  # iterate over beams
73 |                 if self.trigger_counts[i] <= 0:
74 |                     continue
75 | 
76 |                 current_index = self.iterators[i].item()
77 | 
78 |                 time_over = time.time() - self.start_time > self.trigger_time
79 |                 if (logits[0, i].argmax() == self.trigger_token or time_over) and current_index == -1:
80 |                     self.iterators[i] = 0
81 |                     if not self.trigger_after:
82 |                         enforce_tokens(logits[0, i], [self.phrase_tokens[0]])
83 |                         self.iterators[i] += 1
84 |                 elif len(self.phrase_tokens) > current_index >= 0:
85 |                     enforce_tokens(logits[0, i], [self.phrase_tokens[current_index]])
86 |                     self.iterators[i] += 1
87 | 
88 |                 if len(self.phrase_tokens) == self.iterators[i].item():  # phrase completed, reset for next trigger
89 |                     self.iterators[i] = -1
90 |                     self.trigger_counts[i] -= 1
91 |                     self.start_time = time.time()  # restart the timer so a time-based trigger does not refire immediately
92 | 
--------------------------------------------------------------------------------
/logits_processor_zoo/vllm/trigger_phrase.py:
--------------------------------------------------------------------------------
1 | #
2 | # SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3 | # SPDX-License-Identifier: Apache-2.0
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | 
18 | import time
19 | from transformers import PreTrainedTokenizer, AutoTokenizer
20 | from typing import List, Optional, Union
21 | import torch
22 | from logits_processor_zoo.utils import text_to_token, enforce_tokens
23 | 
24 | 
25 | class TriggerPhraseLogitsProcessor:
26 |     """
27 |     A logits processor which makes the LLM generate a given phrase when a trigger token is encountered or after a given amount of time has elapsed.
28 | 
29 |     Parameters
30 |     ----------
31 |     tokenizer (Union[PreTrainedTokenizer, str]): The tokenizer used by the LLM, or its name on the Hugging Face Hub.
32 |     phrase (str): The phrase to be generated by the LLM when the trigger fires.
33 |     trigger_token_phrase (str): (Optional) A string that encodes to a single token; the phrase is triggered when that token becomes the most likely next token.
34 |     trigger_time (float): (Optional) Wall-clock time in seconds after which the phrase will be triggered.
35 |     trigger_count (int): How many times the phrase will be triggered.
36 |     trigger_after (bool): Whether the phrase is written after the trigger token or instead of the trigger token.
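    Example
    -------
    A minimal usage sketch (illustrative only; it assumes vLLM's offline ``LLM`` API with
    the ``logits_processors`` sampling parameter, as in ``lpz_examples/vllm``; the model
    name and trigger settings are assumptions)::

        from vllm import LLM, SamplingParams

        # Passing the model name as the tokenizer lets the processor load it itself.
        lp = TriggerPhraseLogitsProcessor("Qwen/Qwen2.5-1.5B-Instruct",
                                          phrase="\\n\\nLet me double-check that.",
                                          trigger_token_phrase="wait", trigger_count=1)

        llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")
        # vLLM clones the processor per prompt (see clone() below), so one instance suffices.
        outputs = llm.generate(["What is 17 * 24?"],
                               SamplingParams(max_tokens=256, logits_processors=[lp]))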
37 | """ 38 | 39 | def __init__(self, tokenizer: Union[PreTrainedTokenizer, str], phrase: str, 40 | trigger_token_phrase: Optional[str] = None, trigger_time: Optional[float] = None, 41 | trigger_count: int = 1, trigger_after: bool = False): 42 | 43 | assert ( 44 | trigger_token_phrase is not None or trigger_time is not None 45 | ), "Either trigger_token_phrase or trigger_time must be provided" 46 | 47 | self.tokenizer = tokenizer 48 | if isinstance(self.tokenizer, str): 49 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer) 50 | 51 | self.phrase = phrase 52 | self.trigger_token_phrase = trigger_token_phrase 53 | self.trigger_count = trigger_count 54 | self.trigger_token = None 55 | if trigger_token_phrase is not None: 56 | self.trigger_token = text_to_token(self.tokenizer, trigger_token_phrase, last=False) 57 | 58 | self.phrase_tokens = self.tokenizer.encode(phrase, add_special_tokens=False) 59 | self.initial_trigger_count = trigger_count 60 | self.trigger_after = trigger_after 61 | self.trigger_time = trigger_time or float("inf") 62 | self._reset() 63 | 64 | # Mutable logits processor gets cloned for each prompt in a batch in order to prevent updating the same object 65 | # https://github.com/vllm-project/vllm/blob/19dcc02a72e3ed52e3bf95aae44ea1f40ce42ea0/vllm/sampling_params.py#L537-L550 66 | def clone(self): 67 | return TriggerPhraseLogitsProcessor(self.tokenizer, self.phrase, self.trigger_token_phrase, self.trigger_time, 68 | self.initial_trigger_count, self.trigger_after) 69 | 70 | def _reset(self): 71 | self.index = -1 72 | self.trigger_count = self.initial_trigger_count 73 | self.start_time = time.time() 74 | 75 | def __call__(self, prompt_tokens_ids: List[int], past_token_ids: List[int], scores: torch.Tensor) -> torch.Tensor: 76 | if not past_token_ids: # new generation 77 | self._reset() 78 | 79 | if self.trigger_count <= 0: 80 | return scores 81 | 82 | time_over = time.time() - self.start_time > self.trigger_time 83 | if (scores.argmax() == self.trigger_token or time_over) and self.index == -1: 84 | self.index = 0 85 | if not self.trigger_after: 86 | scores = enforce_tokens(scores, [self.phrase_tokens[self.index]]) 87 | self.index += 1 88 | elif len(self.phrase_tokens) > self.index >= 0: 89 | scores = enforce_tokens(scores, [self.phrase_tokens[self.index]]) 90 | self.index += 1 91 | 92 | if len(self.phrase_tokens) == self.index: # phrase completed, reset for next trigger 93 | self.index = -1 94 | self.trigger_count -= 1 95 | self.start_time = time.time() 96 | 97 | return scores 98 | -------------------------------------------------------------------------------- /lpz_examples/vllm/vllm_serve.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "59f98cf9", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "/home/aerdem/projects/nvidia/logits-processor-zoo\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "%cd ../.." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "f2a86616", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# Run vllm serve like this:\n", 29 | "# vllm serve Qwen/Qwen2.5-1.5B-Instruct --dtype auto --api-key lpz-test --logits-processor-pattern \"logits_processor_zoo.vllm\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "id": "13f407ff", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Fried rice chicken is a popular Chinese dish that combines the flavors of fried rice with the tender texture and juicy meat of chicken. Here's a basic recipe to help you make it at home:\n", 43 | "\n", 44 | "### Ingredients:\n", 45 | "- 1 pound boneless skinless chicken breast or thighs (cut into bite-sized pieces)\n", 46 | "- 2 tablespoons vegetable oil\n", 47 | "- 3 cloves garlic, minced\n", 48 | "- 1 tablespoon ginger, grated\n", 49 | "- 1/4 cup soy sauce\n", 50 | "- 1/4 cup oyster sauce\n", 51 | "- 1 teaspoon sugar\n", 52 | "- 1/2 teaspoon salt\n", 53 | "- 1/4 teaspoon black pepper\n", 54 | "- 1 can (8 oz) condensed cream of mushroom soup\n", 55 | "- 1 cup frozen mixed vegetables (such as peas, carrots, corn)\n", 56 | "- 1/2 cup chopped green onions\n", 57 | "- 1/4 cup chopped cilantro\n", 58 | "\n", 59 | "### Instructions:\n", 60 | "\n", 61 | "#### Step 1: Prepare the Chicken\n", 62 | "1. **Marinate the Chicken:** In a bowl, mix together the chicken, soy sauce, oyster sauce, sugar, salt, and black pepper.\n", 63 | "2. **Cook the Chicken:** Heat the vegetable oil in a large skillet over medium-high heat. Add the marinated chicken and cook until browned on all sides, about 5 minutes per side. Remove from the pan and set aside.\n", 64 | "\n", 65 | "#### Step 2: Cook the Vegetables\n", 66 | "1. **Sauté the Vegetables:** In the same skillet, add the remaining 1 tablespoon of oil. Sauté the minced garlic and grated ginger for about 30 seconds until fragrant.\n", 67 | "2. **Add the Mixed Vegetables:** Stir in the frozen mixed vegetables and sauté until they start to soften, about 2-3 minutes.\n", 68 | "3. **Combine Everything:** Return the cooked chicken to the skillet along with the sautéed vegetables. Pour in the condensed cream of mushroom soup and stir well to combine everything.\n", 69 | "\n", 70 | "#### Step 3: Finish Cooking\n", 71 | "1. **Simmer the Sauce:** Bring the mixture to a simmer over low heat. Let it cook for about 5 minutes, stirring occasionally, until the sauce thickens slightly.\n", 72 | "2. **Serve:** Garnish with chopped green onions and cilantro before serving. This dish can be served hot or cold depending on your preference.\n", 73 | "\n", 74 | "Enjoy your homemade fried rice chicken! 
Adjust the seasoning according to your taste preferences.\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "from openai import OpenAI\n", 80 | "\n", 81 | "model_name = \"Qwen/Qwen2.5-1.5B-Instruct\"\n", 82 | "\n", 83 | "client = OpenAI(\n", 84 | " base_url=\"http://localhost:8000/v1\",\n", 85 | " api_key=\"lpz-test\",\n", 86 | ")\n", 87 | "\n", 88 | "completion = client.chat.completions.create(\n", 89 | " model=model_name,\n", 90 | " messages=[\n", 91 | " {\"role\": \"user\", \"content\": \"Can you explain how fried rice chicken is cooked?\"}\n", 92 | " ], \n", 93 | " temperature=0,\n", 94 | " top_p=1\n", 95 | ")\n", 96 | "\n", 97 | "print(completion.choices[0].message.content)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "id": "6227231c", 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Fried rice chicken is a popular Chinese dish that combines the flavors of fried rice with the tender texture and juicy meat of chicken. Here's a basic recipe to help you make it at home:\n", 111 | "\n", 112 | "### Ingredients:\n", 113 | "- 1 pound boneless skinless chicken breast or thighs (cut into bite-sized pieces)\n", 114 | "- 2 tablespoons vegetable oil\n", 115 | "- 3 cloves garlic, minced\n", 116 | "- 1 tablespoon ginger, grated\n", 117 | "- 1/4 cup soy sauce\n", 118 | "- 1/4 cup oyster sauce\n", 119 | "- 1 teaspoon sugar\n", 120 | "- 1/2 teaspoon salt\n", 121 | "- 1/4 teaspoon black pepper\n", 122 | "- 1 can (8 oz) condensed cream of mushroom soup\n", 123 | "\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "completion = client.chat.completions.create(\n", 129 | " model=model_name,\n", 130 | " messages=[\n", 131 | " {\"role\": \"user\", \"content\": \"Can you explain how fried rice chicken is cooked?\"}\n", 132 | " ],\n", 133 | " temperature=0,\n", 134 | " top_p=1,\n", 135 | " extra_body={\n", 136 | " \"logits_processors\": [{\n", 137 | " \"qualname\": \"logits_processor_zoo.vllm.GenLengthLogitsProcessor\",\n", 138 | " \"kwargs\": {\"tokenizer\": model_name, \"boost_factor\": 0.2, \"complete_sentences\": True}\n", 139 | " }]\n", 140 | " }\n", 141 | ")\n", 142 | "\n", 143 | "print(completion.choices[0].message.content)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "96544ec2", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3 (ipykernel)", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.10.17" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 5 176 | } 177 | -------------------------------------------------------------------------------- /lpz_examples/transformers/multiple_choice_logits_processor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "28ed6952", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "/home/aerdem/projects/nvidia/logits-processor-zoo\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "%cd ../.." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "a85f8503", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stderr", 29 | "output_type": "stream", 30 | "text": [ 31 | "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", 32 | " warnings.warn(\n", 33 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", 34 | "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", 35 | " warnings.warn(\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "from lpz_examples.transformers.utils import LLMRunner\n", 41 | "from logits_processor_zoo.transformers import MultipleChoiceLogitsProcessor\n", 42 | "\n", 43 | "\n", 44 | "example_prompts = [\n", 45 | "\"\"\"\n", 46 | "I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone?\n", 47 | "0. Camera\n", 48 | "1. Screen resolution\n", 49 | "2. Operating System\n", 50 | "3. Battery\n", 51 | "\"\"\",\n", 52 | "\n", 53 | "\"\"\"\n", 54 | "Which user review doesn't belong to a summer dress?\n", 55 | "a) Looks good\n", 56 | "b) Keeps warm\n", 57 | "c) Too long\n", 58 | "d) Liked the color\n", 59 | "\"\"\"\n", 60 | "]\n", 61 | "\n", 62 | "runner = LLMRunner()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "id": "859aef8d", 68 | "metadata": {}, 69 | "source": [ 70 | "## Default Response" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "id": "cbf4c2d5", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stderr", 81 | "output_type": "stream", 82 | "text": [ 83 | "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:392: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `None` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n", 84 | " warnings.warn(\n", 85 | "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:397: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `None` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`.\n", 86 | " warnings.warn(\n", 87 | "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:407: UserWarning: `do_sample` is set to `False`. However, `top_k` is set to `None` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_k`.\n", 88 | " warnings.warn(\n" 89 | ] 90 | }, 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "Prompt: \n", 96 | "I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone?\n", 97 | "0. Camera\n", 98 | "1. Screen resolution\n", 99 | "2. Operating System\n", 100 | "3. 
Battery\n", 101 | "\n", 102 | "\n", 103 | "LLM response:\n", 104 | "When\n", 105 | "-----END-----\n", 106 | "\n", 107 | "Prompt: \n", 108 | "Which user review doesn't belong to a summer dress?\n", 109 | "a) Looks good\n", 110 | "b) Keeps warm\n", 111 | "c) Too long\n", 112 | "d) Liked the color\n", 113 | "\n", 114 | "\n", 115 | "LLM response:\n", 116 | "The\n", 117 | "-----END-----\n", 118 | "\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "runner.generate_response(example_prompts, max_tokens=1)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "id": "88bc2f8a", 129 | "metadata": {}, 130 | "source": [ 131 | "## Multiple Choice Answer" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 4, 137 | "id": "7d74eb26", 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "Prompt: \n", 145 | "I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone?\n", 146 | "0. Camera\n", 147 | "1. Screen resolution\n", 148 | "2. Operating System\n", 149 | "3. Battery\n", 150 | "\n", 151 | "\n", 152 | "LLM response:\n", 153 | "1\n", 154 | "-----END-----\n", 155 | "\n", 156 | "Prompt: \n", 157 | "Which user review doesn't belong to a summer dress?\n", 158 | "a) Looks good\n", 159 | "b) Keeps warm\n", 160 | "c) Too long\n", 161 | "d) Liked the color\n", 162 | "\n", 163 | "\n", 164 | "LLM response:\n", 165 | "b\n", 166 | "-----END-----\n", 167 | "\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "mclp = MultipleChoiceLogitsProcessor(runner.tokenizer, choices=[\"0\", \"1\", \"2\", \"3\"], delimiter=\".\")\n", 173 | "\n", 174 | "runner.generate_response(example_prompts[:1], [mclp], max_tokens=1)\n", 175 | "\n", 176 | "mclp = MultipleChoiceLogitsProcessor(runner.tokenizer, choices=[\"a\", \"b\", \"c\", \"d\"], delimiter=\")\")\n", 177 | "\n", 178 | "runner.generate_response(example_prompts[1:], [mclp], max_tokens=1)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "15b5afa5", 184 | "metadata": {}, 185 | "source": [ 186 | "## Multiple Choice Answer by boosting first words of options" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 5, 192 | "id": "b2297aab", 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "Prompt: \n", 200 | "I am getting a lot of calls during the day. What is more important for me to consider when I buy a new phone?\n", 201 | "0. Camera\n", 202 | "1. Screen resolution\n", 203 | "2. Operating System\n", 204 | "3. 
Battery\n", 205 | "\n", 206 | "\n", 207 | "LLM response:\n", 208 | "3\n", 209 | "-----END-----\n", 210 | "\n", 211 | "Prompt: \n", 212 | "Which user review doesn't belong to a summer dress?\n", 213 | "a) Looks good\n", 214 | "b) Keeps warm\n", 215 | "c) Too long\n", 216 | "d) Liked the color\n", 217 | "\n", 218 | "\n", 219 | "LLM response:\n", 220 | "a\n", 221 | "-----END-----\n", 222 | "\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "mclp = MultipleChoiceLogitsProcessor(\n", 228 | " runner.tokenizer, choices=[\"0\", \"1\", \"2\", \"3\"], delimiter=\".\", boost_first_words=1.0\n", 229 | ")\n", 230 | "\n", 231 | "runner.generate_response(example_prompts[:1], [mclp], max_tokens=1)\n", 232 | "\n", 233 | "mclp = MultipleChoiceLogitsProcessor(\n", 234 | " runner.tokenizer, choices=[\"a\", \"b\", \"c\", \"d\"], delimiter=\")\", boost_first_words=1.0\n", 235 | ")\n", 236 | "\n", 237 | "runner.generate_response(example_prompts[1:], [mclp], max_tokens=1)" 238 | ] 239 | } 240 | ], 241 | "metadata": { 242 | "kernelspec": { 243 | "display_name": "Python 3 (ipykernel)", 244 | "language": "python", 245 | "name": "python3" 246 | }, 247 | "language_info": { 248 | "codemirror_mode": { 249 | "name": "ipython", 250 | "version": 3 251 | }, 252 | "file_extension": ".py", 253 | "mimetype": "text/x-python", 254 | "name": "python", 255 | "nbconvert_exporter": "python", 256 | "pygments_lexer": "ipython3", 257 | "version": "3.10.17" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 5 262 | } 263 | -------------------------------------------------------------------------------- /lpz_examples/vllm/multiple_choice_logits_processor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "28ed6952", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "/home/aerdem/projects/nvidia/logits-processor-zoo\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "%cd ../.." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "b89279fe", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "WARNING 04-29 15:34:27 cuda.py:22] You are using a deprecated `pynvml` package. Please install `nvidia-ml-py` instead. See https://pypi.org/project/pynvml for more information.\n" 32 | ] 33 | }, 34 | { 35 | "name": "stderr", 36 | "output_type": "stream", 37 | "text": [ 38 | "/home/aerdem/projects/LLM/llmenv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", 39 | " warnings.warn(\n" 40 | ] 41 | }, 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "WARNING 04-29 15:34:30 config.py:1563] Casting torch.bfloat16 to torch.float16.\n", 47 | "INFO 04-29 15:34:30 llm_engine.py:184] Initializing an LLM engine (v0.5.5) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, use_v2_block_manager=False, enable_prefix_caching=False)\n" 48 | ] 49 | }, 50 | { 51 | "name": "stderr", 52 | "output_type": "stream", 53 | "text": [ 54 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 55 | ] 56 | }, 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "INFO 04-29 15:34:32 model_runner.py:879] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n", 62 | "INFO 04-29 15:34:32 weight_utils.py:236] Using model weights format ['*.safetensors']\n", 63 | "INFO 04-29 15:34:33 weight_utils.py:280] No model.safetensors.index.json found in remote.\n" 64 | ] 65 | }, 66 | { 67 | "data": { 68 | "application/vnd.jupyter.widget-view+json": { 69 | "model_id": "", 70 | "version_major": 2, 71 | "version_minor": 0 72 | }, 73 | "text/plain": [ 74 | "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver\n", 34 | "INFO 06-18 16:20:24 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.\n", 35 | "WARNING 06-18 16:20:25 [config.py:3135] Casting torch.bfloat16 to torch.float16.\n", 36 | "INFO 06-18 16:20:30 [config.py:793] This model supports multiple tasks: {'score', 'generate', 'embed', 'reward', 'classify'}. Defaulting to 'generate'.\n", 37 | "WARNING 06-18 16:20:30 [cuda.py:87] To see benefits of async output processing, enable CUDA graph. 
Since, enforce-eager is enabled, async output processor cannot be used\n", 38 | "INFO 06-18 16:20:30 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=None, served_model_name=Qwen/Qwen2.5-1.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, pooler_config=None, compilation_config={\"compile_sizes\": [], \"inductor_compile_config\": {\"enable_auto_functionalized_v2\": false}, \"cudagraph_capture_sizes\": [], \"max_capture_size\": 0}, use_cached_outputs=False, \n", 39 | "INFO 06-18 16:20:32 [cuda.py:292] Using Flash Attention backend.\n", 40 | "INFO 06-18 16:20:32 [parallel_state.py:1064] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0\n", 41 | "INFO 06-18 16:20:32 [model_runner.py:1170] Starting to load model Qwen/Qwen2.5-1.5B-Instruct...\n", 42 | "INFO 06-18 16:20:32 [weight_utils.py:291] Using model weights format ['*.safetensors']\n", 43 | "INFO 06-18 16:20:33 [weight_utils.py:344] No model.safetensors.index.json found in remote.\n" 44 | ] 45 | }, 46 | { 47 | "data": { 48 | "application/vnd.jupyter.widget-view+json": { 49 | "model_id": "e79891b6e4a4416696c420ff78e9b058", 50 | "version_major": 2, 51 | "version_minor": 0 52 | }, 53 | "text/plain": [ 54 | "Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00