├── .gitignore ├── LICENSE ├── README.md ├── assets └── results.png ├── config.json ├── package.json ├── populate_index.py ├── requirements.txt ├── src ├── __init__.py ├── engines │ ├── __init__.py │ ├── engine_google_vertex.py │ ├── engine_llamacpp.py │ └── engine_mockup.py ├── evals │ ├── __init__.py │ ├── assets │ │ └── sample_bill.jpg │ ├── components │ │ ├── __init__.py │ │ ├── api_builder.py │ │ ├── factorization.py │ │ └── paper.py │ ├── eval_computation_graphs.py │ ├── eval_in_context_associations.py │ ├── eval_logic_components.py │ ├── eval_multimodal_bindings.py │ ├── eval_program_synthesis.py │ └── snippets │ │ ├── code_api_builder.txt │ │ ├── code_api_builder2.txt │ │ ├── code_api_builder_website_result.txt │ │ ├── einstein_puzzle.txt │ │ ├── einstein_puzzle_human_solution.txt │ │ ├── einstein_puzzle_logic_solution.txt │ │ ├── formulations_dsl_rewriting.txt │ │ ├── google_organic_results_20240111_query=What-is-sulfuric-acid.txt │ │ ├── google_organic_results_20240121_query=Search-for-U-235.txt │ │ ├── jays_brother_human_solution.txt │ │ ├── jays_brother_trajectories.txt │ │ ├── latex_templating_output.txt │ │ ├── latex_templating_problem.txt │ │ ├── latex_templating_solution_1.txt │ │ ├── latex_templating_solution_2.txt │ │ ├── paper │ │ ├── bib │ │ │ └── related_work │ │ │ │ ├── laird87.txt │ │ │ │ ├── mccarthy06.txt │ │ │ │ ├── newell56.txt │ │ │ │ ├── newell57.txt │ │ │ │ └── newell72.txt │ │ ├── method │ │ │ └── symbolicai_docs.txt │ │ ├── ref │ │ │ ├── reference_abstract.txt │ │ │ ├── reference_paper.txt │ │ │ ├── reference_section_framework.txt │ │ │ ├── reference_section_relatedwork.txt │ │ │ └── reference_title.txt │ │ └── traj │ │ │ ├── reference_abstract.txt │ │ │ ├── reference_paper.txt │ │ │ ├── reference_section_framework.txt │ │ │ ├── reference_section_relatedwork.txt │ │ │ └── reference_title.txt │ │ ├── richard_feynman_summary.txt │ │ ├── sample_bill.txt │ │ └── wiki_page_20240121.txt ├── func.py ├── report.py └── utils.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/
161 | src/evals/.DS_Store
162 | src/.DS_Store
163 | .DS_Store
164 | experiments
165 | symai.config.json
166 | 
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) 2023, ExtensityAI
4 | 
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 | 
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its
16 |    contributors may be used to endorse or promote products derived from
17 |    this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Benchmark
2 | 
3 | ## SymbolicAI: A framework for logic-based approaches combining generative models and solvers
4 | 
5 | We introduce SymbolicAI, a versatile and modular framework employing a logic-based approach to concept learning and flow management in generative processes. SymbolicAI enables the seamless integration of generative models with a diverse range of solvers by treating large language models (LLMs) as semantic parsers that execute tasks based on both natural and formal language instructions, thus bridging the gap between symbolic reasoning and generative AI. We leverage probabilistic programming principles to tackle complex tasks, and utilize differentiable and classical programming paradigms with their respective strengths. The framework introduces a set of polymorphic, compositional, and self-referential operations for data stream manipulation, aligning LLM outputs with user objectives. As a result, we can transition between the capabilities of various foundation models endowed with zero- and few-shot learning capabilities and specialized, fine-tuned models or solvers proficient in addressing specific problems. In turn, the framework facilitates the creation and evaluation of explainable computational graphs. We conclude by introducing a quality measure and its empirical score for evaluating these computational graphs, and propose a benchmark that compares various state-of-the-art LLMs across a set of complex workflows. We refer to the empirical score as the "Vector Embedding for Relational Trajectory Evaluation through Cross-similarity", or VERTEX score for short. The SymbolicAI framework codebase is available [here](https://github.com/ExtensityAI/symbolicai).
6 | 
7 | ![Results](assets/results.png)
8 | 
9 | ## Installation
10 | 
11 | ### Requirements
12 | 
13 | Install the dependencies.
14 | 
15 | ```bash
16 | pip install "symbolicai[all]"
17 | pip install -r requirements.txt
18 | ```
19 | 
20 | Install the LlamaCpp backend.
21 | 
22 | ```bash
23 | sympkg i ExtensityAI/llamacpp
24 | ```
25 | 
26 | Then follow the instructions in the [ExtensityAI/llamacpp](https://github.com/ExtensityAI/llamacpp) repository to install and run the LlamaCpp backend with various HuggingFace models.
27 | 
28 | Install the embeddings backend.
29 | 
30 | ```bash
31 | sympkg i ExtensityAI/embeddings
32 | ```
33 | 
34 | ## Configuration
35 | 
36 | Set the respective `config.json` properties for engine API keys and local models as shown below, and run the local models with the configured port and host name.
37 | 
38 | ```json
39 | {
40 |     "gpt4": {
41 |         "api_key": "",
42 |         "model": "gpt-4-1106-preview"
43 |     },
44 |     "gpt3.5": {
45 |         "api_key": "",
46 |         "model": "gpt-3.5-turbo-1106"
47 |     },
48 |     "gemini": {
49 |         "api_key": "",
50 |         "model": "gemini-pro"
51 |     },
52 |     "llama": {
53 |         "host": "http://localhost",
54 |         "port": 8080
55 |     },
56 |     ...
57 | }
58 | ```
59 | 
60 | ## Usage
61 | 
62 | Run the full benchmark.
63 | 
64 | ```bash
65 | python test.py --context_associations --program_synthesis --multimodal_bindings --logic_components --computation_graphs
66 | ```
67 | 
68 | This will run all the evaluations in the benchmark.
69 | 
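Each flag selects one evaluation suite, so subsets can be benchmarked in isolation. For example, a minimal sketch running only the in-context associations and logic suites (flag names taken from the full command above):

```bash
python test.py --context_associations --logic_components
```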
70 | ## Cite us
71 | 
72 | ```bibtex
73 | @article{
74 | Dinu:24,
75 |     title={SymbolicAI: A framework for logic-based approaches combining generative models and solvers},
76 |     author={Marius-Constantin Dinu and Claudiu Leoveanu-Condrei and Markus Holzleitner and Werner Zellinger and Sepp Hochreiter},
77 |     year={2024},
78 |     eprint={2402.00854},
79 |     archivePrefix={arXiv},
80 |     primaryClass={cs.LG},
81 |     url={https://arxiv.org/abs/2402.00854}
82 | }
83 | ```
84 | 
-------------------------------------------------------------------------------- /assets/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/assets/results.png
-------------------------------------------------------------------------------- /config.json: --------------------------------------------------------------------------------
1 | {
2 |     "gpt4": {
3 |         "api_key": "",
4 |         "model": "gpt-4-1106-preview"
5 |     },
6 |     "gpt3.5": {
7 |         "api_key": "",
8 |         "model": "gpt-3.5-turbo"
9 |     },
10 |     "gemini": {
11 |         "api_key": "",
12 |         "model": "gemini-1.0-pro"
13 |     },
14 |     "llama": {
15 |         "host": "http://localhost",
16 |         "port": 8080
17 |     },
18 |     "zephyr": {
19 |         "host": "http://localhost",
20 |         "port": 8081
21 |     },
22 |     "mistral": {
23 |         "host": "http://localhost",
24 |         "port": 8082
25 |     },
26 |     "llama3_8B": {
27 |         "host": "http://localhost",
28 |         "port": 8083
29 |     },
30 |     "llama3_70B": {
31 |         "host": "http://localhost",
32 |         "port": 8084
33 |     },
34 |     "gemini1.5": {
35 |         "api_key": "",
36 |         "model": "gemini-1.5-pro-latest"
37 |     }
38 | }
39 | 
-------------------------------------------------------------------------------- /package.json: --------------------------------------------------------------------------------
1 | {
2 |     "version": "0.0.1",
3 |     "name": "ExtensityAI/benchmark",
4 |     "description": "Evaluation of the neuro-symbolic framework.",
5 |     "expressions": [
6 |         {
7 |             "module": "src/func",
8 |             "type": "EvaluateBenchmark"
9 |         }
10 |     ],
11 |     "run": {
12 |         "module": "src/func",
13 |         "type": "EvaluateBenchmark"
14 |     },
15 |     "dependencies": [
16 |         "ExtensityAI/llamacpp",
17 |         "ExtensityAI/tuning"
18 |     ]
19 | }
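Since `package.json` wires the `run` entry to the `EvaluateBenchmark` expression in `src/func`, the package can presumably also be installed and launched through SymbolicAI's package tooling; the `symrun` invocation below is an assumption based on that declaration, and `test.py` remains the documented entry point:

```bash
sympkg i ExtensityAI/benchmark
symrun ExtensityAI/benchmark
```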
"ExtensityAI/benchmark", 4 | "description": "Evaluation of the neuro-symbolic framework.", 5 | "expressions": [ 6 | { 7 | "module": "src/func", 8 | "type": "EvaluateBenchmark" 9 | } 10 | ], 11 | "run": { 12 | "module": "src/func", 13 | "type": "EvaluateBenchmark" 14 | }, 15 | "dependencies": [ 16 | "ExtensityAI/llamacpp", 17 | "ExtensityAI/tuning" 18 | ] 19 | } -------------------------------------------------------------------------------- /populate_index.py: -------------------------------------------------------------------------------- 1 | from symai.shellsv import retrieval_augmented_indexing 2 | from symai.functional import EngineRepository 3 | from symai.backend.engines.index.engine_pinecone import PineconeIndexEngine 4 | from symai.backend.engines.index.engine_vectordb import VectorDBIndexEngine 5 | 6 | 7 | def run(): 8 | # Register embeddings engine globally for all Symbols from plugin 9 | EngineRepository.register_from_plugin('embedding', plugin='ExtensityAI/embeddings', kwargs={'model': 'all-mpnet-base-v2'}, allow_engine_override=True) 10 | # EngineRepository.register('index', PineconeIndexEngine(index_name='dataindex', 11 | # index_dims=768, 12 | # index_top_k=5)) 13 | vectorDB = VectorDBIndexEngine(index_name='dataindex', 14 | index_dims=768, 15 | index_top_k=5) 16 | EngineRepository.register('index', vectorDB) 17 | # insert into the index 18 | retrieval_augmented_indexing('!src/evals/snippets', index_name='dataindex') 19 | # # need to persist in-memory to disk 20 | vectorDB.save() 21 | 22 | 23 | if __name__ == '__main__': 24 | run() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | backoff 2 | seaborn 3 | google-cloud-aiplatform 4 | google-generativeai 5 | anthropic 6 | wandb 7 | parso 8 | sympy 9 | z3-solver -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/__init__.py -------------------------------------------------------------------------------- /src/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/engines/__init__.py -------------------------------------------------------------------------------- /src/engines/engine_google_vertex.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import google.generativeai as genai 3 | 4 | from typing import List, Optional 5 | 6 | from symai.backend.base import Engine 7 | from symai.backend.settings import SYMAI_CONFIG 8 | 9 | 10 | logging.getLogger("requests").setLevel(logging.ERROR) 11 | logging.getLogger("urllib").setLevel(logging.ERROR) 12 | logging.getLogger("httpx").setLevel(logging.ERROR) 13 | logging.getLogger("httpcore").setLevel(logging.ERROR) 14 | 15 | 16 | class GoogleGeminiEngine(Engine): 17 | def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None): 18 | super().__init__() 19 | logger = logging.getLogger('vertexai') 20 | logger.setLevel(logging.WARNING) 21 | self.config = SYMAI_CONFIG 22 | # Initialize the Vertex AI project 23 | self.api_key = api_key 24 | genai.configure(api_key=api_key) 25 | # Create a generative model instance from 
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | backoff
2 | seaborn
3 | google-cloud-aiplatform
4 | google-generativeai
5 | anthropic
6 | wandb
7 | parso
8 | sympy
9 | z3-solver
-------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/__init__.py
-------------------------------------------------------------------------------- /src/engines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/engines/__init__.py
-------------------------------------------------------------------------------- /src/engines/engine_google_vertex.py: --------------------------------------------------------------------------------
1 | import logging
2 | import google.generativeai as genai
3 | 
4 | from typing import List, Optional
5 | 
6 | from symai.backend.base import Engine
7 | from symai.backend.settings import SYMAI_CONFIG
8 | 
9 | 
10 | logging.getLogger("requests").setLevel(logging.ERROR)
11 | logging.getLogger("urllib").setLevel(logging.ERROR)
12 | logging.getLogger("httpx").setLevel(logging.ERROR)
13 | logging.getLogger("httpcore").setLevel(logging.ERROR)
14 | 
15 | 
16 | class GoogleGeminiEngine(Engine):
17 |     def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
18 |         super().__init__()
19 |         logger = logging.getLogger('vertexai')
20 |         logger.setLevel(logging.WARNING)
21 |         self.config = SYMAI_CONFIG
22 |         # Configure the Google Generative AI client with the API key
23 |         self.api_key = api_key
24 |         genai.configure(api_key=api_key)
25 |         # Create a generative model instance from the Google Generative AI SDK
26 |         self.model = genai.GenerativeModel(model_name=model)
27 |         self.max_tokens = 32_760 - 100 # @NOTE: account for tolerance.
28 |         self.seed = None
29 |         self.except_remedy = None
30 | 
31 |     def id(self) -> str:
32 |         if self.config['NEUROSYMBOLIC_ENGINE_MODEL'] and \
33 |            self.config['NEUROSYMBOLIC_ENGINE_MODEL'].startswith('gemini'):
34 |             return 'neurosymbolic'
35 |         return super().id() # default to unregistered
36 | 
37 |     def command(self, *args, **kwargs):
38 |         super().command(*args, **kwargs)
39 |         if 'NEUROSYMBOLIC_ENGINE_MODEL' in kwargs:
40 |             self.model = kwargs['NEUROSYMBOLIC_ENGINE_MODEL']
41 |         if 'seed' in kwargs:
42 |             self.seed = kwargs['seed']
43 |         if 'except_remedy' in kwargs:
44 |             self.except_remedy = kwargs['except_remedy']
45 | 
46 |     def compute_remaining_tokens(self, prompts: list) -> int:
47 |         return int((8_192) * 0.99) # @NOTE: account for tolerance.
48 | 
49 |     def forward(self, argument):
50 |         kwargs = argument.kwargs
51 |         prompts_ = argument.prop.prepared_input
52 | 
53 |         # send prompt to the Gemini chat model
54 |         stop = kwargs['stop'] if 'stop' in kwargs else None
55 |         model = kwargs['model'] if 'model' in kwargs else self.model
56 |         seed = kwargs['seed'] if 'seed' in kwargs else self.seed
57 | 
58 |         # read generation parameters from kwargs, falling back to defaults
59 |         max_tokens = kwargs['max_tokens'] if 'max_tokens' in kwargs else self.compute_remaining_tokens(prompts_)
60 |         temperature = kwargs['temperature'] if 'temperature' in kwargs else 0.1
61 |         top_p = kwargs['top_p'] if 'top_p' in kwargs else 1
62 |         top_k = kwargs['top_k'] if 'top_k' in kwargs else 40
63 |         except_remedy = kwargs['except_remedy'] if 'except_remedy' in kwargs else self.except_remedy
64 | 
65 |         try:
66 |             res = model.generate_content(
67 |                 prompts_,
68 |                 generation_config={
69 |                     "temperature": temperature,
70 |                     "max_output_tokens": max_tokens,
71 |                     "top_p": top_p,
72 |                     "top_k": top_k
73 |                 }
74 |             )
75 | 
76 |         except Exception as e:
77 |             callback = model.generate_content
78 |             kwargs['model'] = kwargs['model'] if 'model' in kwargs else self.model
79 |             if except_remedy is not None:
80 |                 res = except_remedy(self, e, callback, argument)
81 |             else:
82 |                 raise e
83 | 
84 |         metadata = {}
85 |         output = [res.text]
86 |         return output, metadata
87 | 
88 |     def prepare(self, argument):
89 |         if argument.prop.raw_input:
90 |             if not argument.prop.processed_input:
91 |                 raise ValueError('Need to provide a prompt instruction to the engine if raw_input is enabled.')
92 |             argument.prop.prepared_input = str(argument.prop.processed_input)
93 |             return
94 | 
95 |         _non_verbose_output = """[META INSTRUCTIONS START]\nYou do not output anything else, like verbose preambles or post explanation, such as "Sure, let me...", "Hope that was helpful...", "Yes, I can help you with that...", etc. Consider well formatted output, e.g. for sentences use punctuation, spaces etc. or for code use indentation, etc. Never add meta instructions information to your output!\n"""
96 |         user:   str = ""
97 |         system: str = ""
98 | 
99 |         if argument.prop.suppress_verbose_output:
100 |             system += _non_verbose_output
101 |         system = f'{system}\n' if system and len(system) > 0 else ''
102 | 
103 |         ref = argument.prop.instance
104 |         static_ctxt, dyn_ctxt = ref.global_context
105 |         if len(static_ctxt) > 0:
106 |             system += f"[STATIC CONTEXT]\n{static_ctxt}\n\n"
107 | 
108 |         if len(dyn_ctxt) > 0:
109 |             system += f"[DYNAMIC CONTEXT]\n{dyn_ctxt}\n\n"
110 | 
111 |         payload = argument.prop.payload
112 |         if argument.prop.payload:
113 |             system += f"[ADDITIONAL CONTEXT]\n{str(payload)}\n\n"
114 | 
115 |         examples: List[str] = argument.prop.examples
116 |         if examples and len(examples) > 0:
117 |             system += f"[EXAMPLES]\n{str(examples)}\n\n"
118 | 
119 |         if argument.prop.prompt is not None and len(argument.prop.prompt) > 0:
120 |             val = str(argument.prop.prompt)
121 |             system += f"[INSTRUCTION]\n{val}"
122 | 
123 |         suffix: str = str(argument.prop.processed_input)
124 | 
125 |         if '[SYSTEM_INSTRUCTION::]: <<<' in suffix and argument.prop.parse_system_instructions:
126 |             parts = suffix.split('\n>>>\n')
127 |             # first parts are the system instructions
128 |             c = 0
129 |             for i, p in enumerate(parts):
130 |                 if 'SYSTEM_INSTRUCTION' in p:
131 |                     system += f"{p}\n"
132 |                     c += 1
133 |                 else:
134 |                     break
135 |             # last part is the user input
136 |             suffix = '\n>>>\n'.join(parts[c:])
137 |         user += f"{suffix}"
138 | 
139 |         if argument.prop.template_suffix:
140 |             user += f"\n[[PLACEHOLDER]]\n{str(argument.prop.template_suffix)}\n\n"
141 |             user += f"Only generate content for the placeholder `[[PLACEHOLDER]]` following the instructions and context information. Do NOT write `[[PLACEHOLDER]]` or anything else in your output.\n\n"
142 | 
143 |         argument.prop.prepared_input = f'---------SYSTEM BEHAVIOR--------\n{system}\n\n---------USER REQUEST--------\n{user}'
144 | 
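A hedged sketch of how this engine could be wired in as the neuro-symbolic backend, mirroring the `EngineRepository.register` pattern used in `populate_index.py`; the registration below is an illustration, not code from this repository:

```python
from symai.functional import EngineRepository
from src.engines.engine_google_vertex import GoogleGeminiEngine

# register Gemini as the global neuro-symbolic backend; the engine's id()
# resolves to 'neurosymbolic' when a gemini model is configured
engine = GoogleGeminiEngine(api_key='<YOUR_API_KEY>', model='gemini-pro')
EngineRepository.register('neurosymbolic', engine)
```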
-------------------------------------------------------------------------------- /src/engines/engine_llamacpp.py: --------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | import json
4 | 
5 | from typing import List
6 | from requests_toolbelt.multipart.encoder import MultipartEncoder
7 | 
8 | from symai.backend.base import Engine
9 | from symai.backend.settings import SYMAI_CONFIG
10 | from symai.symbol import Result
11 | 
12 | 
13 | class LLaMAResult(Result):
14 |     def __init__(self, value=None, *args, **kwargs):
15 |         super().__init__(value, *args, **kwargs)
16 |         self._value = value
17 |         self.error = None
18 |         self.raw = value
19 |         self._parse_result()
20 | 
21 |     def _parse_result(self):
22 |         val = json.loads(self.value)
23 |         self.value = val
24 |         if 'error' in val:
25 |             self.error = val['error']
26 |         if 'content' in val:
27 |             self.value = val['content']
28 | 
29 | 
30 | class LLaMACppClientEngine(Engine):
31 |     def __init__(self, host: str = 'http://localhost', port: int = 8080, uri: str = 'completion', timeout: int = 600):
32 |         super().__init__()
33 |         logger = logging.getLogger('nesy_client')
34 |         logger.setLevel(logging.WARNING)
35 |         self.config = SYMAI_CONFIG
36 |         self.host = host
37 |         self.port = port
38 |         self.uri = uri
39 |         self.timeout = timeout
40 |         self.seed = None
41 |         self.except_remedy = None
42 | 
43 |     def id(self) -> str:
44 |         if self.config['CAPTION_ENGINE_MODEL'] and \
45 |            'llamacpp' in self.config['CAPTION_ENGINE_MODEL']:
46 |             return 'neurosymbolic'
47 |         return super().id() # default to unregistered
48 | 
49 |     def command(self, *args, **kwargs):
50 |         super().command(*args, **kwargs)
51 |         if 'seed' in kwargs:
52 |             self.seed = kwargs['seed']
53 |         if 'except_remedy' in kwargs:
54 |             self.except_remedy = kwargs['except_remedy']
55 | 
56 |     @property
57 |     def max_tokens(self):
58 |         return 2048
59 | 
60 |     def compute_remaining_tokens(self, prompts: list) -> int:
61 |         return int((1024) * 0.99) # @NOTE: account for tolerance.
62 | 
63 |     def forward(self, argument):
64 |         prompts = argument.prop.prepared_input
65 |         kwargs = argument.kwargs
66 | 
67 |         model_kwargs = {}
68 | 
69 |         # read generation parameters from kwargs, falling back to defaults
70 |         stop = kwargs['stop'] if 'stop' in kwargs else None
71 |         seed = kwargs['seed'] if 'seed' in kwargs else self.seed
72 |         max_tokens = kwargs['max_tokens'] if 'max_tokens' in kwargs else self.compute_remaining_tokens(prompts)
73 |         temperature = kwargs['temperature'] if 'temperature' in kwargs else 0.7
74 |         top_p = kwargs['top_p'] if 'top_p' in kwargs else 0.95
75 |         top_k = kwargs['top_k'] if 'top_k' in kwargs else 40
76 |         except_remedy = kwargs['except_remedy'] if 'except_remedy' in kwargs else self.except_remedy
77 | 
78 |         if stop is not None:
79 |             model_kwargs['stop'] = stop
80 |         if seed is not None:
81 |             model_kwargs['seed'] = seed
82 |         if max_tokens is not None:
83 |             model_kwargs['n_predict'] = max_tokens
84 |         if temperature is not None:
85 |             model_kwargs['temperature'] = temperature
86 |         if top_p is not None:
87 |             model_kwargs['top_p'] = top_p
88 |         if top_k is not None:
89 |             model_kwargs['top_k'] = top_k
90 | 
91 |         # Build the JSON payload for the llama.cpp completion endpoint;
92 |         # the server expects JSON, not multipart/form-data
93 |         prompt = prompts[0] if prompts[0] is not None and len(prompts[0]) > 0 else ' ' # @NOTE: space char to produce at least empty prompt and avoid exception on server side
94 |         payload = {
95 |             'prompt': prompt,
96 |             **model_kwargs
97 |         }
98 |         headers = {'Content-Type': 'application/json'}
99 |         api = f'{self.host}:{self.port}/{self.uri}'
100 |         try:
101 |             rsp = requests.post(api, json=payload, headers=headers, timeout=self.timeout)
102 |             # Verify the success of the response
103 |             rsp.raise_for_status()
104 |             res = rsp.text
105 |         except requests.exceptions.HTTPError as e:
106 |             if except_remedy is None:
107 |                 self.logger.error(f"HTTP error occurred: {e}")
108 |                 # Here you can add more sophisticated error handling and recovery
109 |                 raise e
110 |             # Retry the request or handle it based on the exception remedy provided
111 |             callback = lambda: requests.post(api, json=payload, headers=headers, timeout=self.timeout)
112 |             res = except_remedy(self, e, callback, argument)
113 |         except requests.exceptions.RequestException as e:
114 |             # Handle non-HTTP exceptions (e.g., network errors, timeout)
115 |             if except_remedy is None:
116 |                 self.logger.error(f"Request error occurred: {e}")
117 |                 raise e
118 |             # Retry the request or handle it based on the exception remedy provided
119 |             callback = lambda: requests.post(api, json=payload, headers=headers, timeout=self.timeout)
120 |             res = except_remedy(self, e, callback, argument)
121 |         except Exception as e:
122 |             # Handle unforeseen exceptions
123 |             self.logger.error(f"An unexpected error occurred: {e}")
124 |             raise e
125 | 
126 |         metadata = {}
127 | 
128 |         try:
129 |             res = LLaMAResult(res)
130 |         except json.JSONDecodeError:
131 |             # Handle a JSON parse error specifically
132 |             self.logger.error(f"JSON parse error: Invalid response {res}")
133 |             raise Exception(f"Invalid response: {res}")
134 | 
135 |         rsp = [res]
136 |         output = rsp if isinstance(prompts,
list) else rsp[0] 137 | return output, metadata 138 | 139 | def prepare(self, argument): 140 | if argument.prop.raw_input: 141 | if not argument.prop.processed_input: 142 | raise ValueError('Need to provide a prompt instruction to the engine if raw_input is enabled.') 143 | argument.prop.prepared_input = [str(argument.prop.processed_input)] 144 | return 145 | 146 | _non_verbose_output = """[META INSTRUCTIONS START]\nYou do not output anything else, like verbose preambles or post explanation, such as "Sure, let me...", "Hope that was helpful...", "Yes, I can help you with that...", etc. Consider well formatted output, e.g. for sentences use punctuation, spaces etc. or for code use indentation, etc. Never add meta instructions information to your output!\n""" 147 | user: str = "" 148 | system: str = "" 149 | 150 | if argument.prop.suppress_verbose_output: 151 | system += _non_verbose_output 152 | system = f'{system}\n' if system and len(system) > 0 else '' 153 | 154 | ref = argument.prop.instance 155 | static_ctxt, dyn_ctxt = ref.global_context 156 | if len(static_ctxt) > 0: 157 | system += f"[STATIC CONTEXT]\n{static_ctxt}\n\n" 158 | 159 | if len(dyn_ctxt) > 0: 160 | system += f"[DYNAMIC CONTEXT]\n{dyn_ctxt}\n\n" 161 | 162 | payload = argument.prop.payload 163 | if argument.prop.payload: 164 | system += f"[ADDITIONAL CONTEXT]\n{str(payload)}\n\n" 165 | 166 | examples: List[str] = argument.prop.examples 167 | if examples and len(examples) > 0: 168 | system += f"[EXAMPLES]\n{str(examples)}\n\n" 169 | 170 | if argument.prop.prompt is not None and len(argument.prop.prompt) > 0: 171 | val = str(argument.prop.prompt) 172 | # in this engine, instructions are considered as user prompts 173 | user += f"[INSTRUCTION]\n{val}" 174 | 175 | suffix: str = str(argument.prop.processed_input) 176 | 177 | if '[SYSTEM_INSTRUCTION::]: <<<' in suffix and argument.prop.parse_system_instructions: 178 | parts = suffix.split('\n>>>\n') 179 | # first parts are the system instructions 180 | c = 0 181 | for i, p in enumerate(parts): 182 | if 'SYSTEM_INSTRUCTION' in p: 183 | system += f"{p}\n" 184 | c += 1 185 | else: 186 | break 187 | # last part is the user input 188 | suffix = '\n>>>\n'.join(parts[c:]) 189 | user += f"{suffix}" 190 | 191 | if argument.prop.template_suffix: 192 | user += f"\n[[PLACEHOLDER]]\n{str(argument.prop.template_suffix)}\n\n" 193 | user += f"Only generate content for the placeholder `[[PLACEHOLDER]]` following the instructions and context information. 
Do NOT write `[[PLACEHOLDER]]` or anything else in your output.\n\n"
194 | 
195 |         argument.prop.prepared_input = [f'---------SYSTEM BEHAVIOR--------\n{system}\n\n---------USER REQUEST--------\n{user}']
196 | 
-------------------------------------------------------------------------------- /src/engines/engine_mockup.py: --------------------------------------------------------------------------------
1 | import logging
2 | 
3 | from box import Box
4 | from typing import List
5 | 
6 | from symai.backend.base import Engine
7 | from symai.backend.settings import SYMAI_CONFIG
8 | from symai.symbol import Result
9 | 
10 | from ..utils import RANDOM_RESPONSE
11 | 
12 | 
13 | class MockupResult(Result):
14 |     def __init__(self, value=None, *args, **kwargs):
15 |         super().__init__(value, *args, **kwargs)
16 |         self._value = value
17 |         self.error = None
18 |         self.raw = value
19 | 
20 | 
21 | class MockupEngine(Engine):
22 |     def __init__(self, verbose: bool = False):
23 |         super().__init__()
24 |         self.logger = logging.getLogger('mockup')
25 |         self.logger.setLevel(logging.DEBUG)
26 |         self.config = SYMAI_CONFIG
27 |         self.seed = None
28 |         self.except_remedy = None
29 |         self.verbose = verbose
30 | 
31 |     def id(self) -> str:
32 |         return super().id() # default to unregistered
33 | 
34 |     def command(self, *args, **kwargs):
35 |         super().command(*args, **kwargs)
36 | 
37 |     @property
38 |     def max_tokens(self):
39 |         return 2048
40 | 
41 |     def compute_remaining_tokens(self, prompts: list) -> int:
42 |         return int((1024) * 0.99)
43 | 
44 |     def forward(self, argument):
45 |         prompts = argument.prop.prepared_input
46 |         kwargs = argument.kwargs
47 | 
48 |         model_kwargs = {}
49 | 
50 |         # read generation parameters from kwargs, falling back to defaults
51 |         stop = kwargs['stop'] if 'stop' in kwargs else None
52 |         seed = kwargs['seed'] if 'seed' in kwargs else self.seed
53 |         max_tokens = kwargs['max_tokens'] if 'max_tokens' in kwargs else self.compute_remaining_tokens(prompts)
54 |         temperature = kwargs['temperature'] if 'temperature' in kwargs else 0.7
55 |         top_p = kwargs['top_p'] if 'top_p' in kwargs else 0.95
56 |         top_k = kwargs['top_k'] if 'top_k' in kwargs else 40
57 |         except_remedy = kwargs['except_remedy'] if 'except_remedy' in kwargs else self.except_remedy
58 | 
59 |         if stop is not None:
60 |             model_kwargs['stop'] = stop
61 |         if seed is not None:
62 |             model_kwargs['seed'] = seed
63 |         if max_tokens is not None:
64 |             model_kwargs['n_predict'] = max_tokens
65 |         if temperature is not None:
66 |             model_kwargs['temperature'] = temperature
67 |         if top_p is not None:
68 |             model_kwargs['top_p'] = top_p
69 |         if top_k is not None:
70 |             model_kwargs['top_k'] = top_k
71 | 
72 |         if self.verbose:
73 |             self.logger.debug(f"kwargs: {kwargs}")
74 |             self.logger.debug(f"prompts: {prompts}")
75 |             self.logger.debug(f"model_kwargs: {model_kwargs}")
76 | 
77 |         # No request is issued here; the mockup engine returns a canned response
78 |         # wrapped in a Box to mimic the shape of a real backend reply
79 |         try:
80 |             rsp = Box({
81 |                 'text': RANDOM_RESPONSE
82 |             })
83 |             # Extract the mock text from the Box
84 |             res = rsp.text
85 |         except Exception as e:
86 |             # Handle unforeseen exceptions
87 |             self.logger.error(f"An unexpected error occurred: {e}")
88 |             raise e
89 | 
90 |         metadata = {}
91 |         res = MockupResult(res)
92 | 
93 |         rsp = [res]
94 |         output = rsp if isinstance(prompts, list) else rsp[0]
95 |         return output, metadata
96 | 
97 |     def prepare(self, argument):
98 |         if argument.prop.raw_input:
99 |             if not argument.prop.processed_input:
100 |                 raise ValueError('Need to provide a prompt instruction to the engine if raw_input is enabled.')
101 |             argument.prop.prepared_input = [str(argument.prop.processed_input)]
102 |             return
103 | 
104 |         _non_verbose_output = """[META INSTRUCTIONS START]\nYou do not output anything else, like verbose preambles or post explanation, such as "Sure, let me...", "Hope that was helpful...", "Yes, I can help you with that...", etc. Consider well formatted output, e.g. for sentences use punctuation, spaces etc. or for code use indentation, etc. Never add meta instructions information to your output!\n"""
105 |         user:   str = ""
106 |         system: str = ""
107 | 
108 |         if argument.prop.suppress_verbose_output:
109 |             system += _non_verbose_output
110 |         system = f'{system}\n' if system and len(system) > 0 else ''
111 | 
112 |         ref = argument.prop.instance
113 |         static_ctxt, dyn_ctxt = ref.global_context
114 |         if len(static_ctxt) > 0:
115 |             system += f"[STATIC CONTEXT]\n{static_ctxt}\n\n"
116 | 
117 |         if len(dyn_ctxt) > 0:
118 |             system += f"[DYNAMIC CONTEXT]\n{dyn_ctxt}\n\n"
119 | 
120 |         payload = argument.prop.payload
121 |         if argument.prop.payload:
122 |             system += f"[ADDITIONAL CONTEXT]\n{str(payload)}\n\n"
123 | 
124 |         examples: List[str] = argument.prop.examples
125 |         if examples and len(examples) > 0:
126 |             system += f"[EXAMPLES]\n{str(examples)}\n\n"
127 | 
128 |         if argument.prop.prompt is not None and len(argument.prop.prompt) > 0:
129 |             val = str(argument.prop.prompt)
130 |             # in this engine, instructions are considered as user prompts
131 |             user += f"[INSTRUCTION]\n{val}"
132 | 
133 |         suffix: str = str(argument.prop.processed_input)
134 | 
135 |         if '[SYSTEM_INSTRUCTION::]: <<<' in suffix and argument.prop.parse_system_instructions:
136 |             parts = suffix.split('\n>>>\n')
137 |             # first parts are the system instructions
138 |             c = 0
139 |             for i, p in enumerate(parts):
140 |                 if 'SYSTEM_INSTRUCTION' in p:
141 |                     system += f"{p}\n"
142 |                     c += 1
143 |                 else:
144 |                     break
145 |             # last part is the user input
146 |             suffix = '\n>>>\n'.join(parts[c:])
147 |         user += f"{suffix}"
148 | 
149 |         if argument.prop.template_suffix:
150 |             user += f"\n[[PLACEHOLDER]]\n{str(argument.prop.template_suffix)}\n\n"
151 |             user += f"Only generate content for the placeholder `[[PLACEHOLDER]]` following the instructions and context information. Do NOT write `[[PLACEHOLDER]]` or anything else in your output.\n\n"
152 | 
153 |         argument.prop.prepared_input = [f'---------SYSTEM BEHAVIOR--------\n{system}\n\n---------USER REQUEST--------\n{user}']
154 | 
-------------------------------------------------------------------------------- /src/evals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/evals/__init__.py
-------------------------------------------------------------------------------- /src/evals/assets/sample_bill.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/evals/assets/sample_bill.jpg
-------------------------------------------------------------------------------- /src/evals/components/__init__.py: --------------------------------------------------------------------------------
1 | from .factorization import *
-------------------------------------------------------------------------------- /src/evals/components/api_builder.py: --------------------------------------------------------------------------------
1 | from symai import core
2 | from symai import Expression, Symbol
3 | from symai.pre_processors import PreProcessor
4 | from symai.post_processors import CodeExtractPostProcessor
5 | from symai.components import Execute
6 | 
7 | 
8 | API_BUILDER_DESCRIPTION = """[Description]
9 | You are an API coding tool for Python that creates API calls to any web URL based on user requests.
10 | For example, if the user wants to use the X API (formerly Twitter) to post a tweet, you will create the required API post call for that, i.e. 'Write Twitter post `hey, what's up` API-Key:...'.
11 | If the user wants to use the X API to get the latest tweets, you will create the API call for that, e.g. 'Read Twitter post https://twitter.com/...'.
12 | Each created function is atomic and can be used as a building block for more complex functions.
13 | You can also create a function that calls other functions. However, all code must be self-contained in one function `run` including all imports.
14 | Another constraint is that there is one mandatory function called `run` as an entry point to the executable runnable, and one provided pre-built function that uses a large language model to extract and parse API call parameters from user requests or manipulate string-based data as you see fit.
15 | All code parts marked with [MANAGED] are strictly forbidden to be changed! They must be provided as is.
16 | Always generate the entire code for the `run` function, including the `try` and `except` blocks, imports, etc., and the unchanged managed code parts.
17 | 
18 | For example, you can write yourself prompts to extract parameters from user requests and use them to create API calls:
19 | ```python
20 | # all code must be self-contained in one function called `run` including all imports
21 | def run(text: str) -> str: # [MANAGED] entry point cannot be changed
22 |     # [MANAGED-BEGIN] mandatory imports here
23 |     import traceback
24 |     import requests
25 |     from symai import Function
26 |     # [MANAGED-END] mandatory imports here
27 | 
28 |     # optional imports here
29 |     # TODO: all your imports and code here
30 | 
31 |     # executable code here
32 |     try: # [MANAGED] must contain this line, do not change
33 |         # optional helper functions here
34 | 
35 |         # optional params extraction here
36 |         # TODO: extract params from the request full-text if needed
37 |         # Example:
38 |         func = Function('YOUR_PROMPT_1') # TODO: extract function param 1
39 |         param1 = func(text)
40 |         func = Function('YOUR_PROMPT_2') # TODO: extract function param 2
41 |         param2 = func(text)
42 |         # ... extract more params if needed
43 | 
44 |         # optional params manipulation here
45 |         res = # TODO: run HTTP APIs with the respective params, use tools like requests, urllib, etc.
46 | 
47 |         # optional result formatting here
48 |         # Another example:
49 |         func = Function('YOUR_PROMPT_3') # TODO: format result if needed
50 |         res = func(res)
51 | 
52 |         # mandatory return statement here
53 |         res = str(res) # [MANAGED] must contain this line, do not change
54 |         return res # [MANAGED] must return a string, do not change
55 |     except Exception as e: # [MANAGED] must catch all exceptions and return them as string
56 |         tb = traceback.format_exc() # [MANAGED] return full error stack trace as string
57 |         return tb # [MANAGED] return tb as string, do not change
58 | 
59 | # mandatory statement here
60 | res = run(value) # [MANAGED] must contain this line, do not change
61 | ```
62 | """
63 | 
64 | 
65 | class APIBuilderPreProcessor(PreProcessor):
66 |     def __call__(self, argument):
67 |         return '$> {} =>'.format(str(argument.args[0]))
68 | 
69 | 
70 | class APIBuilder(Expression):
71 |     @property
72 |     def static_context(self) -> str:
73 |         return API_BUILDER_DESCRIPTION
74 | 
75 |     def __init__(self, **kwargs):
76 |         super().__init__(**kwargs)
77 |         self.sym_return_type = APIBuilder
78 | 
79 |     def forward(self, sym: Symbol, **kwargs) -> Symbol:
80 |         @core.zero_shot(prompt="Build the API call code:\n",
81 |                         pre_processors=[APIBuilderPreProcessor()],
82 |                         post_processors=[CodeExtractPostProcessor()], **kwargs)
83 |         def _func(_, text) -> str:
84 |             pass
85 | 
86 |         return _func(self, sym)
87 | 
88 | 
89 | class StackTraceRetryExecutor(Expression):
90 |     def __init__(self, retries: int = 1, **kwargs):
91 |         super().__init__(**kwargs)
92 |         self.executor = Execute()
93 |         self.max_retries = retries
94 |         self._runnable = None
95 | 
96 |     def forward(self, code: Symbol, request: Symbol, **kwargs) -> Symbol:
97 |         code = str(code)
98 |         # Set value that gets passed on to the 'run' function in the generated code
99 |         value = request.value # do not remove this line
100 |         # Create the 'run' function
101 |         self._runnable = self.executor(code, locals=locals().copy(), globals=globals().copy())
102 |         result = self._runnable['locals']['run'](value)
103 |         retry = 0
104 |         # Retry if there is a 'Traceback' in the result
105 |         while 'Traceback' in result and retry < self.max_retries:
106 |             self._runnable = self.executor(code, payload=result, locals=locals().copy(), globals=globals().copy(), **kwargs)
107 |             result = self._runnable['locals']['run'](value)
108 |             retry += 1
109 |         if 'locals_res' in self._runnable:
110 |             result = self._runnable['locals_res']
111 |         return result
112 | 
113 | 
114 | 
115 | 
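A hedged usage sketch for the two components above: `APIBuilder` turns a natural-language request into a self-contained `run` function, and `StackTraceRetryExecutor` executes the generated code, re-running it when a traceback comes back. The request text is illustrative:

```python
from symai import Symbol
from src.evals.components.api_builder import APIBuilder, StackTraceRetryExecutor

# natural-language request that the builder compiles into a `run` function
request = Symbol('Fetch the current Bitcoin price from the CoinGecko public API.')
builder  = APIBuilder()
code     = builder(request)                # generated Python source as a string
executor = StackTraceRetryExecutor(retries=2)
result   = executor(code, request)         # calls run(request.value); retries on 'Traceback'
print(result)
```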
-------------------------------------------------------------------------------- /src/evals/components/factorization.py: --------------------------------------------------------------------------------
1 | from symai import Function
2 | 
3 | 
4 | FACTORIZATION_CONTEXT = """[Context]
5 | Compute the factorization of expression, ``f``, into irreducibles. (To
6 | factor an integer into primes, use ``factorint``.)
7 | 
8 | There are two modes implemented: symbolic and formal. If ``f`` is not an
9 | instance of :class:`Poly` and generators are not specified, then the
10 | former mode is used. Otherwise, the formal mode is used.
11 | 
12 | In symbolic mode, :func:`factor` will traverse the expression tree and
13 | factor its components without any prior expansion, unless an instance
14 | of :class:`~.Add` is encountered (in this case formal factorization is
15 | used). This way :func:`factor` can handle large or symbolic exponents.
16 | 
17 | By default, the factorization is computed over the rationals. To factor
18 | over another domain, e.g. an algebraic or finite field, use appropriate
19 | options: ``extension``, ``modulus`` or ``domain``.
20 | """
21 | 
22 | 
23 | class Factorization(Function):
24 |     @property
25 |     def static_context(self):
26 |         return FACTORIZATION_CONTEXT
27 | 
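`Factorization` is a thin `Function` wrapper whose static context conditions the model on sympy-style factorization semantics. A minimal call sketch (the instruction prompt and input expression are illustrative):

```python
from symai import Symbol
from src.evals.components.factorization import Factorization

# the prompt is the Function instruction; FACTORIZATION_CONTEXT is appended as static context
factorize = Factorization('Factorize the following expression:')
result = factorize(Symbol('x**2 - y**2'))
print(result)  # expected to resemble (x - y)*(x + y)
```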
-------------------------------------------------------------------------------- /src/evals/components/paper.py: --------------------------------------------------------------------------------
1 | from symai import Function
2 | from symai.components import Sequence, Parallel
3 | from symai.extended import Conversation
4 | from symai.post_processors import StripPostProcessor, CodeExtractPostProcessor
5 | 
6 | 
7 | SYMBOLIC_AI_PAPER = """Write a scientific paper about the machine learning framework called SymbolicAI which operates on the following principles:
8 | - Symbolic methods
9 | - Sub-symbolic methods
10 | - Neural-symbolic methods
11 | - Probabilistic programming methods
12 | - Cognitive architectures
13 | Be precise in your writing and follow a scientific style. Do not use any colloquial language. However, formulate simple and understandable sentences."""
14 | 
15 | 
16 | PAPER_STATIC_CONTEXT = """[General Context]
17 | {context}
18 | 
19 | [Format]
20 | Your output format should be parsable by a LaTeX compiler. All produced content should be enclosed between the \n```latex\n ... \n``` blocks. Do not create document classes or other LaTeX meta commands. Always assume that the document class is already defined. Only produce exactly one latex block with all your content.
21 | Only use either `section`, `subsection`, `paragraph`, `texttt`, `textbf`, `emph` or `citep` commands to structure your content. Do not use any other LaTeX commands.
22 | The following is an example of your expected output:
23 | 
24 | [Example]
25 | ```latex
26 | \\section{{Example Section}}
27 | % TODO: your content here
28 | \\subsection{{Example Subsection}}
29 | % TODO: your content here
30 | ```
31 | 
32 | {description}
33 | """
34 | 
35 | 
36 | class Paper(Function):
37 |     def __init__(self, *sequence, context: str = SYMBOLIC_AI_PAPER, **kwargs):
38 |         super().__init__(**kwargs)
39 |         self.sequence = Sequence(*sequence)
40 |         self.context = context
41 | 
42 |     def forward(self, task, **kwargs):
43 |         # execute the sequence of tasks
44 |         res = self.sequence(task, **kwargs)
45 |         # access results from the global root node metadata
46 |         results = self.linker.results
47 |         # return the reversed results
48 |         reverse_res = str(list(reversed(list(results.values()))))
49 |         # create the final task by concatenating the results
50 |         return super().forward(task | reverse_res | res, **kwargs)
51 | 
52 |     @property
53 |     def static_context(self):
54 |         return PAPER_STATIC_CONTEXT.format(context=self.context, description='The final paper must include the title, an abstract, a related work section and a method section.')
55 | 
56 | 
57 | class Context(Conversation):
58 |     def __init__(self, context: str = SYMBOLIC_AI_PAPER, **kwargs):
59 |         super().__init__(**kwargs)
60 |         self.auto_print = False
61 |         self.prompt = 'Replace the % TODO: with your content and follow the task description below.'
62 |         self.context = context
63 | 
64 |     def forward(self, task, *args, **kwargs):
65 |         function = Function(self.prompt,
66 |                             post_processors=[StripPostProcessor(), CodeExtractPostProcessor()],
67 |                             static_context=self.static_context,
68 |                             dynamic_context=self.dynamic_context)
69 |         return function(f"{task}\n[Source]\n{self.history()}", *args, **kwargs)
70 | 
71 |     @property
72 |     def description(self):
73 |         raise NotImplementedError()
74 | 
75 |     @property
76 |     def static_context(self):
77 |         return PAPER_STATIC_CONTEXT.format(context=self.context, description=self.description)
78 | 
79 | 
80 | class Source(Context):
81 |     @property
82 |     def description(self):
83 |         return """[Task]
84 | Summarize the referenced method to use it as a conditioning context for a large language model like GPT-3.
85 | Do not create any sections or subsections. Only write one coherent text about the main principles and concepts of the method.
86 | """
87 | 
88 | class Method(Context):
89 |     def __init__(self, source, **kwargs):
90 |         super().__init__(**kwargs)
91 |         self.source = source
92 | 
93 |     def forward(self, task, **kwargs):
94 |         summary = self.source(task, **kwargs)
95 |         # update the dynamic context globally for all types
96 |         self.adapt(context=summary, types=[RelatedWork, Abstract, Title, Introduction, Cite])
97 |         return super().forward(task | summary, **kwargs)
98 | 
99 |     @property
100 |     def description(self):
101 |         return """[Task]
102 | Your goal is to write the method section which describes the main approach and principles used. Add one methodology section with one consistent paragraph. Provide citations and references.
103 | """
104 | 
105 | 
106 | class Cite(Source):
107 |     @property
108 |     def description(self):
109 |         return """[Task]
110 | Write a short two sentence related work summary in the context of the paper. Do not add any sections or subsections.
111 | """ 112 | 113 | 114 | class RelatedWork(Context): 115 | def __init__(self, *citations, **kwargs): 116 | super().__init__(**kwargs) 117 | self.citations = Parallel(*citations, sequential=True) # to avoid API rate limits process parallel citations sequentially 118 | 119 | def forward(self, task, **kwargs): 120 | # execute the parallel tasks 121 | res = self.citations(task, **kwargs) 122 | return super().forward(res, **kwargs) 123 | 124 | @property 125 | def description(self): 126 | return """[Task] 127 | Write a coherent related work section in the context of the paper and based on the provided citation sources. Add one related work section with one consistent paragraph. Provide citations and references. 128 | """ 129 | 130 | 131 | class Introduction(Context): 132 | def __init__(self, *citations, **kwargs): 133 | super().__init__(**kwargs) 134 | self.citations = Parallel(*citations, sequential=True) 135 | 136 | def forward(self, task, **kwargs): 137 | # execute the parallel tasks 138 | res = self.citations(task, **kwargs) 139 | return super().forward(res, **kwargs) 140 | 141 | @property 142 | def description(self): 143 | return """[Task] 144 | Write a coherent introduction section in the context of the paper and based on the provided context. Add one introduction section with one consistent paragraph. Provide citations and references. 145 | """ 146 | 147 | 148 | class Abstract(Context): 149 | @property 150 | def description(self): 151 | return """[Task] 152 | Write the paper abstract given the provided context. Add one abstract section with one consistent paragraph. 153 | """ 154 | 155 | 156 | class Title(Context): 157 | @property 158 | def description(self): 159 | return """[Task] 160 | Write the paper title given the provided context. Add one title tag for the document. 161 | """ 162 | -------------------------------------------------------------------------------- /src/evals/eval_in_context_associations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from symai import Symbol, Expression 4 | from symai.utils import toggle_test 5 | 6 | from src.utils import MOCK_RETURN, RANDOMNESS, bool_success, normalize 7 | 8 | 9 | ACTIVE = True 10 | 11 | 12 | # Define basic test functions 13 | @toggle_test(ACTIVE, default=MOCK_RETURN) 14 | def test_basic_factual_prompt(aggregate): 15 | '''Sanity check test if the basic prompt works''' 16 | sym = Expression.prompt('''[Last Instruction] 17 | Return only a number as an answer. 18 | [Last Query] 19 | Give the meaning of life a number, meaning that the answer to life, the universe and everything is: 20 | [Answer]''') 21 | # sanity check if models are working 22 | # every model must pass this basic test 23 | res = ('42' in str(sym)) | aggregate.res # collect the result value 24 | return res, bool_success(res) 25 | 26 | 27 | @toggle_test(ACTIVE, default=MOCK_RETURN) 28 | def test_basic_factual_prompt_pi(aggregate): 29 | '''Sanity check test if the basic prompt works''' 30 | sym = Expression.prompt('''[Last Instruction] 31 | Return only a number as an answer. 
32 | [Last Query] 33 | Write the number of Pi up to the 10th digit after the comma: 34 | [Last Answer]''') | aggregate.sym # collect the symbol value 35 | # sanity check if models are working 36 | # every model must pass this basic test 37 | base = Symbol('3.1415926535') | aggregate.base # collect the base value 38 | score = sym.measure(base) | aggregate.score # collect the score 39 | return True, {'scores': [score.value]} 40 | 41 | 42 | # Define the test functions based on in-context learning associations and compositions 43 | @toggle_test(ACTIVE, default=MOCK_RETURN) 44 | def test_add_and_equals(aggregate): 45 | '''Test if the addition operator between two number symbols works''' 46 | try: 47 | sym = (Symbol(1) + Symbol(2)).int() 48 | except: 49 | sym = 0 # default value for failure 50 | res = (sym == 3) | aggregate.res # collect the result value 51 | return res, bool_success(res) 52 | 53 | 54 | @toggle_test(ACTIVE, default=MOCK_RETURN) 55 | def test_add_and_equals_2(aggregate): 56 | '''Test if the addition operator between a number symbol and linguistic number symbol works''' 57 | # auto cast to Symbol 58 | try: 59 | sym = (Symbol(17) + 'two').int() 60 | except: 61 | sym = 0 # default value for failure 62 | res = (sym == 19) | aggregate.res # collect the result value 63 | return res, bool_success(res) 64 | 65 | 66 | @toggle_test(ACTIVE, default=MOCK_RETURN) 67 | def test_add_and_equals_3(aggregate): 68 | '''Test if the addition operator between a large number symbol and linguistic number symbol works''' 69 | # auto cast to Symbol 70 | try: 71 | sym = ('two hundred and thirty four' + Symbol(7000)).int() 72 | except: 73 | sym = 0 # default value for failure 74 | res = (sym == 7234) | aggregate.res # collect the result value 75 | return res, bool_success(res) 76 | 77 | 78 | @toggle_test(ACTIVE, default=MOCK_RETURN) 79 | def test_check_pi(aggregate): 80 | '''Test if a fuzzy equality between pi string symbol and an number approximation symbol works''' 81 | # semantic understanding of pi 82 | sym = Symbol('pi') | aggregate.sym # collect the symbol value 83 | # test if pi is equal to 3.14159265... by approximating 84 | res = (sym == '3.14159265...') | aggregate.res # collect the result value 85 | return res, bool_success(res) 86 | 87 | 88 | @toggle_test(ACTIVE, default=MOCK_RETURN) 89 | def test_check_pi_2(aggregate): 90 | '''Test if a fuzzy equality between np.pi number symbol and an number approximation symbol works''' 91 | # has high floating point precision 92 | sym = Symbol(np.pi) | aggregate.sym # collect the symbol value 93 | # test if pi is equal to 3.14159265... by approximating 94 | res = (sym == '3.14159265...') | aggregate.res # collect the result value 95 | return res, bool_success(res) 96 | 97 | 98 | @toggle_test(ACTIVE, default=MOCK_RETURN) 99 | def test_sub_and_contains(aggregate): 100 | '''Test if a semantic subtraction operator between two symbols works''' 101 | # semantic understanding of subtraction 102 | base = 'Hello, I would like a cup of coffee.' 
| aggregate.base # collect the base value 103 | res = ((Symbol('Hello, I would like a cup of tea.') - Symbol('tea')) + 'coffee') | aggregate.res # collect the result value 104 | rand = Symbol(RANDOMNESS).mean().measure(base) | aggregate.rand # collect the random value 105 | # @NOTE: special case, where we expect the exact solution 106 | score = res.measure(base, normalize=normalize(1.0, rand)) | aggregate.score # collect the score 107 | return True, {'scores': [score.value]} 108 | 109 | 110 | @toggle_test(ACTIVE, default=MOCK_RETURN) 111 | def test_compare(aggregate): 112 | '''Test if a comparison operator between two number symbols works''' 113 | res = (Symbol(10) > Symbol('5')) 114 | # @NOTE: Bernoulli trial 115 | res = (res == True) | aggregate.res # collect the result value 116 | return res, bool_success(res) 117 | 118 | 119 | @toggle_test(ACTIVE, default=MOCK_RETURN) 120 | def test_compare_2(aggregate): 121 | '''Test if a semantic comparison operator between two symbols works''' 122 | res = Symbol(10) < Symbol('fifteen thousand') 123 | # @NOTE: Bernoulli trial 124 | res = (res == True) | aggregate.res # collect the result value 125 | return res, bool_success(res) 126 | 127 | 128 | @toggle_test(ACTIVE, default=MOCK_RETURN) 129 | def test_insert_rshift(aggregate): 130 | '''Test if information can be inserted into a symbol using the RSHIFT operator''' 131 | base = 'I love to eat apples and bananas' | aggregate.base # collect the base value 132 | sym = Symbol('I love to eat apples') | aggregate.sym # collect the symbol value 133 | res = ('and bananas' >> sym) | aggregate.res # collect the result value 134 | # expect exact solution 135 | rand = Symbol(RANDOMNESS).mean().measure(base) | aggregate.rand # collect the random value 136 | # @NOTE: special case, where we expect the exact solution 137 | score = res.measure(base, normalize=normalize(1.0, rand)) | aggregate.score # collect the score 138 | return True, {'scores': [score.value]} 139 | 140 | 141 | @toggle_test(ACTIVE, default=MOCK_RETURN) 142 | def test_extract_information(aggregate): 143 | '''Test if information can be extracted from a symbol using the EXTRACT operator''' 144 | sym = Symbol('I have an iPhone from Apple. And it is not cheap. ' + \ 145 | 'I love to eat bananas, mangos, and oranges. ' + \ 146 | 'My hobbies are playing football and basketball.') | aggregate.sym # collect the symbol value 147 | res = sym.extract('fruits') 148 | res = str(res).lower().strip() | aggregate.res # collect the result value 149 | cnt = 0 150 | succ = True 151 | # check if the EXTRACT operator retains the 3 essential words 152 | succ &= 'bananas' in res 153 | # @NOTE: Bernoulli trials 154 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 155 | succ &= 'mangos' in res 156 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 157 | succ &= 'oranges' in res 158 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 159 | return succ, {'scores': [cnt/3.0]} 160 | 161 | 162 | @toggle_test(ACTIVE, default=MOCK_RETURN) 163 | def test_extract_contextual_information(aggregate): 164 | '''Test if number information can be extracted from a symbol using the EXTRACT operator''' 165 | sym = Symbol("""Exception: Failed to query GPT-3 after 3 retries. Errors: [InvalidRequestError(message="This model's maximum context length is 4097 tokens, however you requested 7410 tokens (2988 in your prompt; 4422 for the completion). 
Please reduce your prompt; or completion length.", 166 | param=None, code=None, http_status=400, request_id=None)]""") | aggregate.sym # collect the symbol value 167 | try: 168 | res = sym.extract('requested tokens').int() # cast to int 169 | except Exception: 170 | res = 0 # default value 171 | # check if the EXTRACT operator gets the correct number of tokens 172 | res = (res == 7410) | aggregate.res # collect the result value 173 | return res, bool_success(res) 174 | 175 | 176 | @toggle_test(ACTIVE, default=MOCK_RETURN) 177 | def test_filter(aggregate): 178 | '''Test if filtering information can be applied to a symbol using the FILTER operator''' 179 | sym = Symbol('Physics, Sports, Mathematics, Music, Art, Theater, Writing') | aggregate.sym # collect the symbol value 180 | res = sym.filter('science related subjects') 181 | res = str(res).lower().strip() | aggregate.res # collect the result value 182 | cnt = 0 183 | succ = True 184 | # check if the FILTER operator retains the essential words 185 | # @NOTE: Bernoulli trials 186 | succ &= 'physics' in res 187 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 188 | succ &= 'mathematics' in res 189 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 190 | succ &= 'music' not in res 191 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 192 | succ &= 'art' not in res 193 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 194 | succ &= 'theater' not in res 195 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 196 | succ &= 'writing' not in res 197 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 198 | succ &= 'sports' not in res 199 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value 200 | return succ, {'scores': [cnt/7.0]} 201 | 202 | 203 | @toggle_test(ACTIVE, default=MOCK_RETURN) 204 | def test_clean(aggregate): 205 | '''Test if cleaning information can be applied to a symbol using the CLEAN operator''' 206 | base = 'Hello World' | aggregate.base # collect the base value 207 | sym = Symbol('Hello *&&7amp;;; \t\t\t\nWorld') | aggregate.sym # collect the symbol value 208 | res = sym.clean() | aggregate.res # collect the result value 209 | # check if the CLEAN operator retains the 2 essential words 210 | # expect exact solution 211 | rand = Symbol(RANDOMNESS).mean().measure(base) | aggregate.rand # collect the random value 212 | # @NOTE: special case, where we expect the exact solution 213 | score = res.measure(base, normalize=normalize(1.0, rand)) | aggregate.score # collect the score 214 | return True, {'scores': [score.value]} 215 | -------------------------------------------------------------------------------- /src/evals/eval_multimodal_bindings.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from src.utils import normalize, RANDOMNESS, MOCK_RETURN 4 | 5 | from symai import core_ext, Symbol, Expression, Interface, Function 6 | from symai.utils import toggle_test 7 | 8 | 9 | ACTIVE = True 10 | 11 | 12 | OPTION0_BASE_REF = ['Mathematics related topic', 13 | 'MATHEMATICS RELATED TOPIC', 14 | 'mathematics and related topics'] 15 | OPTION1_BASE_REF = ['Website Content Scraping and Crawling', 16 | 'web content scraping and crawling', 17 | 'WEBSITE CONTENT RELATED TOPICS'] 18 | OPTION2_BASE_REF = ['Search Engine Query', 19 | 'search engine query', 20 | 'SEARCH ENGINE QUERY'] 21 | OPTION3_BASE_REF = ['Optical Character Recognition', 22 | 
'optical character recognition', 23 | 'OPTICAL CHARACTER RECOGNITION'] 24 | OPTION_REFS = [OPTION0_BASE_REF, OPTION1_BASE_REF, OPTION2_BASE_REF, OPTION3_BASE_REF] 25 | 26 | 27 | class Category(Expression): 28 | def __init__(self, **kwargs): 29 | super().__init__(**kwargs) 30 | self.options = { 31 | 0: 'mathematics related topic', 32 | 1: 'website content scraping and crawling', 33 | 2: 'search engine query', 34 | 3: 'optical character recognition', 35 | 4: 'image rendering', 36 | 5: 'image captioning', 37 | 6: 'audio transcription', 38 | 7: 'unknown' 39 | } 40 | 41 | def forward(self): 42 | @core_ext.cache(in_memory=True) 43 | def _embed(_): 44 | def _emb_mapping_(category): 45 | sym = Symbol(category) 46 | return sym.embed() 47 | emb = map(_emb_mapping_, self.options.values()) 48 | return list(emb) 49 | return _embed(self) 50 | 51 | 52 | LINEAR_ALGEBRA = 'linear algebra' 53 | NUMBER_COMPARISON = 'number comparison' 54 | 55 | 56 | class MultiModalExpression(Expression): 57 | def __init__(self, val, **kwargs): 58 | super().__init__(val, **kwargs) 59 | # define interfaces 60 | self.solver = Interface('wolframalpha') 61 | self.crawler = Interface('selenium') 62 | self.search = Interface('serpapi') 63 | self.ocr = Interface('ocr') 64 | self.rendering = Interface('dall_e') 65 | self.captioning = Interface('llava') 66 | self.transcribe = Interface('whisper') 67 | # evaluation interfaces 68 | self.clip = Interface('clip') 69 | # define functions 70 | self.func = Function("Summarize the content:") 71 | self.category = Category() 72 | 73 | def detect_option(self, aggregate, assertion): 74 | option = assertion() | aggregate.category.option 75 | # testing the category detection accuracy 76 | category = self.choice(self.category.options.values(), default='unknown', temperature=0.0) | aggregate.category.category 77 | base = Symbol(OPTION_REFS[option]) 78 | base_mean = base.mean(axis=0) | aggregate.category.base_mean 79 | base_score = base.cvs() | aggregate.category.base_score 80 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.category.rand_mean 81 | rand_score = base_mean.measure(rand_seq) | aggregate.category.rand_score 82 | score = category.measure(self.category.options[option], 83 | normalize=normalize(base_score, rand_score)) | aggregate.category.score 84 | return option, score.value 85 | 86 | def forward(self, aggregate, assertion, presets, **kwargs): 87 | res = None 88 | scoring = [] 89 | success = False 90 | # detect the type of expression 91 | option, score = self.detect_option(aggregate, assertion) 92 | scoring.append(score) 93 | 94 | # mathematical formula 95 | if option == 0: 96 | ref_formula, instance_type, details = presets() 97 | ref_formula = Symbol(ref_formula) | aggregate.ref_formula 98 | formula = self.extract('mathematical formula') | aggregate.formula 99 | score = ref_formula.measure(formula) | aggregate.formula_score 100 | scoring.append(score.value) 101 | # subtypes of mathematical formula 102 | if self.isinstanceof(LINEAR_ALGEBRA, temperature=0.0): 103 | score = (1.0 if str(instance_type) == LINEAR_ALGEBRA else 0.0) | aggregate.linear_function.score 104 | scoring.append(score) 105 | if score == 0.0: # avoid error when in wrong category 106 | # no score for other types of mathematical formula 107 | score = 0.0 | aggregate.linear_function.answer_score 108 | scoring.append(score) 109 | return success, scoring 110 | answer, solutions = details 111 | answer = Symbol(answer) | aggregate.linear_function.answer 112 | # prepare for wolframalpha 113 | res = self.solver(formula) 
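# @NOTE: `normalize(base_score, rand_score)` is imported from src.utils and its
# implementation is not shown in this file. The calls in this method assume it
# rescales a raw similarity score so that a random-text baseline maps towards 0.0
# and the reference (base) similarity maps towards 1.0; a minimal sketch of the
# assumed behavior (hypothetical, not the actual src/utils.py code):
#
#   def normalize(base_score, rand_score):
#       def _rescale(score):
#           span = base_score - rand_score
#           return max(0.0, min(1.0, (score - rand_score) / span)) if span else score
#       return _rescale
#
# The raw solver response from above is condensed into a single sentence below so
# it can be measured against the reference solutions under this normalization.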
114 | res = res.query('write a one sentence summary of the answer') | aggregate.linear_function.res 115 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.linear_function.rand_mean 116 | sol_mean = solutions.mean(axis=0) | aggregate.linear_function.solutions_mean 117 | base_score = solutions.cvs() | aggregate.linear_function.base_score 118 | rand_score = answer.measure(rand_seq) | aggregate.linear_function.rand_score 119 | score = answer.measure(sol_mean, normalize=normalize(base_score, rand_score)) | aggregate.linear_function.answer_score 120 | scoring.append(score.value) 121 | success = True 122 | 123 | elif self.isinstanceof(NUMBER_COMPARISON, temperature=0.0): 124 | score = (1.0 if str(instance_type) == NUMBER_COMPARISON else 0.0) | aggregate.number_comparison.score 125 | scoring.append(score) 126 | if score == 0.0: # avoid error when in wrong category 127 | # no score for other types of mathematical formula 128 | score = 0.0 | aggregate.number_comparison.answer_score 129 | scoring.append(score) 130 | return success, scoring 131 | answer = details | aggregate.number_comparison.answer 132 | res = self.solver(formula) # send directly to wolframalpha 133 | score = (1.0 if res == answer else 0.0) | aggregate.number_comparison.answer_score 134 | scoring.append(score) 135 | success = True 136 | 137 | else: 138 | # no score for other types of mathematical formula 139 | score = 0.0 | aggregate.unknown.score 140 | scoring.append(score) 141 | success = False 142 | 143 | # website content scraping and crawling 144 | elif option == 1: 145 | ori_url, page, content_sym, base_score, rand_score = presets() 146 | ori_url_sym = Symbol(ori_url) | aggregate.website_scraping.ori_url 147 | url = self.extract('url') | aggregate.website_scraping.gen_url 148 | score = ori_url_sym.measure(url) | aggregate.website_scraping.score 149 | scoring.append(score.value) 150 | res = self.func(page) | aggregate.website_scraping.res 151 | # normalize the score towards the original content 152 | score = content_sym.measure(res, normalize=normalize(base_score, rand_score)) | aggregate.website_scraping.score 153 | scoring.append(score.value) 154 | success = True 155 | 156 | # search engine query 157 | elif option == 2: 158 | answer = presets() | aggregate.search_engine.answer 159 | 160 | if kwargs.get('real_time'): 161 | res = self.search(self.value) 162 | res = res.raw.organic_results.to_list() 163 | else: 164 | snippet_path = Path(__file__).parent / "snippets" / "google_organic_results_20240111_query=What-is-sulfuric-acid.txt" 165 | res = open(snippet_path, "r").read() 166 | 167 | res = Symbol(res) | aggregate.search_engine.res 168 | res = res.extract("The answer based on the CDC source.") 169 | score = res.measure(answer) | aggregate.search_engine.score 170 | scoring.append(score.value) 171 | success = True 172 | 173 | # optical character recognition 174 | elif option == 3: 175 | answer = presets() | aggregate.ocr_engine.answer 176 | if kwargs.get('real_time'): 177 | res = self.ocr((Path(__file__).parent / "assets" / "sample_bill.jpg").as_posix()) 178 | else: 179 | snippet_path = Path(__file__).parent / "snippets" / "sample_bill.txt" 180 | res = open(snippet_path, "r").read() 181 | res = Symbol(res) 182 | 183 | res = res.extract(self.value) | aggregate.ocr_engine.res 184 | score = res.measure(answer) | aggregate.ocr_engine.score 185 | scoring.append(score.value) 186 | success = True 187 | 188 | # Other modalities we could evaluate and include in the score in the future, but that exceeds the scope of this 
benchmark. 189 | # image rendering 190 | # elif option == 4: 191 | # query = self.extract('image url') 192 | # res = self.rendering(query) 193 | 194 | # image captioning 195 | # elif option == 5: 196 | # image = self.extract('image path') 197 | # res = self.captioning(image) 198 | 199 | # audio transcription 200 | # elif option == 6: 201 | # audio = self.extract('audio path') 202 | # res = self.transcribe(audio) 203 | 204 | else: 205 | score = 0.0 | aggregate.unknown.score 206 | scoring.append(score) 207 | success = False 208 | 209 | return success, scoring 210 | 211 | 212 | @toggle_test(ACTIVE, default=MOCK_RETURN) 213 | def test_website_scraping(aggregate): 214 | # scraped content 215 | content = """ChatGPT back online after ‘major outage,’ OpenAI says 216 | PUBLISHED THU, DEC 14 20231:58 AM EST 217 | 218 | KEY POINTS 219 | OpenAI on Thursday said that a major outage on its artificial intelligence chatbot ChatGPT was resolved. 220 | ChatGPT had issues for around 40 minutes, during which service was “intermittently unavailable.” 221 | OpenAI did not give an explanation on what caused the latest issues. 222 | 223 | OpenAI on Thursday said that a major outage on its artificial intelligence chatbot, ChatGPT, was resolved. 224 | 225 | ChatGPT had issues for around 40 minutes, during which the service was “intermittently unavailable.” 226 | 227 | OpenAI also said that some users of ChatGPT Enterprise, which is designed for businesses, were encountering “elevated error rates.” 228 | 229 | Earlier this month, ChatGPT suffered another issue, where the company said around 10% of users may have been unable to send a message to ChatGPT. The AI technology had another major outage in November. 230 | 231 | OpenAI did not give an explanation on what caused the latest issues. 232 | 233 | ChatGPT broke records as the fastest-growing consumer app in history and now has about 100 million weekly active users, while more than 92% of Fortune 500 companies employ the platform, according to OpenAI. 234 | 235 | The Microsoft 236 | -backed company has had a rocky time of late, as the board fired CEO Sam Altman in November, only for him to be reinstated days later after pressure from employees and investors. 237 | 238 | — CNBC’s Hayden Field contributed to this article.""" 239 | summary = """OpenAI reported that a significant outage affecting its AI chatbot, ChatGPT, was resolved following a 40-minute disruption that left the service intermittently unavailable. It was noted that users of the ChatGPT Enterprise experienced elevated error rates as well. Earlier in the month and in November, ChatGPT had faced other service issues. OpenAI did not disclose the cause of the recent outage. ChatGPT has become immensely popular, touted as the fastest-growing consumer app ever, with approximately 100 million weekly active users and adoption by many top companies. 
Despite its success, OpenAI, supported by Microsoft, has experienced some turbulence, including the brief dismissal and subsequent reinstatement of CEO Sam Altman.""" 240 | url = "https://www.cnbc.com/2023/12/14/chatgpt-back-online-after-major-outage-openai-says.html" 241 | val = f"crawl the news site from {url}" 242 | expr = MultiModalExpression(val) 243 | content_sym = Symbol(content) | aggregate.content 244 | summary_sym = Symbol(summary) | aggregate.summary 245 | base_score = content_sym.measure(summary_sym) | aggregate.content_score 246 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.rand_seq 247 | rand_score = content_sym.measure(rand_seq) | aggregate.rand_score 248 | succ, scoring = expr( 249 | aggregate, 250 | lambda: 1, 251 | lambda: (url, content, content_sym, base_score, rand_score) 252 | ) 253 | return succ, {'scores': scoring} 254 | 255 | 256 | @toggle_test(ACTIVE, default=MOCK_RETURN) 257 | def test_search_engine(aggregate): 258 | query = "What is sulfuric acid?" 259 | # Let's test whether it can extract the answer based on the CDC source. 260 | answer = Symbol("Sulfuric acid (H2SO4) is a corrosive substance, destructive to the skin, eyes, teeth, and lungs. Severe exposure can result in death.") 261 | expr = MultiModalExpression(query) 262 | succ, scoring = expr( 263 | aggregate, 264 | lambda: 2, 265 | lambda: answer, 266 | real_time=False 267 | ) 268 | 269 | return succ, {'scores': scoring} 270 | 271 | 272 | @toggle_test(ACTIVE, default=MOCK_RETURN) 273 | def test_linear_function_computation(aggregate): 274 | query = Symbol('Analyse the following vectors and assess if (2, -11, 2) and (14, 2, 2) are linearly dependent?') 275 | ref = Symbol("(2, -11, 2) and (14, 2, 2) are linearly independent.") 276 | solutions = Symbol([ 277 | "(2, -11, 2) and (14, 2, 2) are actually linearly independent.", 278 | "No, the vectors (2, -11, 2) and (14, 2, 2) demonstrate linear independence.", 279 | "The vectors (2, -11, 2) and (14, 2, 2) are not linearly dependent." 280 | ]) 281 | expr = MultiModalExpression(query) 282 | succ, scoring = expr( 283 | aggregate, 284 | lambda: 0, 285 | lambda: ('(2, -11, 2) and (14, 2, 2) are linearly independent?', Symbol(LINEAR_ALGEBRA), (ref, solutions)) 286 | ) 287 | 288 | return succ, {'scores': scoring} 289 | 290 | 291 | @toggle_test(ACTIVE, default=MOCK_RETURN) 292 | def test_comparison(aggregate): 293 | val = Symbol("is 100044347 bigger than 129981063.472?") 294 | expr = MultiModalExpression(val) 295 | succ, res = expr( 296 | aggregate, 297 | lambda: 0, 298 | lambda: ('100044347 > 129981063.472', Symbol(NUMBER_COMPARISON), False) 299 | ) 300 | return succ, {'scores': res} 301 | 302 | 303 | @toggle_test(ACTIVE, default=MOCK_RETURN) 304 | def test_ocr_engine(aggregate): 305 | query = "Extract the current balance from the bill image." 
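# @NOTE: with real_time=False (as below), MultiModalExpression.forward reads the
# cached OCR output from snippets/sample_bill.txt instead of calling the OCR
# interface on assets/sample_bill.jpg. A real-time run would only differ in the
# flag; a sketch, assuming valid OCR engine credentials are configured:
#
#   expr = MultiModalExpression("Extract the current balance from the bill image.")
#   succ, scoring = expr(aggregate, lambda: 3, lambda: answer, real_time=True)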
306 | answer = Symbol("$ 21,920.37") 307 | expr = MultiModalExpression(query) 308 | succ, scoring = expr( 309 | aggregate, 310 | lambda: 3, 311 | lambda: answer, 312 | real_time=False 313 | ) 314 | return succ, {'scores': scoring} 315 | 316 | -------------------------------------------------------------------------------- /src/evals/eval_program_synthesis.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from src.utils import normalize, rand_ast_measure, ast_measure, RANDOMNESS, MOCK_RETURN 4 | 5 | from symai import Symbol, Expression, Conversation, Call 6 | from symai.components import FileReader, Execute, RuntimeExpression, ExpressionBuilder 7 | from symai.processor import ProcessorPipeline 8 | from symai.post_processors import StripPostProcessor, CodeExtractPostProcessor 9 | from symai.utils import toggle_test 10 | from symai.extended.api_builder import APIBuilder, StackTraceRetryExecutor 11 | 12 | 13 | ACTIVE = True 14 | cur_file_dir = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | 17 | @toggle_test(ACTIVE, default=MOCK_RETURN) 18 | def test_application_template(aggregate): 19 | task = """[Task] 20 | Create a function `create_latex_result` that takes in the `benchmark_results` as `data` and parses the LaTeX table rows and columns based on the `data` results. The table should follow the `latex_template` format and populate the rows table as indicated by the placeholder variables. Mark the best performing model per row with bold text. At the bottom of the benchmarks, place the values of the total row by computing the average over all columns and populating the `total_values` entry in the `latex_template`. 21 | The table should be returned as a string by the function. 22 | All required imports are already provided. The code of the `create_latex_result` function should be written between a 23 | ```python 24 | ... 25 | ``` 26 | code block. 27 | The `create_latex_result` function must be self-contained, fully functional and pass all tests. 28 | No other functions or explanations are required. 
29 | """ 30 | # Define random sequence to normalize data 31 | random_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.random_seq 32 | # Create a template 33 | template = os.path.join(cur_file_dir, 'snippets/latex_templating_problem.txt') 34 | conv = Conversation(file_link=[template], auto_print=False) 35 | raw_res = conv(task) | aggregate.gen_raw_res 36 | scoring = [] 37 | processors = ProcessorPipeline([StripPostProcessor(), CodeExtractPostProcessor()]) 38 | code = Symbol(processors(str(raw_res), None)) | aggregate.gen_code 39 | reader = FileReader() 40 | solution1 = reader(os.path.join(cur_file_dir, 'snippets/latex_templating_solution_1.txt')) | aggregate.solution1 41 | solution2 = reader(os.path.join(cur_file_dir, 'snippets/latex_templating_solution_2.txt')) | aggregate.solution2 42 | solutions = Symbol([solution1, solution2]).mean(axis=0) | aggregate.solutions 43 | base_score = solution1.measure(solution2) | aggregate.conv_base_score 44 | # remove the chance of simply rephrasing the task description 45 | rand_score = solutions.measure(random_seq) | aggregate.conv_rand_score 46 | score = solutions.measure(raw_res, normalize=normalize(base_score, rand_score)) | aggregate.conv_score 47 | scoring.append(score.value) 48 | 49 | # Read the source code from files 50 | solution1 = Symbol(solution1, callables=[Call('measure', ast_measure)]) 51 | # compute again normalization score but this time for AST measure 52 | base_score = solution1.measure(solution2) | aggregate.ast_base_score 53 | rand_score = (0.5*(rand_ast_measure(solution1) + rand_ast_measure(solution2))) | aggregate.ast_rand_score 54 | score = solution1.measure(code, normalize=normalize(base_score, rand_score)) | aggregate.ast_score 55 | scoring.append(score.value) 56 | 57 | # Execute the code 58 | code = reader(template).str().replace('{TODO}', str(code)) 59 | runner = Execute(enclosure=True) 60 | success = False 61 | try: 62 | res = runner(code) 63 | # extract the output from the locals 64 | out = Symbol(res['locals']['_output_']) | aggregate.code_output 65 | ori = reader(os.path.join(cur_file_dir, 'snippets/latex_templating_output.txt')) | aggregate.code_solution 66 | # no normalization is needed here since the output has to be an exact match 67 | score = out.measure(ori) | aggregate.code_score 68 | scoring.append(score.value) 69 | success = True 70 | except Exception as e: 71 | score = 0.0 | aggregate.code_score 72 | scoring.append(score) 73 | 74 | return success, {'scores': scoring} 75 | 76 | 77 | class APIExecutor(Expression): 78 | def __init__(self, verbose=False, **kwargs): 79 | super().__init__(**kwargs) 80 | self.builder = APIBuilder() 81 | self.executor = StackTraceRetryExecutor(retries=0) # disable retries 82 | self._verbose = verbose 83 | self._request = None 84 | self._code = None 85 | self._result = None 86 | 87 | @property 88 | def _runnable(self): 89 | return self.executor._runnable 90 | 91 | def forward(self, aggregate, request: Symbol, presets, **kwargs) -> Symbol: 92 | answer, refs, code, code2, rand = presets() 93 | self._request = self._to_symbol(request) 94 | if self._verbose: print('[REQUEST]', self._request) 95 | # Generate the code to implement the API call 96 | try: 97 | self._code = self.builder(self._request) 98 | except Exception as e: 99 | code_score = 0.0 | aggregate.code_score 100 | web_score = 0.0 | aggregate.web_score 101 | return [code_score, web_score] 102 | if self._verbose: print('[GENERATED_CODE]', self._code) 103 | base_score = code.measure(code2) | aggregate.base_score 104 | rand_score = 
rand.measure(refs) | aggregate.rand_score 105 | code_score = code.measure(self._code, normalize=normalize(base_score, rand_score)) | aggregate.code_score 106 | code_score = code_score.value 107 | # Execute the code to define the 'run' function 108 | try: 109 | self._result = self.executor(str(self._code), request=self._request) | aggregate.output 110 | if self._verbose: print('[RESULT]:', self._result) 111 | web_score = answer.measure(self._result) | aggregate.web_score 112 | web_score = web_score.value 113 | except Exception as e: 114 | self._result = str(e) 115 | web_score = 0.0 | aggregate.web_score 116 | self._value = self._result 117 | return [code_score, web_score] 118 | 119 | 120 | @toggle_test(ACTIVE, default=MOCK_RETURN) 121 | def test_api_builder(aggregate): 122 | answer = Symbol("Yannic Kilcher") | aggregate.answer 123 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.random_seq 124 | reader = FileReader() 125 | website = reader(os.path.join(cur_file_dir, 'snippets/code_api_builder_website_result.txt')) 126 | ref_code = reader(os.path.join(cur_file_dir, 'snippets/code_api_builder.txt')) | aggregate.ref_code 127 | ref_code2 = reader(os.path.join(cur_file_dir, 'snippets/code_api_builder2.txt')) | aggregate.ref_code2 128 | refs = Symbol([ref_code, ref_code2]).mean(axis=0) | aggregate.refs 129 | executor = APIExecutor() # creates code on the fly and executes it 130 | scores = executor(aggregate, 131 | 'Fetch data from URL https://www.ykilcher.com/ and use Function to extract the full name of the author.', # the request 132 | lambda: (answer, refs, ref_code, ref_code2, rand_seq)) # interprets the instruction to generate an HTTP request 133 | return True, {'scores': scores} 134 | 135 | 136 | @toggle_test(ACTIVE, default=MOCK_RETURN) 137 | def test_expression_builder(aggregate): 138 | solution1 = Symbol(""" 139 | # do not remove or change the imports 140 | from symai import Expression, Function, Symbol 141 | class QueryExpression(Expression): 142 | # initialize the expression with task specific arguments 143 | def __init__(self, prompt: str, **kwargs): 144 | super().__init__(**kwargs) 145 | self.func = Function(prompt, **kwargs) 146 | 147 | # define the forward function with data specific arguments 148 | def forward(self, sym: Symbol, *args, **kwargs) -> Symbol: 149 | sym = self._to_symbol(sym) 150 | result = self.func(sym, *args, **kwargs) 151 | return result 152 | # assign the expression type to the variable _value_obj_ 153 | _value_obj_ = QueryExpression 154 | """) | aggregate.solution1 155 | solution2 = Symbol(""" 156 | from symai import Expression, Function, Symbol 157 | class QueryExpression(Expression): 158 | def __init__(self, prompt: str, **kwargs): 159 | super().__init__(**kwargs) 160 | self.func = Function(prompt, **kwargs) 161 | def forward(self, sym: Symbol, *args, **kwargs) -> Symbol: 162 | sym = self._to_symbol(sym) 163 | return self.func(sym, *args, **kwargs) 164 | _value_obj_ = QueryExpression 165 | """) | aggregate.solution2 166 | solutions = Symbol([solution1, solution2]).mean(axis=0) | aggregate.solutions 167 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.random_seq 168 | builder = ExpressionBuilder() 169 | code = builder("Create a query Expression that initializes a Function with a prompt and processes a data Symbol based on the custom Function.") 170 | runner = RuntimeExpression() 171 | scoring = [] 172 | try: 173 | expr = runner(code) 174 | score = 1.0 | aggregate.code_score 175 | scoring.append(score) 176 | # initialize the expression with the 
prompt 177 | query = expr('extract the names from the text') 178 | except Exception: 179 | score = 0.0 | aggregate.code_score 180 | scoring.append(score) 181 | base_score = solution1.measure(solution2) | aggregate.base_score 182 | rand_score = solutions.measure(rand_seq) | aggregate.rand_score 183 | score = solution1.measure(code, normalize=normalize(base_score, rand_score)) | aggregate.code_score 184 | scoring.append(score.value) 185 | try: 186 | # run the expression on the data 187 | res = query('Hello my name is Max and I am 20 years old.') | aggregate.query_res 188 | score = res.measure('Max') | aggregate.query_score 189 | scoring.append(score.value) 190 | except Exception: 191 | score = 0.0 | aggregate.query_score 192 | scoring.append(score) 193 | return True, {'scores': scoring} 194 | -------------------------------------------------------------------------------- /src/evals/snippets/code_api_builder.txt: -------------------------------------------------------------------------------- 1 | def run(text: str) -> str: # [MANAGED] entry point cannot be changed 2 | # [MANAGED-BEGIN] mandatory imports here 3 | import traceback 4 | import requests 5 | from symai import Function 6 | # [MANAGED-END] mandatory imports here 7 | 8 | # executable code here 9 | try: # [MANAGED] must contain this line, do not change 10 | # API call to fetch data from URL 11 | response = requests.get('https://www.ykilcher.com/') 12 | 13 | # Check if the request was successful 14 | if response.status_code == 200: 15 | res = response.text # Get the content of the response 16 | else: 17 | res = f"Error: {response.status_code}" 18 | 19 | # mandatory return statement here 20 | res = str(res) # [MANAGED] must contain this line, do not change 21 | 22 | # Use the Function class to extract the full name from the text 23 | func = Function('Extract full name from text') 24 | res = func(res) 25 | 26 | return res # [MANAGED] must return a string, do not change 27 | except Exception as e: # [MANAGED] must catch all exceptions and return them as string 28 | tb = traceback.format_exc() # [MANAGED] return full error stack trace as string 29 | return tb # [MANAGED] return tb as string, do not change 30 | 31 | # Example request value 32 | value = "Fetch data from URL https://www.ykilcher.com/" 33 | # mandatory statement here 34 | res = run(value) # [MANAGED] must contain this line, do not change 35 | -------------------------------------------------------------------------------- /src/evals/snippets/code_api_builder2.txt: -------------------------------------------------------------------------------- 1 | def run(text: str) -> str: 2 | import requests 3 | from symai import Function 4 | url = 'https://www.ykilcher.com/' 5 | rsp = requests.get(url) 6 | if rsp.status_code != 200: 7 | raise Exception(f"Error: {rsp.status_code}") 8 | res = rsp.text 9 | res = str(res) 10 | func = Function('extract the name from text') 11 | return func(res) 12 | value = "Fetch data from URL https://www.ykilcher.com/" 13 | res = run(value) 14 | -------------------------------------------------------------------------------- /src/evals/snippets/code_api_builder_website_result.txt: -------------------------------------------------------------------------------- 1 | 'Yannic Kilcher