├── .gitignore
├── LICENSE
├── README.md
├── assets
│   └── results.png
├── config.json
├── package.json
├── populate_index.py
├── requirements.txt
├── src
│   ├── __init__.py
│   ├── engines
│   │   ├── __init__.py
│   │   ├── engine_google_vertex.py
│   │   ├── engine_llamacpp.py
│   │   └── engine_mockup.py
│   ├── evals
│   │   ├── __init__.py
│   │   ├── assets
│   │   │   └── sample_bill.jpg
│   │   ├── components
│   │   │   ├── __init__.py
│   │   │   ├── api_builder.py
│   │   │   ├── factorization.py
│   │   │   └── paper.py
│   │   ├── eval_computation_graphs.py
│   │   ├── eval_in_context_associations.py
│   │   ├── eval_logic_components.py
│   │   ├── eval_multimodal_bindings.py
│   │   ├── eval_program_synthesis.py
│   │   └── snippets
│   │       ├── code_api_builder.txt
│   │       ├── code_api_builder2.txt
│   │       ├── code_api_builder_website_result.txt
│   │       ├── einstein_puzzle.txt
│   │       ├── einstein_puzzle_human_solution.txt
│   │       ├── einstein_puzzle_logic_solution.txt
│   │       ├── formulations_dsl_rewriting.txt
│   │       ├── google_organic_results_20240111_query=What-is-sulfuric-acid.txt
│   │       ├── google_organic_results_20240121_query=Search-for-U-235.txt
│   │       ├── jays_brother_human_solution.txt
│   │       ├── jays_brother_trajectories.txt
│   │       ├── latex_templating_output.txt
│   │       ├── latex_templating_problem.txt
│   │       ├── latex_templating_solution_1.txt
│   │       ├── latex_templating_solution_2.txt
│   │       ├── paper
│   │       │   ├── bib
│   │       │   │   └── related_work
│   │       │   │       ├── laird87.txt
│   │       │   │       ├── mccarthy06.txt
│   │       │   │       ├── newell56.txt
│   │       │   │       ├── newell57.txt
│   │       │   │       └── newell72.txt
│   │       │   ├── method
│   │       │   │   └── symbolicai_docs.txt
│   │       │   ├── ref
│   │       │   │   ├── reference_abstract.txt
│   │       │   │   ├── reference_paper.txt
│   │       │   │   ├── reference_section_framework.txt
│   │       │   │   ├── reference_section_relatedwork.txt
│   │       │   │   └── reference_title.txt
│   │       │   └── traj
│   │       │       ├── reference_abstract.txt
│   │       │       ├── reference_paper.txt
│   │       │       ├── reference_section_framework.txt
│   │       │       ├── reference_section_relatedwork.txt
│   │       │       └── reference_title.txt
│   │       ├── richard_feynman_summary.txt
│   │       ├── sample_bill.txt
│   │       └── wiki_page_20240121.txt
│   ├── func.py
│   ├── report.py
│   └── utils.py
└── test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | src/evals/.DS_Store
162 | src/.DS_Store
163 | .DS_Store
164 | experiments
165 | symai.config.json
166 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2023, ExtensityAI
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | 1. Redistributions of source code must retain the above copyright notice, this
9 | list of conditions and the following disclaimer.
10 |
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 |
15 | 3. Neither the name of the copyright holder nor the names of its
16 | contributors may be used to endorse or promote products derived from
17 | this software without specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 |
3 | ## SymbolicAI: A framework for logic-based approaches combining generative models and solvers
4 |
5 | We introduce SymbolicAI, a versatile and modular framework employing a logic-based approach to concept learning and flow management in generative processes. SymbolicAI enables the seamless integration of generative models with a diverse range of solvers by treating large language models (LLMs) as semantic parsers that execute tasks based on both natural and formal language instructions, thus bridging the gap between symbolic reasoning and generative AI. We leverage probabilistic programming principles to tackle complex tasks, and utilize differentiable and classical programming paradigms with their respective strengths. The framework introduces a set of polymorphic, compositional, and self-referential operations for data stream manipulation, aligning LLM outputs with user objectives. As a result, we can transition between the capabilities of various foundation models endowed with zero- and few-shot learning capabilities and specialized, fine-tuned models or solvers proficient in addressing specific problems. In turn, the framework facilitates the creation and evaluation of explainable computational graphs. We conclude by introducing a quality measure and its empirical score for evaluating these computational graphs, and propose a benchmark that compares various state-of-the-art LLMs across a set of complex workflows. We refer to the empirical score as the "Vector Embedding for Relational Trajectory Evaluation through Cross-similarity", or VERTEX score for short. The SymbolicAI framework codebase is available [here](https://github.com/ExtensityAI/symbolicai).
6 |
7 |
8 |
9 | ## Installation
10 |
11 | ### Requirements
12 |
13 | Install dependencies.
14 |
15 | ```bash
16 | pip install "symbolicai[all]"
17 | pip install -r requirements.txt
18 | ```
19 |
20 | Install the LlamaCpp backend.
21 |
22 | ```bash
23 | sympkg i ExtensityAI/llamacpp
24 | ```
25 |
26 | Then follow the instructions in the [ExtensityAI/llamacpp](https://github.com/ExtensityAI/llamacpp) repository to install and run the LlamaCpp backend with various HuggingFace models.
27 |
28 | Install the embeddings backend.
29 |
30 | ```bash
31 | sympkg i ExtensityAI/embeddings
32 | ```
33 |
34 | ## Configuration
35 |
36 | Set the respective `config.json` properties for engine API keys and local models as shown below, and run the local models with the configured port and host name.
37 |
38 | ```json
39 | {
40 | "gpt4": {
41 | "api_key": "",
42 | "model": "gpt-4-1106-preview"
43 | },
44 | "gpt3.5": {
45 | "api_key": "",
46 | "model": "gpt-3.5-turbo-1106"
47 | },
48 | "gemini": {
49 | "api_key": "",
50 | "model": "gemini-pro"
51 | },
52 | "llama": {
53 | "host": "http://localhost",
54 | "port": 8080
55 | },
56 | ...
57 | }
58 | ```
59 |
60 | ## Usage
61 |
62 | Run the full benchmark.
63 |
64 | ```bash
65 | python test.py --context_associations --program_synthesis --multimodal_bindings --logic_components --computation_graphs
66 | ```
67 |
68 | This will run all the evaluations in the benchmark.
69 |
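70 | You can also run a subset of the evaluations by passing only the relevant flags, e.g. (assuming `test.py` treats each flag independently, as the full command above suggests):
71 |
72 | ```bash
73 | python test.py --context_associations --logic_components
74 | ```
75 |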
70 | ## Cite us
71 |
72 | ```bibtex
73 | @article{
74 | Dinu:24,
75 | title={SymbolicAI: A framework for logic-based approaches combining generative models and solvers},
76 | author={Marius-Constantin Dinu and Claudiu Leoveanu-Condrei and Markus Holzleitner and Werner Zellinger and Sepp Hochreiter},
77 | year={2024},
78 | eprint={2402.00854},
79 | archivePrefix={arXiv},
80 | primaryClass={cs.LG},
81 | url={https://arxiv.org/abs/2402.00854}
82 | }
83 | ```
84 |
--------------------------------------------------------------------------------
/assets/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/assets/results.png
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "gpt4": {
3 | "api_key": "",
4 | "model": "gpt-4-1106-preview"
5 | },
6 | "gpt3.5": {
7 | "api_key": "",
8 | "model": "gpt-3.5-turbo"
9 | },
10 | "gemini": {
11 | "api_key": "",
12 | "model": "gemini-1.0-pro"
13 | },
14 | "llama": {
15 | "host": "http://localhost",
16 | "port": 8080
17 | },
18 | "zephyr": {
19 | "host": "http://localhost",
20 | "port": 8081
21 | },
22 | "mistral": {
23 | "host": "http://localhost",
24 | "port": 8082
25 | },
26 | "llama3_8B": {
27 | "host": "http://localhost",
28 | "port": 8083
29 | },
30 | "llama3_70B": {
31 | "host": "http://localhost",
32 | "port": 8084
33 | },
34 | "gemini1.5": {
35 | "api_key": "",
36 | "model": "gemini-1.5-pro-latest"
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.0.1",
3 | "name": "ExtensityAI/benchmark",
4 | "description": "Evaluation of the neuro-symbolic framework.",
5 | "expressions": [
6 | {
7 | "module": "src/func",
8 | "type": "EvaluateBenchmark"
9 | }
10 | ],
11 | "run": {
12 | "module": "src/func",
13 | "type": "EvaluateBenchmark"
14 | },
15 | "dependencies": [
16 | "ExtensityAI/llamacpp",
17 | "ExtensityAI/tuning"
18 | ]
19 | }
--------------------------------------------------------------------------------
/populate_index.py:
--------------------------------------------------------------------------------
1 | from symai.shellsv import retrieval_augmented_indexing
2 | from symai.functional import EngineRepository
3 | from symai.backend.engines.index.engine_pinecone import PineconeIndexEngine
4 | from symai.backend.engines.index.engine_vectordb import VectorDBIndexEngine
5 |
6 |
7 | def run():
8 | # Register embeddings engine globally for all Symbols from plugin
9 | EngineRepository.register_from_plugin('embedding', plugin='ExtensityAI/embeddings', kwargs={'model': 'all-mpnet-base-v2'}, allow_engine_override=True)
10 | # EngineRepository.register('index', PineconeIndexEngine(index_name='dataindex',
11 | # index_dims=768,
12 | # index_top_k=5))
13 | vectorDB = VectorDBIndexEngine(index_name='dataindex',
14 | index_dims=768,
15 | index_top_k=5)
16 | EngineRepository.register('index', vectorDB)
17 | # insert into the index
18 | retrieval_augmented_indexing('!src/evals/snippets', index_name='dataindex')
19 | # persist the in-memory index to disk
20 | vectorDB.save()
21 |
22 |
23 | if __name__ == '__main__':
24 | run()
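25 |
26 | # Usage note (assumption): run as `python populate_index.py` from the repository
27 | # root so that the relative `src/evals/snippets` path resolves; the populated
28 | # index is then persisted to disk via `vectorDB.save()` above.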
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | backoff
2 | seaborn
3 | google-cloud-aiplatform
4 | google-generativeai
5 | anthropic
6 | wandb
7 | parso
8 | sympy
9 | z3-solver
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/__init__.py
--------------------------------------------------------------------------------
/src/engines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/engines/__init__.py
--------------------------------------------------------------------------------
/src/engines/engine_google_vertex.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import google.generativeai as genai
3 |
4 | from typing import List, Optional
5 |
6 | from symai.backend.base import Engine
7 | from symai.backend.settings import SYMAI_CONFIG
8 |
9 |
10 | logging.getLogger("requests").setLevel(logging.ERROR)
11 | logging.getLogger("urllib").setLevel(logging.ERROR)
12 | logging.getLogger("httpx").setLevel(logging.ERROR)
13 | logging.getLogger("httpcore").setLevel(logging.ERROR)
14 |
15 |
16 | class GoogleGeminiEngine(Engine):
17 | def __init__(self, api_key: Optional[str] = None, model: Optional[str] = None):
18 | super().__init__()
19 | logger = logging.getLogger('vertexai')
20 | logger.setLevel(logging.WARNING)
21 | self.config = SYMAI_CONFIG
22 | # Initialize the Vertex AI project
23 | self.api_key = api_key
24 | genai.configure(api_key=api_key)
25 | # Create a generative model instance from Vertex AI
26 | self.model = genai.GenerativeModel(model_name=model)
27 | self.max_tokens = 32_760 - 100 # @NOTE: account for tolerance.
28 | self.seed = None
29 | self.except_remedy = None
30 |
31 | def id(self) -> str:
32 | if self.config['NEUROSYMBOLIC_ENGINE_MODEL'] and \
33 | self.config['NEUROSYMBOLIC_ENGINE_MODEL'].startswith('gemini'):
34 | return 'neurosymbolic'
35 | return super().id() # default to unregistered
36 |
37 | def command(self, *args, **kwargs):
38 | super().command(*args, **kwargs)
39 | if 'NEUROSYMBOLIC_ENGINE_MODEL' in kwargs:
40 | self.model = kwargs['NEUROSYMBOLIC_ENGINE_MODEL']
41 | if 'seed' in kwargs:
42 | self.seed = kwargs['seed']
43 | if 'except_remedy' in kwargs:
44 | self.except_remedy = kwargs['except_remedy']
45 |
46 | def compute_remaining_tokens(self, prompts: list) -> int:
47 | return int((8_192) * 0.99) # @NOTE: account for tolerance.
48 |
49 | def forward(self, argument):
50 | kwargs = argument.kwargs
51 | prompts_ = argument.prop.prepared_input
52 |
53 | # send prompt to the Gemini chat model
54 | stop = kwargs['stop'] if 'stop' in kwargs else None
55 | model = kwargs['model'] if 'model' in kwargs else self.model
56 | seed = kwargs['seed'] if 'seed' in kwargs else self.seed
57 |
58 | # resolve generation kwargs with defaults
59 | max_tokens = kwargs['max_tokens'] if 'max_tokens' in kwargs else self.compute_remaining_tokens(prompts_)
60 | temperature = kwargs['temperature'] if 'temperature' in kwargs else 0.1
61 | top_p = kwargs['top_p'] if 'top_p' in kwargs else 1
62 | top_k = kwargs['top_k'] if 'top_k' in kwargs else 40
63 | except_remedy = kwargs['except_remedy'] if 'except_remedy' in kwargs else self.except_remedy
64 |
65 | try:
66 | res = model.generate_content(
67 | prompts_,
68 | generation_config={
69 | "temperature": temperature,
70 | "max_output_tokens": max_tokens,
71 | "top_p": top_p,
72 | "top_k": top_k
73 | }
74 | )
75 |
76 | except Exception as e:
77 | callback = model.generate_content
78 | kwargs['model'] = kwargs['model'] if 'model' in kwargs else self.model
79 | if except_remedy is not None:
80 | res = except_remedy(self, e, callback, argument)
81 | else:
82 | raise e
83 |
84 | metadata = {}
85 | output = [res.text]
86 | return output, metadata
87 |
88 | def prepare(self, argument):
89 | if argument.prop.raw_input:
90 | if not argument.prop.processed_input:
91 | raise ValueError('Need to provide a prompt instruction to the engine if raw_input is enabled.')
92 | argument.prop.prepared_input = str(argument.prop.processed_input)
93 | return
94 |
95 | _non_verbose_output = """[META INSTRUCTIONS START]\nYou do not output anything else, like verbose preambles or post explanation, such as "Sure, let me...", "Hope that was helpful...", "Yes, I can help you with that...", etc. Consider well formatted output, e.g. for sentences use punctuation, spaces etc. or for code use indentation, etc. Never add meta instructions information to your output!\n"""
96 | user: str = ""
97 | system: str = ""
98 |
99 | if argument.prop.suppress_verbose_output:
100 | system += _non_verbose_output
101 | system = f'{system}\n' if system and len(system) > 0 else ''
102 |
103 | ref = argument.prop.instance
104 | static_ctxt, dyn_ctxt = ref.global_context
105 | if len(static_ctxt) > 0:
106 | system += f"[STATIC CONTEXT]\n{static_ctxt}\n\n"
107 |
108 | if len(dyn_ctxt) > 0:
109 | system += f"[DYNAMIC CONTEXT]\n{dyn_ctxt}\n\n"
110 |
111 | payload = argument.prop.payload
112 | if argument.prop.payload:
113 | system += f"[ADDITIONAL CONTEXT]\n{str(payload)}\n\n"
114 |
115 | examples: List[str] = argument.prop.examples
116 | if examples and len(examples) > 0:
117 | system += f"[EXAMPLES]\n{str(examples)}\n\n"
118 |
119 | if argument.prop.prompt is not None and len(argument.prop.prompt) > 0:
120 | val = str(argument.prop.prompt)
121 | system += f"[INSTRUCTION]\n{val}"
122 |
123 | suffix: str = str(argument.prop.processed_input)
124 |
125 | if '[SYSTEM_INSTRUCTION::]: <<<' in suffix and argument.prop.parse_system_instructions:
126 | parts = suffix.split('\n>>>\n')
127 | # first parts are the system instructions
128 | c = 0
129 | for i, p in enumerate(parts):
130 | if 'SYSTEM_INSTRUCTION' in p:
131 | system += f"{p}\n"
132 | c += 1
133 | else:
134 | break
135 | # last part is the user input
136 | suffix = '\n>>>\n'.join(parts[c:])
137 | user += f"{suffix}"
138 |
139 | if argument.prop.template_suffix:
140 | user += f"\n[[PLACEHOLDER]]\n{str(argument.prop.template_suffix)}\n\n"
141 | user += f"Only generate content for the placeholder `[[PLACEHOLDER]]` following the instructions and context information. Do NOT write `[[PLACEHOLDER]]` or anything else in your output.\n\n"
142 |
143 | argument.prop.prepared_input = f'---------SYSTEM BEHAVIOR--------\n{system}\n\n---------USER REQUEST--------\n{user}'
144 |
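145 | # Illustrative registration sketch (assumption; requires a valid Gemini API key,
146 | # cf. the `gemini` entry in config.json):
147 | if __name__ == '__main__':
148 |     from symai.functional import EngineRepository
149 |     EngineRepository.register('neurosymbolic', GoogleGeminiEngine(api_key='...', model='gemini-pro'))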
--------------------------------------------------------------------------------
/src/engines/engine_llamacpp.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import requests
3 | import json
4 |
5 | from typing import List
6 | from requests_toolbelt.multipart.encoder import MultipartEncoder
7 |
8 | from symai.backend.base import Engine
9 | from symai.backend.settings import SYMAI_CONFIG
10 | from symai.symbol import Result
11 |
12 |
13 | class LLaMAResult(Result):
14 | def __init__(self, value=None, *args, **kwargs):
15 | super().__init__(value, *args, **kwargs)
16 | self._value = value
17 | self.error = None
18 | self.raw = value
19 | self._parse_result()
20 |
21 | def _parse_result(self):
22 | val = json.loads(self.value)
23 | self.value = val
24 | if 'error' in val:
25 | self.error = val['error']
26 | if 'content' in val:
27 | self.value = val['content']
28 |
29 |
30 | class LLaMACppClientEngine(Engine):
31 | def __init__(self, host: str = 'http://localhost', port: int = 8080, uri: str = 'completion', timeout: int = 600):
32 | super().__init__()
33 | logger = logging.getLogger('nesy_client')
34 | logger.setLevel(logging.WARNING)
35 | self.config = SYMAI_CONFIG
36 | self.host = host
37 | self.port = port
38 | self.uri = uri
39 | self.timeout = timeout
40 | self.seed = None
41 | self.except_remedy = None
42 |
43 | def id(self) -> str:
44 | if self.config['CAPTION_ENGINE_MODEL'] and \
45 | 'llamacpp' in self.config['CAPTION_ENGINE_MODEL']:
46 | return 'neurosymbolic'
47 | return super().id() # default to unregistered
48 |
49 | def command(self, *args, **kwargs):
50 | super().command(*args, **kwargs)
51 | if 'seed' in kwargs:
52 | self.seed = kwargs['seed']
53 | if 'except_remedy' in kwargs:
54 | self.except_remedy = kwargs['except_remedy']
55 |
56 | @property
57 | def max_tokens(self):
58 | return 2048
59 |
60 | def compute_remaining_tokens(self, prompts: list) -> int:
61 | return int((1024) * 0.99) # @NOTE: account for tolerance.
62 |
63 | def forward(self, argument):
64 | prompts = argument.prop.prepared_input
65 | kwargs = argument.kwargs
66 |
67 | model_kwargs = {}
68 |
69 | # resolve generation kwargs with defaults
70 | stop = kwargs['stop'] if 'stop' in kwargs else None
71 | seed = kwargs['seed'] if 'seed' in kwargs else self.seed
72 | max_tokens = kwargs['max_tokens'] if 'max_tokens' in kwargs else self.compute_remaining_tokens(prompts)
73 | temperature = kwargs['temperature'] if 'temperature' in kwargs else 0.7
74 | top_p = kwargs['top_p'] if 'top_p' in kwargs else 0.95
75 | top_k = kwargs['top_k'] if 'top_k' in kwargs else 40
76 | except_remedy = kwargs['except_remedy'] if 'except_remedy' in kwargs else self.except_remedy
77 |
78 | if stop is not None:
79 | model_kwargs['stop'] = stop
80 | if seed is not None:
81 | model_kwargs['seed'] = seed
82 | if max_tokens is not None:
83 | model_kwargs['n_predict'] = max_tokens
84 | if temperature is not None:
85 | model_kwargs['temperature'] = temperature
86 | if top_p is not None:
87 | model_kwargs['top_p'] = top_p
88 | if top_k is not None:
89 | model_kwargs['top_k'] = top_k
90 |
91 | # Construct the request payload
92 | # (the llama.cpp server expects JSON, not multipart/form-data)
93 | prompt = prompts[0] if prompts[0] is not None and len(prompts[0]) > 0 else ' ' # @NOTE: space char to produce at least empty prompt and avoid exception on server side
94 | payload = {
95 | 'prompt': prompt,
96 | **model_kwargs
97 | }
98 | headers = {'Content-Type': 'application/json'}
99 | api = f'{self.host}:{self.port}/{self.uri}'
100 | try:
101 | rsp = requests.post(api, json=payload, headers=headers, timeout=self.timeout)
102 | # Verify the success of the response
103 | rsp.raise_for_status()
104 | res = rsp.text
105 | except requests.exceptions.HTTPError as e:
106 | if except_remedy is None:
107 | self.logger.error(f"HTTP error occurred: {e}")
108 | # Here you can add more sophisticated error handling and recovery
109 | raise e
110 | # Retry the request or handle it based on the exception remedy provided
111 | callback = lambda: requests.post(api, json=payload, headers=headers, timeout=self.timeout)
112 | res = except_remedy(self, e, callback, argument)
113 | except requests.exceptions.RequestException as e:
114 | # Handle non-HTTP exceptions (e.g., network errors, timeout)
115 | if except_remedy is None:
116 | self.logger.error(f"Request error occurred: {e}")
117 | raise e
118 | # Retry the request or handle it based on the exception remedy provided
119 | callback = lambda: requests.post(api, json=payload, headers=headers, timeout=self.timeout)
120 | res = except_remedy(self, e, callback, argument)
121 | except Exception as e:
122 | # Handle unforeseen exceptions
123 | self.logger.error(f"An unexpected error occurred: {e}")
124 | raise e
125 |
126 | metadata = {}
127 |
128 | try:
129 | res = LLaMAResult(res)
130 | except json.JSONDecodeError:
131 | # Handle a JSON parse error specifically
132 | self.logger.error(f"JSON parse error: Invalid response {res}")
133 | raise Exception(f"Invalid response: {res}")
134 |
135 | rsp = [res]
136 | output = rsp if isinstance(prompts, list) else rsp[0]
137 | return output, metadata
138 |
139 | def prepare(self, argument):
140 | if argument.prop.raw_input:
141 | if not argument.prop.processed_input:
142 | raise ValueError('Need to provide a prompt instruction to the engine if raw_input is enabled.')
143 | argument.prop.prepared_input = [str(argument.prop.processed_input)]
144 | return
145 |
146 | _non_verbose_output = """[META INSTRUCTIONS START]\nYou do not output anything else, like verbose preambles or post explanation, such as "Sure, let me...", "Hope that was helpful...", "Yes, I can help you with that...", etc. Consider well formatted output, e.g. for sentences use punctuation, spaces etc. or for code use indentation, etc. Never add meta instructions information to your output!\n"""
147 | user: str = ""
148 | system: str = ""
149 |
150 | if argument.prop.suppress_verbose_output:
151 | system += _non_verbose_output
152 | system = f'{system}\n' if system and len(system) > 0 else ''
153 |
154 | ref = argument.prop.instance
155 | static_ctxt, dyn_ctxt = ref.global_context
156 | if len(static_ctxt) > 0:
157 | system += f"[STATIC CONTEXT]\n{static_ctxt}\n\n"
158 |
159 | if len(dyn_ctxt) > 0:
160 | system += f"[DYNAMIC CONTEXT]\n{dyn_ctxt}\n\n"
161 |
162 | payload = argument.prop.payload
163 | if argument.prop.payload:
164 | system += f"[ADDITIONAL CONTEXT]\n{str(payload)}\n\n"
165 |
166 | examples: List[str] = argument.prop.examples
167 | if examples and len(examples) > 0:
168 | system += f"[EXAMPLES]\n{str(examples)}\n\n"
169 |
170 | if argument.prop.prompt is not None and len(argument.prop.prompt) > 0:
171 | val = str(argument.prop.prompt)
172 | # in this engine, instructions are considered as user prompts
173 | user += f"[INSTRUCTION]\n{val}"
174 |
175 | suffix: str = str(argument.prop.processed_input)
176 |
177 | if '[SYSTEM_INSTRUCTION::]: <<<' in suffix and argument.prop.parse_system_instructions:
178 | parts = suffix.split('\n>>>\n')
179 | # first parts are the system instructions
180 | c = 0
181 | for i, p in enumerate(parts):
182 | if 'SYSTEM_INSTRUCTION' in p:
183 | system += f"{p}\n"
184 | c += 1
185 | else:
186 | break
187 | # last part is the user input
188 | suffix = '\n>>>\n'.join(parts[c:])
189 | user += f"{suffix}"
190 |
191 | if argument.prop.template_suffix:
192 | user += f"\n[[PLACEHOLDER]]\n{str(argument.prop.template_suffix)}\n\n"
193 | user += f"Only generate content for the placeholder `[[PLACEHOLDER]]` following the instructions and context information. Do NOT write `[[PLACEHOLDER]]` or anything else in your output.\n\n"
194 |
195 | argument.prop.prepared_input = [f'---------SYSTEM BEHAVIOR--------\n{system}\n\n---------USER REQUEST--------\n{user}']
196 |
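197 | # Illustrative registration sketch (assumption, mirroring populate_index.py;
198 | # host/port must match a running llama.cpp server, cf. `llama` in config.json):
199 | if __name__ == '__main__':
200 |     from symai.functional import EngineRepository
201 |     EngineRepository.register('neurosymbolic', LLaMACppClientEngine(host='http://localhost', port=8080))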
--------------------------------------------------------------------------------
/src/engines/engine_mockup.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from box import Box
4 | from typing import List
5 |
6 | from symai.backend.base import Engine
7 | from symai.backend.settings import SYMAI_CONFIG
8 | from symai.symbol import Result
9 |
10 | from ..utils import RANDOM_RESPONSE
11 |
12 |
13 | class MockupResult(Result):
14 | def __init__(self, value=None, *args, **kwargs):
15 | super().__init__(value, *args, **kwargs)
16 | self._value = value
17 | self.error = None
18 | self.raw = value
19 |
20 |
21 | class MockupEngine(Engine):
22 | def __init__(self, verbose: bool = False):
23 | super().__init__()
24 | self.logger = logging.getLogger('mockup')
25 | self.logger.setLevel(logging.DEBUG)
26 | self.config = SYMAI_CONFIG
27 | self.seed = None
28 | self.except_remedy = None
29 | self.verbose = verbose
30 |
31 | def id(self) -> str:
32 | return super().id() # default to unregistered
33 |
34 | def command(self, *args, **kwargs):
35 | super().command(*args, **kwargs)
36 |
37 | @property
38 | def max_tokens(self):
39 | return 2048
40 |
41 | def compute_remaining_tokens(self, prompts: list) -> int:
42 | return int((1024) * 0.99)
43 |
44 | def forward(self, argument):
45 | prompts = argument.prop.prepared_input
46 | kwargs = argument.kwargs
47 |
48 | model_kwargs = {}
49 |
50 | # resolve generation kwargs with defaults
51 | stop = kwargs['stop'] if 'stop' in kwargs else None
52 | seed = kwargs['seed'] if 'seed' in kwargs else self.seed
53 | max_tokens = kwargs['max_tokens'] if 'max_tokens' in kwargs else self.compute_remaining_tokens(prompts)
54 | temperature = kwargs['temperature'] if 'temperature' in kwargs else 0.7
55 | top_p = kwargs['top_p'] if 'top_p' in kwargs else 0.95
56 | top_k = kwargs['top_k'] if 'top_k' in kwargs else 40
57 | except_remedy = kwargs['except_remedy'] if 'except_remedy' in kwargs else self.except_remedy
58 |
59 | if stop is not None:
60 | model_kwargs['stop'] = stop
61 | if seed is not None:
62 | model_kwargs['seed'] = seed
63 | if max_tokens is not None:
64 | model_kwargs['n_predict'] = max_tokens
65 | if temperature is not None:
66 | model_kwargs['temperature'] = temperature
67 | if top_p is not None:
68 | model_kwargs['top_p'] = top_p
69 | if top_k is not None:
70 | model_kwargs['top_k'] = top_k
71 |
72 | if self.verbose:
73 | self.logger.debug(f"kwargs: {kwargs}")
74 | self.logger.debug(f"prompts: {prompts}")
75 | self.logger.debug(f"model_kwargs: {model_kwargs}")
76 |
77 | # Construct a mocked response payload
78 | # (no external server is contacted; the canned RANDOM_RESPONSE is returned)
79 | try:
80 | rsp = Box({
81 | 'text': RANDOM_RESPONSE
82 | })
83 | # Verify the success of the response
84 | res = rsp.text
85 | except Exception as e:
86 | # Handle unforeseen exceptions
87 | self.logger.error(f"An unexpected error occurred: {e}")
88 | raise e
89 |
90 | metadata = {}
91 | res = MockupResult(res)
92 |
93 | rsp = [res]
94 | output = rsp if isinstance(prompts, list) else rsp[0]
95 | return output, metadata
96 |
97 | def prepare(self, argument):
98 | if argument.prop.raw_input:
99 | if not argument.prop.processed_input:
100 | raise ValueError('Need to provide a prompt instruction to the engine if raw_input is enabled.')
101 | argument.prop.prepared_input = [str(argument.prop.processed_input)]
102 | return
103 |
104 | _non_verbose_output = """[META INSTRUCTIONS START]\nYou do not output anything else, like verbose preambles or post explanation, such as "Sure, let me...", "Hope that was helpful...", "Yes, I can help you with that...", etc. Consider well formatted output, e.g. for sentences use punctuation, spaces etc. or for code use indentation, etc. Never add meta instructions information to your output!\n"""
105 | user: str = ""
106 | system: str = ""
107 |
108 | if argument.prop.suppress_verbose_output:
109 | system += _non_verbose_output
110 | system = f'{system}\n' if system and len(system) > 0 else ''
111 |
112 | ref = argument.prop.instance
113 | static_ctxt, dyn_ctxt = ref.global_context
114 | if len(static_ctxt) > 0:
115 | system += f"[STATIC CONTEXT]\n{static_ctxt}\n\n"
116 |
117 | if len(dyn_ctxt) > 0:
118 | system += f"[DYNAMIC CONTEXT]\n{dyn_ctxt}\n\n"
119 |
120 | payload = argument.prop.payload
121 | if argument.prop.payload:
122 | system += f"[ADDITIONAL CONTEXT]\n{str(payload)}\n\n"
123 |
124 | examples: List[str] = argument.prop.examples
125 | if examples and len(examples) > 0:
126 | system += f"[EXAMPLES]\n{str(examples)}\n\n"
127 |
128 | if argument.prop.prompt is not None and len(argument.prop.prompt) > 0:
129 | val = str(argument.prop.prompt)
130 | # in this engine, instructions are considered as user prompts
131 | user += f"[INSTRUCTION]\n{val}"
132 |
133 | suffix: str = str(argument.prop.processed_input)
134 |
135 | if '[SYSTEM_INSTRUCTION::]: <<<' in suffix and argument.prop.parse_system_instructions:
136 | parts = suffix.split('\n>>>\n')
137 | # first parts are the system instructions
138 | c = 0
139 | for i, p in enumerate(parts):
140 | if 'SYSTEM_INSTRUCTION' in p:
141 | system += f"{p}\n"
142 | c += 1
143 | else:
144 | break
145 | # last part is the user input
146 | suffix = '\n>>>\n'.join(parts[c:])
147 | user += f"{suffix}"
148 |
149 | if argument.prop.template_suffix:
150 | user += f"\n[[PLACEHOLDER]]\n{str(argument.prop.template_suffix)}\n\n"
151 | user += f"Only generate content for the placeholder `[[PLACEHOLDER]]` following the instructions and context information. Do NOT write `[[PLACEHOLDER]]` or anything else in your output.\n\n"
152 |
153 | argument.prop.prepared_input = [f'---------SYSTEM BEHAVIOR--------\n{system}\n\n---------USER REQUEST--------\n{user}']
154 |
--------------------------------------------------------------------------------
/src/evals/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/evals/__init__.py
--------------------------------------------------------------------------------
/src/evals/assets/sample_bill.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ExtensityAI/benchmark/24d3e93681d454b379d7f1e787b2a2284c41922f/src/evals/assets/sample_bill.jpg
--------------------------------------------------------------------------------
/src/evals/components/__init__.py:
--------------------------------------------------------------------------------
1 | from .factorization import *
--------------------------------------------------------------------------------
/src/evals/components/api_builder.py:
--------------------------------------------------------------------------------
1 | from symai import core
2 | from symai import Expression, Symbol
3 | from symai.pre_processors import PreProcessor
4 | from symai.post_processors import CodeExtractPostProcessor
5 | from symai.components import Execute
6 |
7 |
8 | API_BUILDER_DESCRIPTION = """[Description]
9 | You are an API coding tool for Python that creates API calls to any web URL based on user requests.
10 | For example, if the user wants to use the X API (formerly Twitter) to post a tweet, you will create the required API post call for that, e.g. 'Write Twitter post `hey, what's up` API-Key:...'.
11 | If the user wants to use the X API to get the latest tweets, you will create the API call for that, e.g. 'Read Twitter post https://twitter.com/...'.
12 | Each created function is atomic and can be used as a building block for more complex functions.
13 | You can also create a function that calls other functions. However, all code must be self-contained in one function `run` including all imports.
14 | Another constraint is that there is one mandatory function called `run` as the entry point to the executable, and one provided pre-built function that uses a large language model to extract and parse API call parameters from user requests or to manipulate string-based data as you see fit.
15 | All code parts marked with [MANAGED] are strictly forbidden to be changed! They must be provided as is.
16 | Always generate the entire code for the `run` function, including the `try` and `except` blocks, imports, etc. and the unchanged managed code parts.
17 |
18 | For example, you can write yourself prompts to extract parameters from user requests and use them to create API calls:
19 | ```python
20 | # all code must be self-contained in one function called `run` including all imports
21 | def run(text: str) -> str: # [MANAGED] entry point cannot be changed
22 | # [MANAGED-BEGIN] mandatory imports here
23 | import traceback
24 | import requests
25 | from symai import Function
26 | # [MANAGED-END] mandatory imports here
27 |
28 | # optional imports here
29 | # TODO: all your imports and code here
30 |
31 | # executable code here
32 | try: # [MANAGED] must contain this line, do not change
33 | # optional helper functions here
34 |
35 | # optional params extraction here
36 | # TODO: extract params from request full-text if needed
37 | # Example:
38 | func = Function('YOUR_PROMPT_1') # TODO: extract function param 1
39 | param1 = func(text)
40 | func = Function('YOUR_PROMPT_2') # TODO: extract function param 2
41 | param2 = func(text)
42 | # ... extract more params if needed
43 |
44 | # optional params manipulation here
45 | res = # TODO: run https APIs with the respective params, use tools like requests, urllib, etc.
46 |
47 | # optional result formatting here
48 | # Another example:
49 | func = Function('YOUR_PROMPT_3') # TODO: format result if needed
50 | res = func(res)
51 |
52 | # mandatory return statement here
53 | res = str(res) # [MANAGED] must contain this line, do not change
54 | return res # [MANAGED] must return a string, do not change
55 | except Exception as e: # [MANAGED] must catch all exceptions and return them as string
56 | tb = traceback.format_exc() # [MANAGED] return full error stack trace as string
57 | return tb # [MANAGED] return tb as string, do not change
58 |
59 | # mandatory statement here
60 | res = run(value) # [MANAGED] must contain this line, do not change
61 | ```
62 | """
63 |
64 |
65 | class APIBuilderPreProcessor(PreProcessor):
66 | def __call__(self, argument):
67 | return '$> {} =>'.format(str(argument.args[0]))
68 |
69 |
70 | class APIBuilder(Expression):
71 | @property
72 | def static_context(self) -> str:
73 | return API_BUILDER_DESCRIPTION
74 |
75 | def __init__(self, **kwargs):
76 | super().__init__(**kwargs)
77 | self.sym_return_type = APIBuilder
78 |
79 | def forward(self, sym: Symbol, **kwargs) -> Symbol:
80 | @core.zero_shot(prompt="Build the API call code:\n",
81 | pre_processors=[APIBuilderPreProcessor()],
82 | post_processors=[CodeExtractPostProcessor()], **kwargs)
83 | def _func(_, text) -> str:
84 | pass
85 |
86 | return _func(self, sym)
87 |
88 |
89 | class StackTraceRetryExecutor(Expression):
90 | def __init__(self, retries: int = 1, **kwargs):
91 | super().__init__(**kwargs)
92 | self.executor = Execute()
93 | self.max_retries = retries
94 | self._runnable = None
95 |
96 | def forward(self, code: Symbol, request: Symbol, **kwargs) -> Symbol:
97 | code = str(code)
98 | # Set value that gets passed on to the 'run' function in the generated code
99 | value = request.value # do not remove this line
100 | # Create the 'run' function
101 | self._runnable = self.executor(code, locals=locals().copy(), globals=globals().copy())
102 | result = self._runnable['locals']['run'](value)
103 | retry = 0
104 | # Retry if there is a 'Traceback' in the result
105 | while 'Traceback' in result and retry < self.max_retries:
106 | self._runnable = self.executor(code, payload=result, locals=locals().copy(), globals=globals().copy(), **kwargs)
107 | result = self._runnable['locals']['run'](value)
108 | retry += 1
109 | if 'locals_res' in self._runnable:
110 | result = self._runnable['locals_res']
111 | return result
112 |
113 |
114 |
115 |
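116 | # Illustrative usage sketch (assumption; requires a configured neurosymbolic
117 | # engine, and the request string is a hypothetical example):
118 | if __name__ == '__main__':
119 |     builder = APIBuilder()
120 |     executor = StackTraceRetryExecutor(retries=2)
121 |     request = Symbol("Fetch the page title of https://example.com")
122 |     code = builder(request)           # generate the self-contained `run` function
123 |     result = executor(code, request)  # execute it, retrying on stack traces
124 |     print(result)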
--------------------------------------------------------------------------------
/src/evals/components/factorization.py:
--------------------------------------------------------------------------------
1 | from symai import Function
2 |
3 |
4 | FACTORIZATION_CONTEXT = """[Context]
5 | Compute the factorization of expression, ``f``, into irreducibles. (To
6 | factor an integer into primes, use ``factorint``.)
7 |
8 | There are two modes implemented: symbolic and formal. If ``f`` is not an
9 | instance of :class:`Poly` and generators are not specified, then the
10 | former mode is used. Otherwise, the formal mode is used.
11 |
12 | In symbolic mode, :func:`factor` will traverse the expression tree and
13 | factor its components without any prior expansion, unless an instance
14 | of :class:`~.Add` is encountered (in this case formal factorization is
15 | used). This way :func:`factor` can handle large or symbolic exponents.
16 |
17 | By default, the factorization is computed over the rationals. To factor
18 | over another domain, e.g. an algebraic or finite field, use appropriate
19 | options: ``extension``, ``modulus`` or ``domain``.
20 | """
21 |
22 |
23 | class Factorization(Function):
24 | @property
25 | def static_context(self):
26 | return FACTORIZATION_CONTEXT
27 |
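28 | # Illustrative usage sketch (assumption; requires a configured neurosymbolic
29 | # engine, and the prompt and expression are hypothetical examples):
30 | if __name__ == '__main__':
31 |     fact = Factorization('Factorize the given expression into irreducibles:')
32 |     print(fact('x**2 - y**2'))  # expected: (x - y)*(x + y)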
--------------------------------------------------------------------------------
/src/evals/components/paper.py:
--------------------------------------------------------------------------------
1 | from symai import Function
2 | from symai.components import Sequence, Parallel
3 | from symai.extended import Conversation
4 | from symai.post_processors import StripPostProcessor, CodeExtractPostProcessor
5 |
6 |
7 | SYMBOLIC_AI_PAPER = """Write a scientific paper about the machine learning framework called SymbolicAI which operates on the following principles:
8 | - Symbolic methods
9 | - Sub-symbolic methods
10 | - Neural-symbolic methods
11 | - Probabilistic programming methods
12 | - Cognitive architectures
13 | Be precise in your writing and follow a scientific style. Do not use any colloquial language. However, formulate simple and understandable sentences."""
14 |
15 |
16 | PAPER_STATIC_CONTEXT = """[General Context]
17 | {context}
18 |
19 | [Format]
20 | Your output format should be parsable by a LaTeX compiler. All produced content should be enclosed between the \n```latex\n ... \n``` blocks. Do not create document classes or other LaTeX meta commands. Always assume that the document class is already defined. Only produce exactly one latex block with all your content.
21 | Only use either `section`, `subsection`, `paragraph`, `texttt`, `textbf`, `emph` or `citep` commands to structure your content. Do not use any other LaTeX commands.
22 | The following is an example of your expected output:
23 |
24 | [Example]
25 | ```latex
26 | \\documentclass{{article}}
27 | \\begin{{document}}
28 | % TODO: your content here
29 | \\end{{document}}
30 | ```
31 |
32 | {description}
33 | """
34 |
35 |
36 | class Paper(Function):
37 | def __init__(self, *sequence, context: str = SYMBOLIC_AI_PAPER, **kwargs):
38 | super().__init__(**kwargs)
39 | self.sequence = Sequence(*sequence)
40 | self.context = context
41 |
42 | def forward(self, task, **kwargs):
43 | # execute the sequence of tasks
44 | res = self.sequence(task, **kwargs)
45 | # access results from the global root node metadata
46 | results = self.linker.results
47 | # return the reversed results
48 | reverse_res = str(list(reversed(list(results.values()))))
49 | # create the final task by concatenating the results
50 | return super().forward(task | reverse_res | res, **kwargs)
51 |
52 | @property
53 | def static_context(self):
54 | return PAPER_STATIC_CONTEXT.format(context=self.context, description='The final paper must include a title, an abstract, a related work section, and a method section.')
55 |
56 |
57 | class Context(Conversation):
58 | def __init__(self, context: str = SYMBOLIC_AI_PAPER, **kwargs):
59 | super().__init__(**kwargs)
60 | self.auto_print = False
61 | self.prompt = 'Replace the % TODO: with your content and follow the task description below.'
62 | self.context = context
63 |
64 | def forward(self, task, *args, **kwargs):
65 | function = Function(self.prompt,
66 | post_processors=[StripPostProcessor(), CodeExtractPostProcessor()],
67 | static_context=self.static_context,
68 | dynamic_context=self.dynamic_context)
69 | return function(f"{task}\n[Source]\n{self.history()}", *args, **kwargs)
70 |
71 | @property
72 | def description(self):
73 | raise NotImplementedError()
74 |
75 | @property
76 | def static_context(self):
77 | return PAPER_STATIC_CONTEXT.format(context=self.context, description=self.description)
78 |
79 |
80 | class Source(Context):
81 | @property
82 | def description(self):
83 | return """[Task]
84 | Summarize the referenced method to use it as a conditioning context for a large language model like GPT-3.
85 | Do not create any sections or subsections. Only write one coherent text about the main principles and concepts of the method.
86 | """
87 |
88 | class Method(Context):
89 | def __init__(self, source, **kwargs):
90 | super().__init__(**kwargs)
91 | self.source = source
92 |
93 | def forward(self, task, **kwargs):
94 | summary = self.source(task, **kwargs)
95 | # update the dynamic context globally for all types
96 | self.adapt(context=summary, types=[RelatedWork, Abstract, Title, Introduction, Cite])
97 | return super().forward(task | summary, **kwargs)
98 |
99 | @property
100 | def description(self):
101 | return """[Task]
102 | Your goal is to write the method section which describes the main approach and principles used. Add one methodology section with one consistent paragraph. Provide citations and references.
103 | """
104 |
105 |
106 | class Cite(Source):
107 | @property
108 | def description(self):
109 | return """[Task]
110 | Write a short two sentence related work summary in the context of the paper. Do not add any sections or subsections.
111 | """
112 |
113 |
114 | class RelatedWork(Context):
115 | def __init__(self, *citations, **kwargs):
116 | super().__init__(**kwargs)
117 | self.citations = Parallel(*citations, sequential=True) # to avoid API rate limits process parallel citations sequentially
118 |
119 | def forward(self, task, **kwargs):
120 | # execute the parallel tasks
121 | res = self.citations(task, **kwargs)
122 | return super().forward(res, **kwargs)
123 |
124 | @property
125 | def description(self):
126 | return """[Task]
127 | Write a coherent related work section in the context of the paper and based on the provided citation sources. Add one related work section with one consistent paragraph. Provide citations and references.
128 | """
129 |
130 |
131 | class Introduction(Context):
132 | def __init__(self, *citations, **kwargs):
133 | super().__init__(**kwargs)
134 | self.citations = Parallel(*citations, sequential=True)
135 |
136 | def forward(self, task, **kwargs):
137 | # execute the parallel tasks
138 | res = self.citations(task, **kwargs)
139 | return super().forward(res, **kwargs)
140 |
141 | @property
142 | def description(self):
143 | return """[Task]
144 | Write a coherent introduction section in the context of the paper and based on the provided context. Add one introduction section with one consistent paragraph. Provide citations and references.
145 | """
146 |
147 |
148 | class Abstract(Context):
149 | @property
150 | def description(self):
151 | return """[Task]
152 | Write the paper abstract given the provided context. Add one abstract section with one consistent paragraph.
153 | """
154 |
155 |
156 | class Title(Context):
157 | @property
158 | def description(self):
159 | return """[Task]
160 | Write the paper title given the provided context. Add one title tag for the document.
161 | """
162 |
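163 | # Illustrative composition sketch (assumption; the file paths mirror the
164 | # snippets/paper layout and the task string is a hypothetical example):
165 | if __name__ == '__main__':
166 |     method = Method(Source(file_link='src/evals/snippets/paper/method/symbolicai_docs.txt'))
167 |     related = RelatedWork(Cite(file_link='src/evals/snippets/paper/bib/related_work/newell56.txt'))
168 |     paper = Paper(related, method, Abstract(), Title())
169 |     print(paper('Write the SymbolicAI paper.'))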
--------------------------------------------------------------------------------
/src/evals/eval_in_context_associations.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from symai import Symbol, Expression
4 | from symai.utils import toggle_test
5 |
6 | from src.utils import MOCK_RETURN, RANDOMNESS, bool_success, normalize
7 |
8 |
9 | ACTIVE = True
10 |
11 |
12 | # Define basic test functions
13 | @toggle_test(ACTIVE, default=MOCK_RETURN)
14 | def test_basic_factual_prompt(aggregate):
15 | '''Sanity check that the basic prompt works'''
16 | sym = Expression.prompt('''[Last Instruction]
17 | Return only a number as an answer.
18 | [Last Query]
19 | Give the meaning of life a number, meaning that the answer to life, the universe and everything is:
20 | [Answer]''')
21 | # sanity check if models are working
22 | # every model must pass this basic test
23 | res = ('42' in str(sym)) | aggregate.res # collect the result value
24 | return res, bool_success(res)
25 |
26 |
27 | @toggle_test(ACTIVE, default=MOCK_RETURN)
28 | def test_basic_factual_prompt_pi(aggregate):
29 | '''Sanity check that the basic prompt works'''
30 | sym = Expression.prompt('''[Last Instruction]
31 | Return only a number as an answer.
32 | [Last Query]
33 | Write the number of Pi up to the 10th digit after the comma:
34 | [Last Answer]''') | aggregate.sym # collect the symbol value
35 | # sanity check if models are working
36 | # every model must pass this basic test
37 | base = Symbol('3.1415926535') | aggregate.base # collect the base value
38 | score = sym.measure(base) | aggregate.score # collect the score
39 | return True, {'scores': [score.value]}
40 |
41 |
42 | # Define the test functions based on in-context learning associations and compositions
43 | @toggle_test(ACTIVE, default=MOCK_RETURN)
44 | def test_add_and_equals(aggregate):
45 | '''Test if the addition operator between two number symbols works'''
46 | try:
47 | sym = (Symbol(1) + Symbol(2)).int()
48 | except:
49 | sym = 0 # default value for failure
50 | res = (sym == 3) | aggregate.res # collect the result value
51 | return res, bool_success(res)
52 |
53 |
54 | @toggle_test(ACTIVE, default=MOCK_RETURN)
55 | def test_add_and_equals_2(aggregate):
56 | '''Test if the addition operator between a number symbol and linguistic number symbol works'''
57 | # auto cast to Symbol
58 | try:
59 | sym = (Symbol(17) + 'two').int()
60 | except:
61 | sym = 0 # default value for failure
62 | res = (sym == 19) | aggregate.res # collect the result value
63 | return res, bool_success(res)
64 |
65 |
66 | @toggle_test(ACTIVE, default=MOCK_RETURN)
67 | def test_add_and_equals_3(aggregate):
68 | '''Test if the addition operator between a large number symbol and linguistic number symbol works'''
69 | # auto cast to Symbol
70 | try:
71 | sym = ('two hundred and thirty four' + Symbol(7000)).int()
72 | except:
73 | sym = 0 # default value for failure
74 | res = (sym == 7234) | aggregate.res # collect the result value
75 | return res, bool_success(res)
76 |
77 |
78 | @toggle_test(ACTIVE, default=MOCK_RETURN)
79 | def test_check_pi(aggregate):
80 | '''Test if a fuzzy equality between a pi string symbol and a number approximation symbol works'''
81 | # semantic understanding of pi
82 | sym = Symbol('pi') | aggregate.sym # collect the symbol value
83 | # test if pi is equal to 3.14159265... by approximating
84 | res = (sym == '3.14159265...') | aggregate.res # collect the result value
85 | return res, bool_success(res)
86 |
87 |
88 | @toggle_test(ACTIVE, default=MOCK_RETURN)
89 | def test_check_pi_2(aggregate):
90 | '''Test if a fuzzy equality between the np.pi number symbol and a number approximation symbol works'''
91 | # has high floating point precision
92 | sym = Symbol(np.pi) | aggregate.sym # collect the symbol value
93 | # test if pi is equal to 3.14159265... by approximating
94 | res = (sym == '3.14159265...') | aggregate.res # collect the result value
95 | return res, bool_success(res)
96 |
97 |
98 | @toggle_test(ACTIVE, default=MOCK_RETURN)
99 | def test_sub_and_contains(aggregate):
100 | '''Test if a semantic subtraction operator between two symbols works'''
101 | # semantic understanding of subtraction
102 | base = 'Hello, I would like a cup of coffee.' | aggregate.base # collect the base value
103 | res = ((Symbol('Hello, I would like a cup of tea.') - Symbol('tea')) + 'coffee') | aggregate.res # collect the result value
104 | rand = Symbol(RANDOMNESS).mean().measure(base) | aggregate.rand # collect the random value
105 | # @NOTE: special case, where we expect the exact solution
106 | score = res.measure(base, normalize=normalize(1.0, rand)) | aggregate.score # collect the score
107 | return True, {'scores': [score.value]}
108 |
109 |
110 | @toggle_test(ACTIVE, default=MOCK_RETURN)
111 | def test_compare(aggregate):
112 | '''Test if a comparison operator between two number symbols works'''
113 | res = (Symbol(10) > Symbol('5'))
114 | # @NOTE: Bernoulli trial
115 | res = (res == True) | aggregate.res # collect the result value
116 | return res, bool_success(res)
117 |
118 |
119 | @toggle_test(ACTIVE, default=MOCK_RETURN)
120 | def test_compare_2(aggregate):
121 | '''Test if a semantic comparison operator between two symbols works'''
122 | res = Symbol(10) < Symbol('fifteen thousand')
123 | # @NOTE: Bernoulli trial
124 | res = (res == True) | aggregate.res # collect the result value
125 | return res, bool_success(res)
126 |
127 |
128 | @toggle_test(ACTIVE, default=MOCK_RETURN)
129 | def test_insert_rshift(aggregate):
130 | '''Test if information can be inserted into a symbol using the RSHIFT operator'''
131 | base = 'I love to eat apples and bananas' | aggregate.base # collect the base value
132 | sym = Symbol('I love to eat apples') | aggregate.sym # collect the symbol value
133 | res = ('and bananas' >> sym) | aggregate.res # collect the result value
134 | # expect exact solution
135 | rand = Symbol(RANDOMNESS).mean().measure(base) | aggregate.rand # collect the random value
136 | # @NOTE: special case, where we expect the exact solution
137 | score = res.measure(base, normalize=normalize(1.0, rand)) | aggregate.score # collect the score
138 | return True, {'scores': [score.value]}
139 |
140 |
141 | @toggle_test(ACTIVE, default=MOCK_RETURN)
142 | def test_extract_information(aggregate):
143 | '''Test if information can be extracted from a symbol using the EXTRACT operator'''
144 | sym = Symbol('I have an iPhone from Apple. And it is not cheap. ' + \
145 | 'I love to eat bananas, mangos, and oranges. ' + \
146 | 'My hobbies are playing football and basketball.') | aggregate.sym # collect the symbol value
147 | res = sym.extract('fruits')
148 | res = str(res).lower().strip() | aggregate.res # collect the result value
149 | cnt = 0
150 | succ = True
151 | # check if the EXTRACT operator retains the 3 essential words
152 | succ &= 'bananas' in res
153 | # @NOTE: Bernoulli trials
154 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
155 | succ &= 'mangos' in res
156 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
157 | succ &= 'oranges' in res
158 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
159 | return succ, {'scores': [cnt/3.0]}
160 |
161 |
162 | @toggle_test(ACTIVE, default=MOCK_RETURN)
163 | def test_extract_contextual_information(aggregate):
164 | '''Test if number information can be extracted from a symbol using the EXTRACT operator'''
165 | sym = Symbol("""Exception: Failed to query GPT-3 after 3 retries. Errors: [InvalidRequestError(message="This model's maximum context length is 4097 tokens, however you requested 7410 tokens (2988 in your prompt; 4422 for the completion). Please reduce your prompt; or completion length.",
166 | param=None, code=None, http_status=400, request_id=None)]""") | aggregate.sym # collect the symbol value
167 | try:
168 | res = sym.extract('requested tokens').int() # cast to int
169 |     except Exception:  # extraction or the int cast may fail
170 | res = 0 # default value
171 | # check if the EXTRACT operator gets the correct number of tokens
172 | res = (res == 7410) | aggregate.res # collect the result value
173 | return res, bool_success(res)
174 |
175 |
176 | @toggle_test(ACTIVE, default=MOCK_RETURN)
177 | def test_filter(aggregate):
178 | '''Test if filtering information can be applied to a symbol using the FILTER operator'''
179 | sym = Symbol('Physics, Sports, Mathematics, Music, Art, Theater, Writing') | aggregate.sym # collect the symbol value
180 | res = sym.filter('science related subjects')
181 | res = str(res).lower().strip() | aggregate.res # collect the result value
182 | cnt = 0
183 | succ = True
184 | # check if the FILTER operator retains the essential words
185 | # @NOTE: Bernoulli trials
186 | succ &= 'physics' in res
187 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
188 | succ &= 'mathematics' in res
189 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
190 | succ &= 'music' not in res
191 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
192 | succ &= 'art' not in res
193 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
194 | succ &= 'theater' not in res
195 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
196 | succ &= 'writing' not in res
197 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
198 | succ &= 'sports' not in res
199 | cnt += (1 if succ else 0) | aggregate.cnt # collect the result value
200 | return succ, {'scores': [cnt/7.0]}
201 |
202 |
203 | @toggle_test(ACTIVE, default=MOCK_RETURN)
204 | def test_clean(aggregate):
205 | '''Test if cleaning information can be applied to a symbol using the CLEAN operator'''
206 | base = 'Hello World' | aggregate.base # collect the base value
207 | sym = Symbol('Hello *&&7amp;;; \t\t\t\nWorld') | aggregate.sym # collect the symbol value
208 | res = sym.clean() | aggregate.res # collect the result value
209 | # check if the CLEAN operator retains the 2 essential words
210 | # expect exact solution
211 | rand = Symbol(RANDOMNESS).mean().measure(base) | aggregate.rand # collect the random value
212 | # @NOTE: special case, where we expect the exact solution
213 | score = res.measure(base, normalize=normalize(1.0, rand)) | aggregate.score # collect the score
214 | return True, {'scores': [score.value]}
215 |
--------------------------------------------------------------------------------
/src/evals/eval_multimodal_bindings.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from src.utils import normalize, RANDOMNESS, MOCK_RETURN
4 |
5 | from symai import core_ext, Symbol, Expression, Interface, Function
6 | from symai.utils import toggle_test
7 |
8 |
9 | ACTIVE = True
10 |
11 |
12 | OPTION0_BASE_REF = ['Mathematics related topic',
13 | 'MATHEMATICS RELATED TOPIC',
14 | 'mathematics and related topics']
15 | OPTION1_BASE_REF = ['Website Content Scraping and Crawling',
16 | 'web content scraping and crawling',
17 | 'WEBSITE CONTENT RELATED TOPICS']
18 | OPTION2_BASE_REF = ['Search Engine Query',
19 | 'search engine query',
20 | 'SEARCH ENGINE QUERY']
21 | OPTION3_BASE_REF = ['Optical Character Recognition',
22 | 'optical character recognition',
23 | 'OPTICAL CHARACTER RECOGNITION']
24 | OPTION_REFS = [OPTION0_BASE_REF, OPTION1_BASE_REF, OPTION2_BASE_REF, OPTION3_BASE_REF]
25 |
26 |
27 | class Category(Expression):
28 | def __init__(self, **kwargs):
29 | super().__init__(**kwargs)
30 | self.options = {
31 | 0: 'mathematics related topic',
32 | 1: 'website content scraping and crawling',
33 | 2: 'search engine query',
34 | 3: 'optical character recognition',
35 | 4: 'image rendering',
36 | 5: 'image captioning',
37 | 6: 'audio transcription',
38 | 7: 'unknown'
39 | }
40 |
41 | def forward(self):
42 | @core_ext.cache(in_memory=True)
43 | def _embed(_):
44 | def _emb_mapping_(category):
45 | sym = Symbol(category)
46 | return sym.embed()
47 | emb = map(_emb_mapping_, self.options.values())
48 | return list(emb)
49 | return _embed(self)
50 |
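# A minimal usage sketch (hypothetical, for illustration): the in-memory cache
# means the option labels are embedded only once and reused across calls.
#   category = Category()
#   embeddings = category()  # one embedding Symbol per option label
#   assert len(embeddings) == len(category.options)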
51 |
52 | LINEAR_ALGEBRA = 'linear algebra'
53 | NUMBER_COMPARISON = 'number comparison'
54 |
55 |
56 | class MultiModalExpression(Expression):
57 | def __init__(self, val, **kwargs):
58 | super().__init__(val, **kwargs)
59 | # define interfaces
60 | self.solver = Interface('wolframalpha')
61 | self.crawler = Interface('selenium')
62 | self.search = Interface('serpapi')
63 | self.ocr = Interface('ocr')
64 | self.rendering = Interface('dall_e')
65 | self.captioning = Interface('llava')
66 | self.transcribe = Interface('whisper')
67 | # evaluation interfaces
68 | self.clip = Interface('clip')
69 | # define functions
70 | self.func = Function("Summarize the content:")
71 | self.category = Category()
72 |
73 | def detect_option(self, aggregate, assertion):
74 | option = assertion() | aggregate.category.option
75 | # testing the category detection accuracy
76 | category = self.choice(self.category.options.values(), default='unknown', temperature=0.0) | aggregate.category.category
77 | base = Symbol(OPTION_REFS[option])
78 | base_mean = base.mean(axis=0) | aggregate.category.base_mean
79 | base_score = base.cvs() | aggregate.category.base_score
80 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.category.rand_mean
81 | rand_score = base_mean.measure(rand_seq) | aggregate.category.rand_score
82 | score = category.measure(self.category.options[option],
83 | normalize=normalize(base_score, rand_score)) | aggregate.category.score
84 | return option, score.value
85 |
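    # (illustrative note) detect_option normalizes the category score: the
    # paraphrases in OPTION_REFS provide the upper similarity anchor via
    # base.cvs(), a random sequence provides the lower anchor, and
    # normalize(base_score, rand_score) rescales the measure into [0, 1].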
86 | def forward(self, aggregate, assertion, presets, **kwargs):
87 | res = None
88 | scoring = []
89 | success = False
90 | # detect the type of expression
91 | option, score = self.detect_option(aggregate, assertion)
92 | scoring.append(score)
93 |
94 | # mathematical formula
95 | if option == 0:
96 | ref_formula, instance_type, details = presets()
97 | ref_formula = Symbol(ref_formula) | aggregate.ref_formula
98 | formula = self.extract('mathematical formula') | aggregate.formula
99 | score = ref_formula.measure(formula) | aggregate.formula_score
100 | scoring.append(score.value)
101 | # subtypes of mathematical formula
102 | if self.isinstanceof(LINEAR_ALGEBRA, temperature=0.0):
103 | score = (1.0 if str(instance_type) == LINEAR_ALGEBRA else 0.0) | aggregate.linear_function.score
104 | scoring.append(score)
105 | if score == 0.0: # avoid error when in wrong category
106 | # no score for other types of mathematical formula
107 | score = 0.0 | aggregate.linear_function.answer_score
108 | scoring.append(score)
109 | return success, scoring
110 | answer, solutions = details
111 | answer = Symbol(answer) | aggregate.linear_function.answer
112 | # prepare for wolframalpha
113 | res = self.solver(formula)
114 |                 res = res.query('write a one sentence summary of the answer') | aggregate.linear_function.res
115 |                 rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.linear_function.rand_mean
116 |                 sol_mean = solutions.mean(axis=0) | aggregate.linear_function.solutions_mean
117 |                 base_score = solutions.cvs() | aggregate.linear_function.base_score
118 |                 rand_score = answer.measure(rand_seq) | aggregate.linear_function.rand_score
119 |                 score = answer.measure(sol_mean, normalize=normalize(base_score, rand_score)) | aggregate.linear_function.answer_score
120 | scoring.append(score.value)
121 | success = True
122 |
123 | elif self.isinstanceof(NUMBER_COMPARISON, temperature=0.0):
124 | score = (1.0 if str(instance_type) == NUMBER_COMPARISON else 0.0) | aggregate.number_comparison.score
125 | scoring.append(score)
126 | if score == 0.0: # avoid error when in wrong category
127 | # no score for other types of mathematical formula
128 | score = 0.0 | aggregate.number_comparison.answer_score
129 | scoring.append(score)
130 | return success, scoring
131 | answer = details | aggregate.number_comparison.answer
132 | res = self.solver(formula) # send directly to wolframalpha
133 | score = (1.0 if res == answer else 0.0) | aggregate.number_comparison.answer_score
134 | scoring.append(score)
135 | success = True
136 |
137 | else:
138 | # no score for other types of mathematical formula
139 | score = 0.0 | aggregate.unknown.score
140 | scoring.append(score)
141 | success = False
142 |
143 | # website content scraping and crawling
144 | elif option == 1:
145 | ori_url, page, content_sym, base_score, rand_score = presets()
146 | ori_url_sym = Symbol(ori_url) | aggregate.website_scraping.ori_url
147 | url = self.extract('url') | aggregate.website_scraping.gen_url
148 | score = ori_url_sym.measure(url) | aggregate.website_scraping.score
149 | scoring.append(score.value)
150 | res = self.func(page) | aggregate.website_scraping.res
151 | # normalize the score towards the original content
152 | score = content_sym.measure(res, normalize=normalize(base_score, rand_score)) | aggregate.website_scraping.score
153 | scoring.append(score.value)
154 | success = True
155 |
156 | # search engine query
157 | elif option == 2:
158 | answer = presets() | aggregate.search_engine.answer
159 |
160 | if kwargs.get('real_time'):
161 | res = self.search(self.value)
162 | res = res.raw.organic_results.to_list()
163 | else:
164 | snippet_path = Path(__file__).parent / "snippets" / "google_organic_results_20240111_query=What-is-sulfuric-acid.txt"
165 |             res = snippet_path.read_text()  # read_text() also closes the file handle
166 |
167 | res = Symbol(res) | aggregate.search_engine.res
168 | res = res.extract("The answer based on the CDC source.")
169 | score = res.measure(answer) | aggregate.search_engine.score
170 | scoring.append(score.value)
171 | success = True
172 |
173 | # optical character recognition
174 | elif option == 3:
175 | answer = presets() | aggregate.ocr_engine.answer
176 | if kwargs.get('real_time'):
177 | res = self.ocr((Path(__file__).parent / "assets" / "sample_bill.jpg").as_posix())
178 | else:
179 | snippet_path = Path(__file__).parent / "snippets" / "sample_bill.txt"
180 |             res = snippet_path.read_text()  # read_text() also closes the file handle
181 | res = Symbol(res)
182 |
183 | res = res.extract(self.value) | aggregate.ocr_engine.res
184 | score = res.measure(answer) | aggregate.ocr_engine.score
185 | scoring.append(score.value)
186 | success = True
187 |
188 |         # Other modalities we could evaluate and include in the score in the future; they exceed the scope of this benchmark.
189 | # image rendering
190 | # elif option == 4:
191 | # query = self.extract('image url')
192 | # res = self.rendering(query)
193 |
194 | # image captioning
195 | # elif option == 5:
196 | # image = self.extract('image path')
197 | # res = self.captioning(image)
198 |
199 | # audio transcription
200 | # elif option == 6:
201 | # audio = self.extract('audio path')
202 | # res = self.transcribe(audio)
203 |
204 | else:
205 | score = 0.0 | aggregate.unknown.score
206 | scoring.append(score)
207 | success = False
208 |
209 | return success, scoring
210 |
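# A minimal driver sketch (values illustrative): each test below calls the
# expression with two callables: `assertion` returns the expected option index
# and `presets` returns the branch-specific reference data.
#   expr = MultiModalExpression('What is sulfuric acid?')
#   success, scores = expr(aggregate, lambda: 2, lambda: answer, real_time=False)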
211 |
212 | @toggle_test(ACTIVE, default=MOCK_RETURN)
213 | def test_website_scraping(aggregate):
214 | # scraped content
215 | content = """ChatGPT back online after ‘major outage,’ OpenAI says
216 | PUBLISHED THU, DEC 14 2023 1:58 AM EST
217 |
218 | KEY POINTS
219 | OpenAI on Thursday said that a major outage on its artificial intelligence chatbot ChatGPT was resolved.
220 | ChatGPT had issues for around 40 minutes, during which service was “intermittently unavailable.”
221 | OpenAI did not give an explanation on what caused the latest issues.
222 |
223 | OpenAI on Thursday said that a major outage on its artificial intelligence chatbot, ChatGPT, was resolved.
224 |
225 | ChatGPT had issues for around 40 minutes, during which the service was “intermittently unavailable.”
226 |
227 | OpenAI also said that some users of ChatGPT Enterprise, which is designed for businesses, were encountering “elevated error rates.”
228 |
229 | Earlier this month, ChatGPT suffered another issue, where the company said around 10% of users may have been unable to send a message to ChatGPT. The AI technology had another major outage in November.
230 |
231 | OpenAI did not give an explanation on what caused the latest issues.
232 |
233 | ChatGPT broke records as the fastest-growing consumer app in history and now has about 100 million weekly active users, while more than 92% of Fortune 500 companies employ the platform, according to OpenAI.
234 |
235 | The Microsoft
236 | -backed company has had a rocky time of late, as the board fired CEO Sam Altman in November, only for him to be reinstated days later after pressure from employees and investors.
237 |
238 | — CNBC’s Hayden Field contributed to this article."""
239 | summary = """OpenAI reported that a significant outage affecting its AI chatbot, ChatGPT, was resolved following a 40-minute disruption that left the service intermittently unavailable. It was noted that users of the ChatGPT Enterprise experienced elevated error rates as well. Earlier in the month and in November, ChatGPT had faced other service issues. OpenAI did not disclose the cause of the recent outage. ChatGPT has become immensely popular, touted as the fastest-growing consumer app ever, with approximately 100 million weekly active users and adoption by many top companies. Despite its success, OpenAI, supported by Microsoft, has experienced some turbulence, including the brief dismissal and subsequent reinstatement of CEO Sam Altman."""
240 | url = "https://www.cnbc.com/2023/12/14/chatgpt-back-online-after-major-outage-openai-says.html"
241 | val = f"crawl the news site from {url}"
242 | expr = MultiModalExpression(val)
243 | content_sym = Symbol(content) | aggregate.content
244 | summary_sym = Symbol(summary) | aggregate.summary
245 | base_score = content_sym.measure(summary_sym) | aggregate.content_score
246 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.rand_seq
247 | rand_score = content_sym.measure(rand_seq) | aggregate.rand_score
248 | succ, scoring = expr(
249 | aggregate,
250 | lambda: 1,
251 | lambda: (url, content, content_sym, base_score, rand_score)
252 | )
253 | return succ, {'scores': scoring}
254 |
255 |
256 | @toggle_test(ACTIVE, default=MOCK_RETURN)
257 | def test_search_engine(aggregate):
258 | query = "What is sulfuric acid?"
259 |     # Let's test whether it can extract the answer based on the CDC source.
260 | answer = Symbol("Sulfuric acid (H2S04) is a corrosive substance, destructive to the skin, eyes, teeth, and lungs. Severe exposure can result in death.")
261 | expr = MultiModalExpression(query)
262 | succ, scoring = expr(
263 | aggregate,
264 | lambda: 2,
265 | lambda: answer,
266 | real_time=False
267 | )
268 |
269 | return succ, {'scores': scoring}
270 |
271 |
272 | @toggle_test(ACTIVE, default=MOCK_RETURN)
273 | def test_linear_function_computation(aggregate):
274 |     query = Symbol('Analyse the following vectors and assess if (2, -11, 2) and (14, 2, 2) are linearly dependent?')
275 | ref = Symbol("(2, -11, 2) and (14, 2, 2) are linearly independent.")
276 | solutions = Symbol([
277 | "(2, -11, 2) and (14, 2, 2) are actually linearly independent.",
278 | "No, the vectors (2, -11, 2) and (14, 2, 2) demonstrate linear independence.",
279 | "The vectors (2, -11, 2) and (14, 2, 2) are not linearly dependent."
280 | ])
281 | expr = MultiModalExpression(query)
282 | succ, scoring = expr(
283 | aggregate,
284 | lambda: 0,
285 | lambda: ('(2, -11, 2) and (14, 2, 2) are linearly independent?', Symbol(LINEAR_ALGEBRA), (ref, solutions))
286 | )
287 |
288 | return succ, {'scores': scoring}
289 |
290 |
291 | @toggle_test(ACTIVE, default=MOCK_RETURN)
292 | def test_comparison(aggregate):
293 | val = Symbol("is 100044347 bigger than 129981063.472?")
294 | expr = MultiModalExpression(val)
295 | succ, res = expr(
296 | aggregate,
297 | lambda: 0,
298 | lambda: ('100044347 > 129981063.472', Symbol(NUMBER_COMPARISON), False)
299 | )
300 | return succ, {'scores': res}
301 |
302 |
303 | @toggle_test(ACTIVE, default=MOCK_RETURN)
304 | def test_ocr_engine(aggregate):
305 | query = "Extract the current balance from the bill image."
306 | answer = Symbol("$ 21,920.37")
307 | expr = MultiModalExpression(query)
308 | succ, scoring = expr(
309 | aggregate,
310 | lambda: 3,
311 | lambda: answer,
312 | real_time=False
313 | )
314 | return succ, {'scores': scoring}
315 |
316 |
--------------------------------------------------------------------------------
/src/evals/eval_program_synthesis.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from src.utils import normalize, rand_ast_measure, ast_measure, RANDOMNESS, MOCK_RETURN
4 |
5 | from symai import Symbol, Expression, Conversation, Call
6 | from symai.components import FileReader, Execute, RuntimeExpression, ExpressionBuilder
7 | from symai.processor import ProcessorPipeline
8 | from symai.post_processors import StripPostProcessor, CodeExtractPostProcessor
9 | from symai.utils import toggle_test
10 | from symai.extended.api_builder import APIBuilder, StackTraceRetryExecutor
11 |
12 |
13 | ACTIVE = True
14 | cur_file_dir = os.path.dirname(os.path.abspath(__file__))
15 |
16 |
17 | @toggle_test(ACTIVE, default=MOCK_RETURN)
18 | def test_application_template(aggregate):
19 | task = """[Task]
20 | Create a function `create_latex_result` that takes in the `benchmark_results` as `data` and parses the LaTeX table rows and columns based on the `data` results. The table should follow the `latex_template` format and populate the table rows as indicated by the placeholder variables. Mark the best-performing model per row with bold text. At the bottom of the table, fill the total row by computing the average over all columns and populating the `total_values` entry in the `latex_template`.
21 | The table should be returned as a string by the function.
22 | All required imports are already provided. The code of the `create_latex_result` function should be written inside a
23 | ```python
24 | ...
25 | ```
26 | code block.
27 | The `create_latex_result` function must be self-contained, fully functional, and pass all tests.
28 | No other functions or explanations are required.
29 | """
30 | # Define random sequence to normalize data
31 | random_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.random_seq
32 | # Create a template
33 | template = os.path.join(cur_file_dir, 'snippets/latex_templating_problem.txt')
34 | conv = Conversation(file_link=[template], auto_print=False)
35 | raw_res = conv(task) | aggregate.gen_raw_res
36 | scoring = []
37 | processors = ProcessorPipeline([StripPostProcessor(), CodeExtractPostProcessor()])
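    # (illustrative) StripPostProcessor trims surrounding whitespace and
    # CodeExtractPostProcessor extracts the fenced code block from the raw
    # model response, so only the generated function is scored and executed.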
38 | code = Symbol(processors(str(raw_res), None)) | aggregate.gen_code
39 | reader = FileReader()
40 | solution1 = reader(os.path.join(cur_file_dir, 'snippets/latex_templating_solution_1.txt')) | aggregate.solution1
41 | solution2 = reader(os.path.join(cur_file_dir, 'snippets/latex_templating_solution_2.txt')) | aggregate.solution2
42 | solutions = Symbol([solution1, solution2]).mean(axis=0) | aggregate.solutions
43 | base_score = solution1.measure(solution2) | aggregate.conv_base_score
44 | # remove the chance of simply rephrasing the task description
45 | rand_score = solutions.measure(random_seq) | aggregate.conv_rand_score
46 | score = solutions.measure(raw_res, normalize=normalize(base_score, rand_score)) | aggregate.conv_score
47 | scoring.append(score.value)
48 |
49 | # Read the source code from files
50 | solution1 = Symbol(solution1, callables=[Call('measure', ast_measure)])
51 | # compute again normalization score but this time for AST measure
52 | base_score = solution1.measure(solution2) | aggregate.ast_base_score
53 | rand_score = (0.5*(rand_ast_measure(solution1) + rand_ast_measure(solution2))) | aggregate.ast_rand_score
54 | score = solution1.measure(code, normalize=normalize(base_score, rand_score)) | aggregate.ast_score
55 | scoring.append(score.value)
56 |
57 | # Execute the code
58 | code = reader(template).str().replace('{TODO}', str(code))
59 | runner = Execute(enclosure=True)
60 | success = False
61 | try:
62 | res = runner(code)
63 | # extract the output from the locals
64 | out = Symbol(res['locals']['_output_']) | aggregate.code_output
65 | ori = reader(os.path.join(cur_file_dir, 'snippets/latex_templating_output.txt')) | aggregate.code_solution
66 | # no normalization is needed here since the output has to be an exact match
67 | score = out.measure(ori) | aggregate.code_score
68 | scoring.append(score.value)
69 | success = True
70 | except Exception as e:
71 | score = 0.0 | aggregate.code_score
72 | scoring.append(score)
73 |
74 | return success, {'scores': scoring}
75 |
76 |
77 | class APIExecutor(Expression):
78 | def __init__(self, verbose=False, **kwargs):
79 | super().__init__(**kwargs)
80 | self.builder = APIBuilder()
81 | self.executor = StackTraceRetryExecutor(retries=0) # disable retries
82 | self._verbose = verbose
83 | self._request = None
84 | self._code = None
85 | self._result = None
86 |
87 | @property
88 | def _runnable(self):
89 | return self.executor._runnable
90 |
91 | def forward(self, aggregate, request: Symbol, presets, **kwargs) -> Symbol:
92 | answer, refs, code, code2, rand = presets()
93 | self._request = self._to_symbol(request)
94 | if self._verbose: print('[REQUEST]', self._request)
95 | # Generate the code to implement the API call
96 | try:
97 | self._code = self.builder(self._request)
98 | except Exception as e:
99 | code_score = 0.0 | aggregate.code_score
100 | web_score = 0.0 | aggregate.web_score
101 | return [code_score, web_score]
102 | if self._verbose: print('[GENERATED_CODE]', self._code)
103 | base_score = code.measure(code2) | aggregate.base_score
104 | rand_score = rand.measure(refs) | aggregate.rand_score
105 | code_score = code.measure(self._code, normalize=normalize(base_score, rand_score)) | aggregate.code_score
106 | code_score = code_score.value
107 | # Execute the code to define the 'run' function
108 | try:
109 | self._result = self.executor(str(self._code), request=self._request) | aggregate.output
110 | if self._verbose: print('[RESULT]:', self._result)
111 | web_score = answer.measure(self._result) | aggregate.web_score
112 | web_score = web_score.value
113 | except Exception as e:
114 | self._result = str(e)
115 | web_score = 0.0 | aggregate.web_score
116 | self._value = self._result
117 | return [code_score, web_score]
118 |
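# A minimal usage sketch (request text illustrative): APIExecutor lets
# APIBuilder generate code for a natural-language request, scores that code
# against the reference snippets, then executes it and scores the web answer.
#   executor = APIExecutor(verbose=True)
#   code_score, web_score = executor(
#       aggregate,
#       'Fetch data from URL https://www.ykilcher.com/ ...',
#       lambda: (answer, refs, ref_code, ref_code2, rand_seq))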
119 |
120 | @toggle_test(ACTIVE, default=MOCK_RETURN)
121 | def test_api_builder(aggregate):
122 | answer = Symbol("Yannic Kilcher") | aggregate.answer
123 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.random_seq
124 | reader = FileReader()
125 | website = reader(os.path.join(cur_file_dir, 'snippets/code_api_builder_website_result.txt'))
126 | ref_code = reader(os.path.join(cur_file_dir, 'snippets/code_api_builder.txt')) | aggregate.ref_code
127 | ref_code2 = reader(os.path.join(cur_file_dir, 'snippets/code_api_builder2.txt')) | aggregate.ref_code2
128 | refs = Symbol([ref_code, ref_code2]).mean(axis=0) | aggregate.refs
129 | executor = APIExecutor() # creates code on the fly and executes it
130 | scores = executor(aggregate,
131 | 'Fetch data from URL https://www.ykilcher.com/ and use Function to extract the full name of the author.', # the request
132 |                        lambda: (answer, refs, ref_code, ref_code2, rand_seq)) # interprets the instruction to generate an HTTP request
133 | return True, {'scores': scores}
134 |
135 |
136 | @toggle_test(ACTIVE, default=MOCK_RETURN)
137 | def test_expression_builder(aggregate):
138 | solution1 = Symbol("""
139 | # do not remove or change the imports
140 | from symai import Expression, Function, Symbol
141 | class QueryExpression(Expression):
142 | # initialize the expression with task specific arguments
143 | def __init__(self, prompt: str, **kwargs):
144 | super().__init__(**kwargs)
145 | self.func = Function(prompt, **kwargs)
146 |
147 | # define the forward function with data specific arguments
148 | def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
149 | sym = self._to_symbol(sym)
150 | result = self.func(sym, *args, **kwargs)
151 | return result
152 | # assign the expression type to the variable _value_obj_
153 | _value_obj_ = QueryExpression
154 | """) | aggregate.solution1
155 | solution2 = Symbol("""
156 | from symai import Expression, Function, Symbol
157 | class QueryExpression(Expression):
158 | def __init__(self, prompt: str, **kwargs):
159 | super().__init__(**kwargs)
160 | self.func = Function(prompt, **kwargs)
161 | def forward(self, sym: Symbol, *args, **kwargs) -> Symbol:
162 | sym = self._to_symbol(sym)
163 | return self.func(sym, *args, **kwargs)
164 | _value_obj_ = QueryExpression
165 | """) | aggregate.solution2
166 | solutions = Symbol([solution1, solution2]).mean(axis=0) | aggregate.solutions
167 | rand_seq = Symbol(RANDOMNESS).mean(axis=0) | aggregate.random_seq
168 | builder = ExpressionBuilder()
169 |     code = builder("Create a query Expression that initializes a Function with a prompt and processes a data Symbol based on the custom Function.")
170 | runner = RuntimeExpression()
171 | scoring = []
172 | try:
173 | expr = runner(code)
174 | score = 1.0 | aggregate.code_score
175 | scoring.append(score)
176 | # initialize the expression with the prompt
177 | query = expr('extract the names from the text')
178 |     except Exception:  # code generation or runtime construction failed
179 | score = 0.0 | aggregate.code_score
180 | scoring.append(score)
181 | base_score = solution1.measure(solution2) | aggregate.base_score
182 | rand_score = solutions.measure(rand_seq) | aggregate.rand_score
183 | score = solution1.measure(code, normalize=normalize(base_score, rand_score)) | aggregate.code_score
184 | scoring.append(score.value)
185 | try:
186 | # run the expression on the data
187 | res = query('Hello my name is Max and I am 20 years old.') | aggregate.query_res
188 | score = res.measure('Max') | aggregate.query_score
189 | scoring.append(score.value)
190 |     except Exception:  # expression execution failed
191 | score = 0.0 | aggregate.query_score
192 | scoring.append(score)
193 | return True, {'scores': scoring}
194 |
--------------------------------------------------------------------------------
/src/evals/snippets/code_api_builder.txt:
--------------------------------------------------------------------------------
1 | def run(text: str) -> str: # [MANAGED] entry point cannot be changed
2 | # [MANAGED-BEGIN] mandatory imports here
3 | import traceback
4 | import requests
5 | from symai import Function
6 | # [MANAGED-END] mandatory imports here
7 |
8 | # executable code here
9 | try: # [MANAGED] must contain this line, do not change
10 | # API call to fetch data from URL
11 | response = requests.get('https://www.ykilcher.com/')
12 |
13 | # Check if the request was successful
14 | if response.status_code == 200:
15 | res = response.text # Get the content of the response
16 | else:
17 | res = f"Error: {response.status_code}"
18 |
19 | # mandatory return statement here
20 | res = str(res) # [MANAGED] must contain this line, do not change
21 |
22 |         # Use the Function class to extract the full name from the page text
23 | func = Function('Extract full name from text')
24 | res = func(res)
25 |
26 | return res # [MANAGED] must return a string, do not change
27 | except Exception as e: # [MANAGED] must catch all exceptions and return them as string
28 | tb = traceback.format_exc() # [MANAGED] return full error stack trace as string
29 | return tb # [MANAGED] return tb as string, do not change
30 |
31 | # Example request value
32 | value = "Fetch data from URL https://www.ykilcher.com/"
33 | # mandatory statement here
34 | res = run(value) # [MANAGED] must contain this line, do not change
35 |
--------------------------------------------------------------------------------
/src/evals/snippets/code_api_builder2.txt:
--------------------------------------------------------------------------------
1 | def run(text: str) -> str:
2 | import requests
3 | from symai import Function
4 | url = 'https://www.ykilcher.com/'
5 | rsp = requests.get(url)
6 | if rsp.status_code != 200:
7 | raise Exception(f"Error: {rsp.status_code}")
8 | res = rsp.text
9 | res = str(res)
10 | func = Function('extract the name from text')
11 | return func(res)
12 | value = "Fetch data from URL https://www.ykilcher.com/"
13 | res = run(value)
14 |
--------------------------------------------------------------------------------
/src/evals/snippets/code_api_builder_website_result.txt:
--------------------------------------------------------------------------------
1 | '