├── tests ├── __init__.py ├── test-emptiness.aipl ├── test_colon_rejoin.aipl ├── match-filter.aipl ├── test-def.aipl ├── input_cols.aipl ├── op-globals.aipl ├── globals.aipl ├── test_scripts.py ├── toplevel-ravel.aipl ├── test-named-ravel.aipl └── test-xml.aipl ├── aipl ├── ops │ ├── __init__.py │ ├── sleep.py │ ├── print.py │ ├── comment.py │ ├── abort.py │ ├── literal.py │ ├── replace.py │ ├── unbox.py │ ├── save.py │ ├── match.py │ ├── table.py │ ├── nop.py │ ├── pdf.py │ ├── dedup.py │ ├── name.py │ ├── format.py │ ├── csv.py │ ├── join.py │ ├── ref.py │ ├── input.py │ ├── take.py │ ├── sample.py │ ├── cluster.py │ ├── url.py │ ├── filter.py │ ├── db.py │ ├── sh.py │ ├── def.py │ ├── split.py │ ├── ravel.py │ ├── columns.py │ ├── cross.py │ ├── test.py │ ├── regex.py │ ├── read.py │ ├── groupby.py │ ├── sort.py │ ├── python.py │ ├── extract.py │ ├── xml.py │ ├── json.py │ ├── debug.py │ ├── metrics.py │ └── llm.py ├── __main__.py ├── test_format.aipl ├── test_db.py ├── repl.py ├── utils.py ├── caching.py ├── __init__.py ├── test_core.py ├── db.py ├── main.py ├── test_parse.py ├── parser.py ├── clients.py ├── table.py └── interpreter.py ├── requirements-geo.txt ├── examples ├── benchmarks │ ├── models.txt │ ├── tasks.txt │ ├── bigbench-binary-classification.aipl │ └── bigbench-binary-classification-local.aipl ├── random-link.aipl ├── rewrite.aipl ├── git-commit.aipl ├── rowan │ └── load-json-v4.aipl ├── saulpw │ └── crossword-log.aipl ├── cluster.aipl ├── summarize.aipl ├── nyt-cooking.aipl ├── hanukkah-of-data-5783.aipl └── wiki-to-map.aipl ├── .github ├── FUNDING.yml └── workflows │ └── main.yml ├── .gitignore ├── bin └── aipl ├── pytest.ini ├── about ├── 23-design-log.md ├── README.md ├── roadmap.md ├── vision.md ├── 23-faq.md └── 23-design.md ├── conftest.py ├── tools ├── vscode │ ├── README.md │ ├── language-configuration.json │ ├── package.json │ └── syntaxes │ │ └── aipl.tmLanguage.json ├── README.md ├── translate-dialect.py └── aipl.vim ├── 
requirements.txt ├── pyproject.toml ├── setup.py ├── LICENSE.mit ├── docs ├── writing-operators.md ├── operators.md └── tutorial.md ├── README.aipl └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aipl/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements-geo.txt: -------------------------------------------------------------------------------- 1 | folium==0.14 -------------------------------------------------------------------------------- /examples/benchmarks/models.txt: -------------------------------------------------------------------------------- 1 | gpt-3.5-turbo -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: saulpw 2 | patreon: saulpw 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.log 3 | aipl-cache.sqlite 4 | wip/ 5 | tags 6 | -------------------------------------------------------------------------------- /bin/aipl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from aipl import main 4 | 5 | main() 6 | -------------------------------------------------------------------------------- /aipl/__main__.py: -------------------------------------------------------------------------------- 1 | from . 
import main 2 | 3 | if __name__ == '__main__': 4 | main() 5 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = test_*.py ops/*.py 3 | python_functions = test_ 4 | 5 | -------------------------------------------------------------------------------- /tests/test-emptiness.aipl: -------------------------------------------------------------------------------- 1 | !literal 2 | # test a whole lotta nothing 3 | !extract-links 4 | !name url 5 | -------------------------------------------------------------------------------- /about/23-design-log.md: -------------------------------------------------------------------------------- 1 | 2 | ## The prompt is used as verbatim input (except for whitespace dedent) 3 | 4 | ## 5 | -------------------------------------------------------------------------------- /tests/test_colon_rejoin.aipl: -------------------------------------------------------------------------------- 1 | !test-input 2 | a 3 | b 4 | c 5 | !split 6 | !join sep=: 7 | !test-equal 8 | a:b:c 9 | -------------------------------------------------------------------------------- /examples/benchmarks/tasks.txt: -------------------------------------------------------------------------------- 1 | causal_judgment 2 | strategyqa 3 | moral_permissibility 4 | anachronisms 5 | mathematical_induction -------------------------------------------------------------------------------- /tests/match-filter.aipl: -------------------------------------------------------------------------------- 1 | !test-input 2 | ab zh cd zq azzz z 3 | !split>keep 4 | !match ^z 5 | !filter 6 | !join 7 | !test-equal 8 | zh zq z 9 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from aipl import AIPL 4 | 
@pytest.fixture()
def aipl():
    'Give each test its own AIPL interpreter, constructed with debug=True and test=True.'
    return AIPL(debug=True, test=True)
7 | raise UserAbort(*args) 8 | -------------------------------------------------------------------------------- /aipl/ops/literal.py: -------------------------------------------------------------------------------- 1 | from aipl import defop 2 | 3 | @defop('literal', 0, 0) 4 | def op_literal(aipl, prompt=''): 5 | 'Set prompt as top-level input, without formatting.' 6 | return prompt 7 | -------------------------------------------------------------------------------- /tests/test-def.aipl: -------------------------------------------------------------------------------- 1 | 2 | !!def first-3-words 3 | !split 4 | !take 3 5 | !join 6 | 7 | !test-input 8 | now is the time for all good men 9 | 10 | !first-3-words 11 | 12 | !test-equal 13 | now is the 14 | -------------------------------------------------------------------------------- /tests/input_cols.aipl: -------------------------------------------------------------------------------- 1 | !split sep=: >abc << a:b:c 2 | !format>dundered << _{_}_ 3 | 4 | !join sep='' str: 5 | 'Replace `find` in all leaf values with `repl`.' 6 | return s.replace(find, repl) 7 | 8 | -------------------------------------------------------------------------------- /tests/op-globals.aipl: -------------------------------------------------------------------------------- 1 | import string 2 | 3 | @defop('capwords', 0) 4 | def _(aipl, v): 5 | return string.capwords(v) 6 | 7 | !split 8 | abc def ghi 9 | 10 | !capwords 11 | !join sep=, 12 | 13 | !test-equal 14 | Abc,Def,Ghi 15 | -------------------------------------------------------------------------------- /aipl/ops/unbox.py: -------------------------------------------------------------------------------- 1 | from aipl import defop, Table 2 | 3 | 4 | @defop('unbox', 1.5, 1.5) 5 | def op_unbox(aipl, t:Table) -> Table: 6 | 'Return value of single-row table (remove outermost layer).' 
@defop('save', 0, None)
def op_save(aipl, v:str, filename=''):
    '''Write the string `v` to the file named `filename`, replacing any existing contents.

    The file is always written as UTF-8 so the result does not depend on the
    platform's default locale encoding.
    '''
    # NOTE(review): a '{' left in the name presumably means a {placeholder}
    # was never substituted -- confirm against how the interpreter formats args.
    assert '{' not in filename, filename
    with open(filename, 'w', encoding='utf-8') as fp:
        fp.write(v)
4 | 5 | - [Announcement](23-announcement.md) 6 | - [Design](23-design.md) 7 | - [FAQ](23-faq.md) 8 | -------------------------------------------------------------------------------- /aipl/ops/nop.py: -------------------------------------------------------------------------------- 1 | from aipl import defop, alias 2 | 3 | 4 | @defop('nop', None, None) 5 | def op_nop(aipl): 6 | 'No operation.' 7 | pass 8 | 9 | 10 | #@defop('identity', 0, 0) 11 | #def op_identity(aipl, v): 12 | # return v 13 | alias('identity', 'nop') # functionally equivalent in AIPL 14 | -------------------------------------------------------------------------------- /tools/vscode/README.md: -------------------------------------------------------------------------------- 1 | Loosely based on this [Redshift syntax highlighter](https://github.com/ronsoak/vsc_redshift_extension). This is rudimentary and far from complete. 2 | 3 | ## installing 4 | 5 | On a Unix system, copy the vscode folder to `~/.vscode/extensions/aipl-syntax` (or whatever you want to name the new dir). 6 | -------------------------------------------------------------------------------- /aipl/ops/pdf.py: -------------------------------------------------------------------------------- 1 | from aipl import defop 2 | 3 | 4 | @defop('pdf-extract', 0, 0) 5 | def op_pdf_extract(aipl, pdfdata:bytes) -> str: 6 | 'Extract contents of pdf to value.' 
@defop('dedup', 1, 1)
def _(aipl, v:list) -> list:
    'Deduplicate a list of scalars, keeping the first occurrence of each in input order.'
    # dict keys are unique and keep insertion order; set() iteration order for
    # strings changes from run to run, which made the output order unstable.
    return list(dict.fromkeys(v))
# glob() returns matches in arbitrary order; sorting keeps the parametrized
# test order (and therefore the test ids) stable between runs and workers.
@pytest.mark.parametrize("input_file", sorted(glob("tests/*.aipl")))
def test_script(aipl, input_file):
    'Run one tests/*.aipl script through the interpreter with empty input; an exception from the run fails the test.'
    aipl.stdout = StringIO()  # ops such as !print write to aipl.stdout; keep that out of the console
    aipl.options.test = True

    with open(input_file, encoding='utf-8') as fh:
        aipl.run_test(fh.read(), '')
# the first line of the file supplies the column names
@defop('csv-parse', None, 1.5)
def op_csv_parse(aipl, fname:str) -> List[dict]:
    'Yield one dict per data row of the .csv file at `fname`, keyed by the header row.'
    from csv import DictReader
    with open(fname, newline='') as fp:
        yield from DictReader(fp)
@defop('ref', 1.5, 1.5)
def op_ref(aipl, t:Table, name):
    'Move column on table to end of columns list (becoming the new .value)'
    # Look the column up by name, then reject a result that is not one of
    # this table's own columns.
    col = t.get_column(name)
    if col not in t.columns:
        raise AIPLException(f'no such column {name}')

    # Take the column out of its current position and add it back
    # (add_column presumably appends -- confirm in Table).  This edits `t`
    # in place and returns the same object, not a copy.
    t.columns.remove(col)
    t.add_column(col)
    return t
import Database 2 | 3 | 4 | def test_db(): 5 | import tempfile 6 | with tempfile.NamedTemporaryFile() as f: 7 | with Database(f.name) as db: 8 | db.insert('people', id=10, name='James Jones') 9 | db.insert('people', id=11, name='Maria Garcia') 10 | db.insert('people', id=12, name='Michael Smith') 11 | 12 | db = Database(f.name) 13 | assert len(db.table('people')) == 3 14 | assert db.query('SELECT * FROM people WHERE id=?', 12)[0].name == 'Michael Smith' 15 | -------------------------------------------------------------------------------- /examples/git-commit.aipl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bin/aipl 2 | 3 | !format 4 | 5 | gpt-3.5-turbo 6 | #gpt-4 7 | 8 | !split>model 9 | 10 | !ravel 11 | 12 | # write commit msg for git diff 13 | !sh git diff --cached 14 | !format 15 | Add a commit subject and message that explains the following commit. 16 | Keep same exact commit format, as it will be piped directly into git. 17 | Be terse. 18 | 19 | """ 20 | {_} 21 | """ 22 | 23 | !llm model={model} 24 | !print 25 | 26 | !format 27 | {_} 28 | 29 | [Commit message generated by {model}] 30 | 31 | !join sep=\n\n 32 | 33 | !shtty git commit -v -m {_} --edit 34 | -------------------------------------------------------------------------------- /aipl/ops/input.py: -------------------------------------------------------------------------------- 1 | ''' 2 | !require-input prompts the user for input, if none provided in the script. 3 | ''' 4 | 5 | import sys 6 | 7 | from aipl import defop, Table 8 | 9 | 10 | @defop('require-input', 100, 100) 11 | def op_require_input(aipl, t:'Table', prompt=''): 12 | 'Ensure there is any input at all; if not, display the prompt and read input from the user.' 
@defop('sample', 1.5, 1.5)
def op_sample(aipl, t:Table, n:int=1) -> Table:
    '''Sample up to n random rows from the input table.

    Rows are drawn without replacement.  If the table has fewer than `n`
    rows, all of them are returned (in random order) instead of raising.
    The result is a new Table whose parent is `t`.
    '''
    import random
    # random.sample() raises ValueError when asked for more items than the
    # population holds (e.g. `!sample 1` on an empty table), so cap the request.
    k = min(n, len(t.rows))
    return Table(random.sample(t.rows, k), parent=t)
21 | return urlunparse(urlparse(url)._replace(fragment='')) 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "aipl" 3 | version = "0.1.0" 4 | description = "Array-Inspired Pipeline Language" 5 | authors = ["Saul Pwanson "] 6 | license = "Proprietary" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8" 11 | openai = "^0.27.6" 12 | scikit-learn = "^1.2.2" 13 | numpy = "^1.24.3" 14 | trafilatura = "^1.6.0" 15 | beautifulsoup4 = "^4.12.2" 16 | lxml = "^4.9.2" 17 | lark = "^1.1.5" 18 | rich = "^13.4.1" 19 | 20 | [tool.poetry.group.dev.dependencies] 21 | pytest = "^7.3.1" 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | 27 | [tool.poetry.scripts] 28 | aipl = "aipl.main:main" 29 | -------------------------------------------------------------------------------- /aipl/ops/filter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | !filter returns the table, containing only the rows 3 | that were truthy in the value column. 4 | The value column is then discarded. 5 | ''' 6 | from copy import copy 7 | 8 | from aipl import defop 9 | from aipl.table import Table 10 | 11 | @defop('filter', 1.5, 1.5) 12 | def op_filter(aipl, t:Table) -> Table: 13 | 'Return copy of table, keeping only rows whose value is Truthy.' 
# CI: install the package and run the test suite on every push / pull request
# targeting the develop branch, once per Python version in the matrix.
name: aipl-testing
on:
  pull_request:
    branches:
      - develop
  push:
    branches:
      - develop

jobs:
  run-tests:

    strategy:
      matrix:
        python-version: ["3.10", "3.11"]

    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2

    # the matrix key is `python-version`; `matrix.pythonversion` does not
    # exist and rendered as an empty string in the step name
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}

    - name: Install
      run: |
        pip3 install .
        pip3 install pytest

    - name: Ensure it exe starts up
      run: aipl -h

    - name: Run pytests
      run: pytest .
36 | -------------------------------------------------------------------------------- /tools/vscode/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vsc-aipl-extension", 3 | "displayName": "vsc_aipl_extension", 4 | "description": "A VSC extension to support basic AIPL syntax highlighting", 5 | "publisher": "0", 6 | "version": "0.0.1", 7 | "engines": { 8 | "vscode": "^1.35.0" 9 | }, 10 | "categories": [ 11 | "Programming Languages" 12 | ], 13 | "contributes": { 14 | "languages": [{ 15 | "id": "aipl", 16 | "aliases": ["AIPL", "aipl"], 17 | "extensions": [".aipl"], 18 | "configuration": "./language-configuration.json" 19 | }], 20 | "grammars": [{ 21 | "language": "aipl", 22 | "scopeName": "source.aipl", 23 | "path": "./syntaxes/aipl.tmLanguage.json" 24 | }] 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /examples/rowan/load-json-v4.aipl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bin/aipl 2 | 3 | # loads a JSON file from a hardcoded URL and accesses a sample from a list inside (AIPL idiomatic), filtering on string matching 4 | 5 | # allow user to select if they want to filter by ironic=True/1 or False/0 6 | !require-input 7 | ironic? 
@defop('dbquery', 0.5, 1.5)
def op_dbquery(aipl, row:'LazyRow', dbname:str, tblname:str, *colnames, **kwargs):
    'Select from `tblname` in the database held at aipl.globals[dbname], passing kwargs to select(); yield a dict of just `colnames` for each result.'
    db = aipl.globals[dbname]
    results = db.select(tblname, **kwargs)
    for rec in results:
        yield {name: rec[name] for name in colnames}
8 | 9 | !regex-translate>rating 10 | # rating: 11 | # -2: hated 12 | # -1: didn't like 13 | # 0: attempted, unfinished 14 | # +1: completed 15 | # 2: liked 16 | \* 0 17 | \+ +2 18 | \- -1 19 | ^$ +1 20 | 21 | !ravel 22 | 23 | !!python 24 | from dateutil.parser import parse 25 | def date(s): 26 | return parse(s) 27 | 28 | !!dbopen>gxd gxd.sqlite 29 | !dbquery gxd puzzles xdid A1_D1={A1_D1} 30 | 31 | !python-expr>dotw date('{xdid}'[3:]).strftime('%A') 32 | 33 | !format>attempted_date 34 | {attempted_date} 35 | !format>book 36 | {book} 37 | 38 | !columns attempted_date dotw xdid A1_D1 rating book 39 | !dbdrop attempted_puzzles 40 | !dbinsert attempted_puzzles 41 | -------------------------------------------------------------------------------- /tests/test-named-ravel.aipl: -------------------------------------------------------------------------------- 1 | !test-input 2 | 3 | a b c 4 | d e f g 5 | 6 | # for an op with rankout=1, !op>var1>var2 will name the deepest (scalar) 'inner column' var2, and the outer column that contains the vector var1. 7 | 8 | !split>line sep=\n 9 | 10 | # make sure columns are named correctly 11 | # and that named columns remain visible to a top-level json 12 | 13 | !test-json 14 | [{"_": [{"line": "a b c"}, {"line": "d e f g"}]}] 15 | 16 | !split>chars>char 17 | 18 | !test-json 19 | 20 | [{ 21 | "_": [ 22 | {"line": "a b c", "chars": [{"char": "a"}, {"char": "b"}, {"char": "c"}]}, 23 | {"line": "d e f g", "chars": [{"char": "d"}, {"char": "e"}, {"char": "f"}, {"char": "g"}]} 24 | ] 25 | }] 26 | 27 | !ravel>letter 28 | 29 | !join>out sep=. 
30 | !columns out 31 | 32 | !test-json 33 | [{ 34 | "out": "a.b.c.d.e.f.g" 35 | }] 36 | 37 | !print 38 | -------------------------------------------------------------------------------- /tools/translate-dialect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | from aipl import AIPL 5 | 6 | 7 | def dialectize(cmd:Command) -> str: 8 | ret = f'!{cmd.opname}' 9 | if cmd.args: 10 | ret += ' ' + ' '.join(str(arg) for arg in cmd.args) 11 | if cmd.kwargs: 12 | ret += ' ' + ' '.join(f'{k}={v}' for k,v in cmd.kwargs.items()) 13 | 14 | ret += '\n' 15 | if cmd.prompt: 16 | ret += cmd.prompt + '\n\n' 17 | return ret 18 | 19 | 20 | def main(*args): 21 | aipl = AIPL() 22 | for fn in args: 23 | code = open(fn).read() 24 | with open(fn, 'w') as outfp: 25 | cmds = aipl.parse(code) 26 | for cmd in cmds: 27 | # XXX: need to handle comments and outputting a particular dialect 28 | print(dialectize(cmd), file=outfp) 29 | 30 | 31 | main(*sys.argv[1:]) 32 | -------------------------------------------------------------------------------- /aipl/ops/sh.py: -------------------------------------------------------------------------------- 1 | from aipl import defop, Table 2 | 3 | 4 | @defop('sh', 0, 1.5) 5 | def op_sh(aipl, cmdline:str, **kwargs) -> dict: 6 | 'Run the command described by args. Return (retcode, stderr, stdout) columns.' 7 | import subprocess 8 | r = subprocess.run(cmdline, shell=True, text=True, 9 | # stdin=subprocess.PIPE, 10 | stdout=subprocess.PIPE, 11 | stderr=subprocess.PIPE) 12 | return Table([dict(retcode=r.returncode, 13 | stderr=r.stderr, 14 | stdout=r.stdout)]) 15 | 16 | @defop('shtty', None, 0.5) 17 | def op_shtty(aipl, _:'LazyRow', *args) -> dict: 18 | 'Run the command described by args. Return (retcode, stderr, stdout) columns.' 
19 | import subprocess 20 | r = subprocess.run(args, text=True, 21 | stderr=subprocess.PIPE) 22 | return dict(retcode=r.returncode, 23 | stderr=r.stderr) 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from setuptools import setup, find_packages 4 | from pathlib import Path 5 | 6 | 7 | def readme(): 8 | with open('README.md') as f: 9 | return f.read() 10 | 11 | def requirements(): 12 | return Path('requirements.txt').read_text().splitlines() 13 | 14 | 15 | setup( 16 | name="AIPL", 17 | version="0.1", 18 | description="A tiny DSL to make it easier to explore and experiment with AI pipelines.", 19 | long_description=readme(), 20 | long_description_content_type="text/markdown", 21 | python_requires=">=3.10", 22 | py_modules=["aipl"], 23 | scripts=['bin/aipl'], 24 | install_requires=requirements(), 25 | packages=find_packages(), 26 | author="Saul Pwanson", 27 | url="https://github.com/saulpw/aipl", 28 | classifiers=[ 29 | "Development Status :: 4 - Beta", 30 | "Programming Language :: Python :: 3", 31 | ], 32 | keywords="GPT aipl visidata array", 33 | ) 34 | -------------------------------------------------------------------------------- /aipl/ops/def.py: -------------------------------------------------------------------------------- 1 | ''' 2 | !!def 3 | !op1 4 | !op2 5 | 6 | Create a new op named that runs the AIPL in the prompt when invoked. 7 | ''' 8 | 9 | from aipl import defop, Table 10 | 11 | 12 | @defop('def', 0, None) # immediate 13 | def op_def(aipl, prompt, opname): 14 | 'Define composite operator from cmds in prompt (must be indented).' 
15 | cmds = aipl.parse(prompt) 16 | 17 | @defop(opname, 18 | rankin=cmds[0].op.rankin, 19 | rankout=cmds[-1].op.rankout) 20 | def new_operator(aipl, *args, **kwargs): 21 | arity = 0 if cmds[0].op.rankin is None else 1 22 | if arity == 0: 23 | t = aipl.new_input() 24 | elif arity == 1: 25 | t = aipl.new_input(args[0]) 26 | ret = aipl.run_cmdlist(cmds, [t], *args[arity:]) 27 | return ret[-1][0].value 28 | 29 | 30 | def test_def(aipl): 31 | r = aipl.run_test(''' 32 | !!def split-join 33 | !split 34 | !join 35 | 36 | !split-join 37 | ''', 'a b c', 'd e f') 38 | assert r[0].value == 'a b c' 39 | assert r[1].value == 'd e f' 40 | -------------------------------------------------------------------------------- /aipl/ops/split.py: -------------------------------------------------------------------------------- 1 | from aipl import defop 2 | 3 | from typing import List 4 | 5 | @defop('split', 0, 1) 6 | def op_split(aipl, v: str, sep:str=None, maxsize:int=0, trim=False) -> List[str]: 7 | 'Split text into chunks based on sep, keeping each chunk below maxsize.' 8 | if trim: 9 | v = v.strip() 10 | win = [] 11 | tot = 0 12 | for i, unit in enumerate(v.split(sep)): 13 | n = len(unit) 14 | if tot+n > int(maxsize): 15 | if win: 16 | yield (sep or ' ').join(win) 17 | win = [] 18 | tot = 0 19 | 20 | win.append(unit) 21 | tot += n 22 | 23 | if win: 24 | yield (sep or ' ').join(win) 25 | 26 | 27 | @defop('split-into', 0, 0.5) 28 | def op_split_into(aipl, v:str, *args, sep=None) -> dict: 29 | 'Split text by sep into the given column names.' 
30 | return dict(zip(args, v.split(sep))) 31 | 32 | def test_split_join(aipl): 33 | t = aipl.run_test('!split !take 3 !join', 'now is the time') 34 | assert len(t.rows) == 1 35 | assert t[0].value == 'now is the' 36 | 37 | -------------------------------------------------------------------------------- /LICENSE.mit: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023 Saul Pwanson and the Devottys 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /aipl/ops/ravel.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from copy import copy 3 | 4 | 5 | from aipl import defop 6 | from aipl.table import Table, Column 7 | 8 | 9 | @defop('ravel', 100, 1.5) 10 | def op_ravel(aipl, v:Table, rank=0) -> Table: 11 | 'All of the leaf scalars in the value column become a single 1-D array.' 12 | def _ravel(t:Table, newkey:str, parent=None) -> List['Scalar']: 13 | for row in t: 14 | if isinstance(row.value, Table) and row.value.rank > rank: 15 | yield from _ravel(row.value, newkey, parent=row) 16 | else: 17 | if '__parent' not in row._row and parent is not None: 18 | row._row['__parent'] = parent 19 | 20 | yield row 21 | 22 | newkey = aipl.unique_key 23 | ret = Table(parent=v) 24 | for row in _ravel(v, newkey): 25 | ret.rows.append(row._row) 26 | 27 | for c in row._table.columns: 28 | ret.add_column(copy(c)) 29 | return ret 30 | 31 | 32 | def test_ravel(aipl): 33 | t = aipl.run_test('!split !take 2 !ravel !join', 'a b c d', 'e f g') 34 | assert t[0].value == 'a b e f' 35 | -------------------------------------------------------------------------------- /aipl/ops/columns.py: -------------------------------------------------------------------------------- 1 | ''' 2 | !columns takes a space-separated list of columns 3 | in the current table, and returns a copy of the table 4 | with only those columns. 5 | Akin to SQLite SELECT. 6 | ''' 7 | 8 | from copy import copy 9 | 10 | from aipl import defop, Table, Column 11 | 12 | 13 | @defop('columns', 1.5, 1.5) 14 | def op_columns(aipl, t:'Table', *colnames, **renamedcols) -> Table: 15 | 'Create new table containing only these columns.' 
16 | namings = [(n,n) for n in colnames] # from_name:to_name 17 | namings.extend((v,k) for k,v in renamedcols.items()) 18 | newcols = [] 19 | ret = copy(t) 20 | ret.rows = [] 21 | for row in t: 22 | d = {'__parent':row} 23 | d.update({to_name:row[from_name] for from_name, to_name in namings}) 24 | ret.rows.append(d) 25 | 26 | for from_name, to_name in namings: 27 | ret.add_column(Column(to_name)) 28 | 29 | return ret 30 | 31 | def test_columns(aipl): 32 | r = aipl.run_test('!name letters !split !ravel !columns letters', 'a b c', 'd e f') 33 | assert r[0].value == 'a b c' 34 | assert r[3].value == 'd e f' 35 | assert len(r) == 6 36 | -------------------------------------------------------------------------------- /examples/benchmarks/bigbench-binary-classification.aipl: -------------------------------------------------------------------------------- 1 | # in: list of models; out: % accuracy in classifying the given task 2 | 3 | !csv-parse model-task.csv 4 | 5 | !format 6 | https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/{task}/task.json 7 | !fetch-url 8 | # name=name description=description 9 | !json-parse examples=examples 10 | 11 | !format>statement 12 | {input} 13 | !take 20 14 | 15 | # try these tasks without any prompt context and see what happens! 16 | !format>zero-shot 17 | {statement} 18 | --- 19 | Classify with 1 if yes, 0 if no. 
20 | Classification: 21 | !llm>classification model={model} max_tokens=1 22 | 23 | # TODO: be able to look at responses per-model; currently can't tell what model had what classification 24 | # !format 25 | # {model} {classification} ({target_scores_Yes}): {statement} 26 | !format 27 | {classification} ({target_scores_Yes}): {statement} 28 | !print 29 | 30 | !metrics-accuracy>accuracy classification target_scores_Yes 31 | !format 32 | {model:15} {task:25} {accuracy:.2f} 33 | !print 34 | # !columns zero-shot classification target_scores_Yes 35 | # !json 2 36 | # !save {model}_{task}.json 37 | 38 | !print -------------------------------------------------------------------------------- /aipl/ops/cross.py: -------------------------------------------------------------------------------- 1 | from aipl import defop, Table, SubColumn, LazyRow 2 | 3 | __test__ = ''' 4 | !test-input 5 | a b c 6 | !split>col1 7 | !table t1 8 | !test-input 9 | d e f 10 | !split>col2 11 | !cross < Table: 34 | 'Construct cross-product of left and right inputs (pass right input via `<statement 12 | {input} 13 | !take 20 14 | 15 | # try these tasks without any prompt context and see what happens! 16 | !format>zero-shot 17 | {statement} 18 | --- 19 | Classify with 1 if yes, 0 if no. 
20 | Classification: 21 | !llm-local>classification model={model} max_tokens=2 22 | 23 | # TODO: be able to look at responses per-model; currently can't tell what model had what classification 24 | # !format 25 | # {model} {classification} ({target_scores_Yes}): {statement} 26 | !format 27 | {classification} ({target_scores_Yes}): {statement} 28 | !print 29 | 30 | !metrics-accuracy>accuracy classification target_scores_Yes 31 | !format 32 | {model:15} {task:25} {accuracy:.2f} 33 | !print 34 | # !columns zero-shot classification target_scores_Yes 35 | # !json 2 36 | # !save {model}_{task}.json 37 | 38 | !print -------------------------------------------------------------------------------- /aipl/ops/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from aipl import defop 4 | 5 | 6 | def preprompt_capture(prompt:str='') -> re.Pattern: 7 | import re 8 | return re.compile(prompt) 9 | 10 | @defop('regex-capture', 0, 0.5, preprompt=preprompt_capture) 11 | def regex_capture(aipl, v:str, prompt:re.Pattern=None) -> dict: 12 | 'Capture from prompt regex into named matching groups.' 
13 | m = prompt.match(v) 14 | if not m: 15 | return {} 16 | return m.groupdict() 17 | 18 | 19 | def preprompt_translate(prompt:str=''): 20 | import re 21 | d = [] 22 | for line in prompt.splitlines(): 23 | regex, output = line.split(maxsplit=1) 24 | d.append((re.compile(regex), output)) 25 | 26 | return d 27 | 28 | @defop('regex-translate', 0, 0, preprompt=preprompt_translate) 29 | def regex_translate(aipl, v:str, prompt:list): 30 | r'''Translate input according to regex translation rules in prompt, one per line, with regex and output separated by whitespace: 31 | \bDr\.?\b Doctor 32 | \bJr\.?\b Junior 33 | ''' 34 | for regex, output in prompt: 35 | m = regex.match(v) 36 | if m: 37 | return output 38 | return v 39 | -------------------------------------------------------------------------------- /about/roadmap.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | ## Overall goals 4 | 5 | - To compile and curate the hundreds of operators into a single place, 6 | - to design them to interoperate with each other in straightforward-by-default ways. 7 | - to port concepts from langchain and elsewhere as needed, clarifying and testing and upgrading them. 8 | 9 | - To compile and curate recipes for dozens of well-commented reference chains, 10 | 11 | - To allow a smart but "not a programmer" enduser to take a reference recipe and tweak it for their own custom use; 12 | - to allow them to get moving on a prototype for their idea immediately; 13 | - to allow them to see into the process step-by-step; 14 | 15 | - To provide them a platform for experimentation and small-scale production. 16 | 17 | - to provide an incentive and process for them to submit their work, for others to learn, and also to potentially improve the reference; 18 | - to compile test suites, so we can run A/B tests and gather data on specific prompt improvements, to continually optimize the reference chains. 
19 | 20 | operators + scripts + models + standardized tests = aipl 21 | 22 | So that an AI proof-of-concept can be thrown together in an evening. 23 | -------------------------------------------------------------------------------- /aipl/ops/read.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse, urlunparse 2 | 3 | from aipl import defop, dbcache, stderr, alias 4 | 5 | 6 | @dbcache 7 | def _fetch_url_bytes(aipl, url:str) -> bytes: 8 | import urllib.request 9 | stderr(f'fetching {url}...') 10 | with urllib.request.urlopen(url) as resp: 11 | return resp.read() 12 | 13 | 14 | @dbcache 15 | def _fetch_url(aipl, url:str) -> str: 16 | import trafilatura 17 | stderr(f'fetching {url}...') 18 | # guess at decoding and other helpful things 19 | return trafilatura.fetch_url(url) 20 | 21 | 22 | @defop('read', 0, 0) 23 | def op_read(aipl, url:str) -> str: 24 | 'Return contents of local filename.' 25 | if '://' in url: 26 | url = urlunparse(urlparse(url)._replace(fragment='')) 27 | return _fetch_url(aipl, url) 28 | 29 | return open(url).read() 30 | 31 | 32 | @defop('read-bytes', 0, 0) 33 | def op_read_bytes(aipl, url:str) -> bytes: 34 | 'Return contents of URL or local filename as bytes.' 35 | if '://' in url: 36 | url = urlunparse(urlparse(url)._replace(fragment='')) 37 | return _fetch_url_bytes(url) 38 | 39 | return open(url, mode='rb').read() 40 | 41 | alias('fetch-url', 'read') 42 | -------------------------------------------------------------------------------- /aipl/ops/groupby.py: -------------------------------------------------------------------------------- 1 | ''' 2 | !groupby 3 | 4 | Group rows by given named columns, with output value for each key being table of corresponding rows. 
5 | ''' 6 | 7 | from copy import copy 8 | from collections import defaultdict 9 | 10 | from aipl import defop 11 | from aipl.table import Table, Column 12 | 13 | 14 | @defop('groupby', 1.5, 1.5) 15 | def op_groupby(aipl, t:Table, *args) -> Table: 16 | 'Group rows into tables, by set of columns given as args.' 17 | groups = defaultdict(list) # groupkey -> list of rowdict 18 | for row in t: 19 | k = tuple([row[colname] for colname in args]) 20 | groups[k].append(row._row) 21 | 22 | ret = Table() 23 | 24 | newkey = aipl.unique_key 25 | for key, rows in groups.items(): 26 | outdict = dict(zip(args, key)) 27 | outtable = copy(t) 28 | outtable.rows = rows 29 | outdict[newkey] = outtable 30 | ret.rows.append(outdict) 31 | 32 | for colname in args: 33 | ret.add_column(Column(colname, colname)) 34 | 35 | ret.add_column(Column(newkey)) 36 | return ret 37 | 38 | 39 | def test_groupby(aipl): 40 | r = aipl.run_test('!split-into name num !groupby name', 'Bob 4', 'Alice 3', 'Carol 8', 'Bob 2', 'Alice 5', 'Bob 1') 41 | assert len(r) == 3 42 | -------------------------------------------------------------------------------- /docs/writing-operators.md: -------------------------------------------------------------------------------- 1 | # Writing a New Operator in Python 2 | 3 | In AIPL you can use !!python to add a new operator. For instance, here's the definition of `!lower`, a scalar string to string operator: 4 | 5 | !!python 6 | @defop('lower', rankin=0, rankout=0) 7 | def _(aipl, v:str) -> str: 8 | return v.lower() 9 | 10 | ## Operators internal to the AIPL codebase 11 | 12 | All .py files in aipl.ops are imported automatically. 13 | You can use the exact same code from the prompt above. 14 | 15 | Each and every operator internal to the aipl codebase should have: 16 | 17 | - Full docs for operator in the file's docstring, including any subtleties or warts 18 | - Concise docs in function's docstring. 
19 | - At least one basic test and demonstration of functionality 20 | 21 | Any imports of external libraries should be done within the operator itself, not at toplevel. 22 | 23 | ## Full Example: `aipl/ops/lower.py` 24 | 25 | ''' 26 | !lower converts the input string to lowercase. 27 | Unicode cased characters are supported per [Python str.lower](https://docs.python.org/3/library/stdtypes.html#str.lower). 28 | ''' 29 | 30 | from aipl import defop 31 | 32 | 33 | @defop('lower', rankin='scalar', rankout='scalar') 34 | def _(aipl, v:str) -> str: 35 | 'Convert the input string to lowercase.' 36 | return v.lower() 37 | 38 | 39 | def test_lower(aipl): 40 | r = aipl.run('!lower', 'HEY you') 41 | assert r[0] == 'hey you' 42 | -------------------------------------------------------------------------------- /aipl/repl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import readline 3 | import traceback 4 | 5 | from aipl import parse, Table, AIPLException 6 | 7 | 8 | def repl(aipl, inputs:list[Table]): 9 | 'Standard Read-Eval-Print-Loop (REPL)' 10 | import rich 11 | def completer(text, state): 12 | ops = list(aipl.operators.keys()) + list(aipl.aliases.keys()) 13 | text = text[1:] 14 | results = [x for x in ops if x.startswith(text)] 15 | if results: 16 | return "!" 
+ results[state] 17 | 18 | readline.parse_and_bind("tab: complete") 19 | readline.set_completer_delims(' \n=') 20 | readline.set_completer(completer) 21 | 22 | while True: 23 | sys.stdout.flush() 24 | try: 25 | cmdtext = input('> ') 26 | except KeyboardInterrupt as e: 27 | break # exit on ^C 28 | except EOFError: 29 | print("\n") 30 | continue 31 | 32 | if not cmdtext.strip(): # do nothing empty line 33 | continue 34 | 35 | try: 36 | cmds = parse(cmdtext) 37 | op = aipl.get_op(cmds[0].opname) 38 | if op.needs_prompt: 39 | while True: 40 | line = sys.stdin.readline() 41 | if not line.strip(): 42 | break 43 | cmdtext += '\n' + line 44 | 45 | inputs = aipl.run(cmdtext, inputs) 46 | rich.print(inputs[-1]) 47 | except AIPLException as e: 48 | print(e.args[0]) 49 | except Exception as e: 50 | traceback.print_exc() 51 | -------------------------------------------------------------------------------- /examples/cluster.aipl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bin/aipl 2 | 3 | # Cluster embeddings in database into 10 categories; summarize a random sample 4 | # of each; query GPT for a theme and subreddit; store all in db. 5 | 6 | # the input is some urls 7 | !name sourceurl 8 | !read 9 | !extract-links baseurl={sourceurl} 10 | !name url 11 | 12 | # get only urls below the given url and remove the #fragment 13 | !match ^{sourceurl} 14 | !filter 15 | !url-defrag>mainurl 16 | 17 | !read 18 | !extract-text>text 19 | 20 | !split maxsize=4000 21 | !take 1 22 | !ravel 23 | 24 | # get an embedding for each 25 | # XXX: how to exclude for URLs we've already done? 
26 | !llm-embedding model=text-embedding-ada-002 27 | 28 | !cluster n=10 29 | !name label 30 | 31 | !columns url=mainurl text=text label=label 32 | 33 | !dbdrop categorized 34 | !dbinsert categorized 35 | 36 | # make a 2-column table (first being the category label, second being the list of rows) 37 | !groupby label 38 | 39 | # find topic and subreddit 40 | !sample n=3 41 | 42 | # split the text extraction from above into LLM-sized chunks 43 | !split str: 7 | if isinstance(s, str): 8 | return s 9 | return repr(s) 10 | 11 | def stderr(*args, **kwargs): 12 | # args = [strify(x) for x in args] 13 | args = [reprify(x) for x in args] 14 | print(*args, file=sys.stderr, flush=True, **kwargs) 15 | 16 | 17 | def fmtarg(v:str, r:Mapping=None) -> str: 18 | if isinstance(v, str): 19 | v = v.encode('utf-8').decode('unicode-escape') 20 | if r: 21 | return v.format_map(r) 22 | return v 23 | 24 | 25 | def fmtargs(args, contexts:List[Mapping]): 26 | d = ChainMap(*reversed(contexts)) 27 | return [fmtarg(arg, d) for arg in args if not isinstance(arg, str) or not arg.startswith('<')] 28 | 29 | 30 | def fmtkwargs(kwargs, contexts:List[Mapping]): 31 | d = ChainMap(*contexts) 32 | return {k:fmtarg(v, d) for k,v in kwargs.items()} 33 | 34 | 35 | class AttrDict(dict): 36 | def __getattr__(self, k): 37 | if k not in self: 38 | return None 39 | return self[k] 40 | 41 | def __setattr__(self, k, v): 42 | self[k] = v 43 | 44 | 45 | def strify(x, maxlen=0): 46 | if isinstance(x, (list, tuple)): 47 | if not x: 48 | return '[]' 49 | return f'[({len(x)}) {strify(x[0], maxlen=15)}]' 50 | if isinstance(x, dict): 51 | return '{' + ' '.join(f'{k}={strify(v, maxlen=15)}' for k, v in x.items()) + '}' 52 | x = str(x).replace("\n", '\\n') 53 | if maxlen and len(x) > maxlen: 54 | x = x[:maxlen] + f'...({len(x)} bytes)' 55 | return x 56 | -------------------------------------------------------------------------------- /aipl/ops/extract.py: 
-------------------------------------------------------------------------------- 1 | from typing import List 2 | from urllib.parse import urljoin 3 | 4 | from aipl import defop 5 | 6 | 7 | @defop('extract-text-all', 0, 0) 8 | def op_extract_text_all(aipl, html:str, **kwargs) -> str: 9 | 'Extract all text from HTML' 10 | from bs4 import BeautifulSoup 11 | soup = BeautifulSoup(html, 'html.parser') 12 | return soup.get_text() 13 | 14 | 15 | @defop('extract-text', 0, 0) 16 | def op_extract_text(aipl, html:str, **kwargs) -> str: 17 | 'Extract meaningful text from HTML' 18 | parms = dict(include_comments=False, 19 | include_tables=False, 20 | no_fallback=True) 21 | parms.update(kwargs) 22 | 23 | import trafilatura 24 | content = trafilatura.extract(html, **parms) 25 | if content is None: 26 | return '' 27 | else: 28 | return content 29 | 30 | 31 | @defop('extract-links', 0, 1.5, outcols='linktext title href') 32 | def op_extract_links(aipl, html:str, baseurl='', **kwargs) -> List[dict]: 33 | 'Extract (linktext, title, href) from tags in HTML' 34 | if not html: 35 | return 36 | 37 | from bs4 import BeautifulSoup 38 | soup = BeautifulSoup(html, 'html.parser') 39 | for link in soup.find_all('a', href=True): 40 | href = link['href'] 41 | if baseurl: 42 | href = urljoin(baseurl, href) 43 | yield dict(linktext=link.text, title=link.get('title', ''), href=href) 44 | 45 | 46 | @defop('extract-selector', 0, 1) 47 | def _(aipl, html:str, selector:str) -> List[dict]: 48 | from bs4 import BeautifulSoup 49 | soup = BeautifulSoup(html, 'html.parser') 50 | for el in soup.select(selector): 51 | yield str(el) 52 | -------------------------------------------------------------------------------- /examples/summarize.aipl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bin/aipl 2 | 3 | # fetch url, split webpage into chunks, summarize each chunk, then summarize the summaries. 
4 | 5 | # the inputs are urls 6 | !read 7 | 8 | # extract text from html 9 | !extract-text 10 | 11 | # split into chunks of lines that can fit in the context window 12 | !split maxsize=8000 sep=\n 13 | 14 | # have GPT summarize each chunk 15 | !format 16 | 17 | Please read the following section of a webpage (500-1000 words) and provide a 18 | concise and precise summary in a few sentences, optimized for keywords and main 19 | content topics. Write only the summary, and do not include phrases like "the 20 | article" or "this webpage" or "this section" or "the author". Ensure the tone 21 | is precise and concise, and provide an overview of the entire section: 22 | 23 | """ 24 | {_} 25 | """ 26 | 27 | !llm model=gpt-3.5-turbo 28 | 29 | # join the section summaries together 30 | !join sep=\n- 31 | 32 | # have GPT summarize the combined summaries 33 | 34 | !format 35 | 36 | Based on the summaries of each section provided, create a one-paragraph summary 37 | of approximately 100 words. Begin with a topic sentence that introduces the 38 | overall content topic, followed by several sentences describing the most 39 | relevant subsections. Provide an overview of all section summaries and include 40 | a conclusion or recommendations only if they are present in the original 41 | webpage. Maintain a precise and concise tone, and make the overview coherent 42 | and readable, while preserving important keywords and main content topics. 43 | Remove all unnecessary text like "The document" and "the author". 
44 | 45 | """ 46 | {_} 47 | """ 48 | 49 | !llm model=gpt-3.5-turbo 50 | 51 | !print 52 | -------------------------------------------------------------------------------- /tools/aipl.vim: -------------------------------------------------------------------------------- 1 | if exists("b:current_syntax") 2 | finish 3 | endif 4 | 5 | syntax match aiplComment "^#.*$" 6 | 7 | syntax region aiplString start=/^[^!]/ end=/^\ze!/ contained contains=aiplTemplateParameter,aiplComment 8 | syntax match aiplTemplateParameter "{[^}]*}" contained 9 | 10 | syn match aiplDef "^!!def\s\+" contained nextgroup=aiplOperatorName 11 | syn match aiplOperatorName "[^ ]\+\n" contained nextgroup=aiplNestedOperator 12 | 13 | syntax match aiplCommand "[^ >!][^ >]*" nextgroup=aiplRedirect contained 14 | syntax match aiplOperator /^!\+/ contained nextgroup=aiplCommand contained 15 | syntax match aiplRedirect ">" nextgroup=aiplRedirectTarget contained 16 | syntax match aiplRedirectTarget "[^ >]\+" contained nextgroup=aiplRedirect 17 | 18 | syntax region aiplCommandRegion start=/^!/ end=/^\ze!/ contains=aiplOperator,aiplComment,aiplString skipempty 19 | 20 | syntax region aiplDefinition start=/^!!def\ze\s/ end="^\ze!" 
contains=aiplNestedCommandRegion,aiplDef 21 | syntax region aiplNestedCommandRegion start=/^ !/ end=/^\ze \?!/ contained contains=aiplNestedOperator,aiplNestedString,aiplComment skipempty 22 | syntax match aiplNestedOperator /^ !\+/ contained nextgroup=aiplCommand 23 | syntax region aiplNestedString start=/^ [^!]/ end=/^\ze !/ contained contains=aiplTemplateParameter,aiplComment 24 | 25 | highlight link aiplComment Comment 26 | highlight link aiplOperator Operator 27 | highlight link aiplNestedOperator Operator 28 | highlight link aiplRedirect Operator 29 | highlight link aiplDef Keyword 30 | highlight link aiplKeyword Keyword 31 | highlight link aiplNestedString String 32 | highlight link aiplString String 33 | highlight link aiplTemplateParameter Identifier 34 | 35 | let b:current_syntax = "aipl" 36 | -------------------------------------------------------------------------------- /aipl/caching.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | from aipl import AIPL, stderr 4 | 5 | 6 | def dbcache(func): 7 | 'Decorator to persistently cache result from func(aipl, *args, *kwargs).' 8 | @wraps(func) 9 | def cachingfunc(aipl:AIPL, *args, **kwargs): 10 | if not aipl.cache_db: 11 | return func(aipl, *args, **kwargs) 12 | 13 | key = f'{args} {kwargs}' 14 | tbl = 'cached_'+func.__name__ 15 | ret = aipl.cache_db.select(tbl, key=key) 16 | if ret: 17 | row = ret[-1] 18 | if 'output' in row: 19 | return row['output'] 20 | 21 | del row['key'] 22 | stderr('[using cached value]') 23 | return row 24 | 25 | result = func(aipl, *args, **kwargs) 26 | 27 | if isinstance(result, dict): 28 | aipl.cache_db.insert(tbl, key=key, **result) 29 | else: 30 | aipl.cache_db.insert(tbl, key=key, output=result) 31 | 32 | return result 33 | 34 | return cachingfunc 35 | 36 | 37 | def expensive(mockfunc=None): 38 | 'Decorator to persistently cache result from func(aipl, *args, **kwargs). 
Use as @expensive(mock_func) where mock_func has identical signature to func and returns a compatible result during --dry-run.' 39 | def _decorator(func): 40 | @wraps(func) 41 | def _wrapper(aipl:AIPL, *args, **kwargs): 42 | if aipl.options.dry_run: 43 | if mockfunc: 44 | return mockfunc(aipl, *args, **kwargs) 45 | else: 46 | return f'<{func.__name__}({args} {kwargs})>' 47 | 48 | return dbcache(func)(aipl, *args, **kwargs) 49 | 50 | return _wrapper 51 | return _decorator 52 | -------------------------------------------------------------------------------- /tools/vscode/syntaxes/aipl.tmLanguage.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://raw.githubusercontent.com/martinring/tmlanguage/master/tmlanguage.json", 3 | "name": "aipl", 4 | "patterns": [ 5 | { 6 | "begin": "#", 7 | "beginCaptures": { 8 | "0": { 9 | "name": "punctuation.definition.comment.aipl" 10 | } 11 | }, 12 | "end": "\\n", 13 | "name": "comment.line.number-sign.aipl", 14 | "patterns": [ 15 | { 16 | "match": "(\\bTODO\\b|\\bFIXME\\b|\\bNOTE\\b|@todo)", 17 | "name": "keyword.other.documentation.task.aipl" 18 | } 19 | ] 20 | }, 21 | 22 | { 23 | "begin": "!", 24 | "beginCaptures": { 25 | "0": { 26 | "name": "punctuation.definition.command.aipl" 27 | } 28 | }, 29 | "end": "\\s|\\n", 30 | "name": "keyword.control.command.aipl", 31 | "patterns": [ 32 | { 33 | "begin": ">", 34 | "end": "\\s|$", 35 | "beginCaptures": { 36 | "0": { 37 | "name": "punctuation.separator.command.aipl" 38 | } 39 | }, 40 | "endCaptures": { 41 | "0": { 42 | "name": "variable.parameter.command.aipl" 43 | } 44 | } 45 | } 46 | ] 47 | } 48 | 49 | ], 50 | "repository": {}, 51 | "scopeName": "source.aipl" 52 | } 53 | -------------------------------------------------------------------------------- /examples/nyt-cooking.aipl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bin/aipl 2 | 3 | # Simple script that takes URLs 
of recipes from NYT cooking and converts them to a markdown recipe format (specifically for Obsidian) 4 | # Output is one markdown file per recipe 5 | 6 | # extract recipe name from URL, for use in file name 7 | @defop('getname', 0.5, 0) 8 | def getname(aipl, d): 9 | s = d['path'].split('-') 10 | return ' '.join(s[1:]) 11 | 12 | # sample URLs 13 | !split>url 14 | https://cooking.nytimes.com/recipes/1019883-chicken-piccata 15 | https://cooking.nytimes.com/recipes/1013317-zucchini-and-carrot-fritters-with-yogurt-mint-dip 16 | https://cooking.nytimes.com/recipes/1022534-green-chile-chicken-tacos 17 | 18 | !url-split 19 | !getname>name 20 | 21 | !fetch-url Notes: 32 | 33 | Time: 34 | Serves: 35 | 36 | ### Ingredients 37 | #ingredients 38 | - [x] first ingredient 39 | - [x] second ingredient 40 | - [x] third ingredient 41 | 42 | --- 43 | #### Intro: 44 | 45 | 46 | --- 47 | #### Directions 48 | 49 | 50 | #### FINISH: 51 | 52 | 53 | And here is a poorly typed recipe that I would like you to re-format in the style of the above template please. 54 | Please put any tips in the Notes section. 55 | Just return the reformatted recipe, no extra words. 56 | 57 | {_} 58 | 59 | !llm model=gpt-3.5-turbo 60 | 61 | # tack on some metadata 62 | !format 63 | --- 64 | alias: 65 | source: {url} 66 | tags: recipe 67 | --- 68 | {_} 69 | 70 | # and we're done! 
71 | !save {name}.md -------------------------------------------------------------------------------- /aipl/ops/xml.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from aipl import defop 3 | 4 | def _xml(s): 5 | if not isinstance(s, str): 6 | return s 7 | 8 | # from bs4 import BeautifulSoup 9 | # return BeautifulSoup(xml, 'xml') 10 | 11 | from lxml import etree 12 | root = etree.fromstring(s.encode()) 13 | for elem in root.getiterator(): 14 | # Skip comments and processing instructions, 15 | # because they do not have names 16 | if not ( 17 | isinstance(elem, etree._Comment) 18 | or isinstance(elem, etree._ProcessingInstruction) 19 | ): 20 | # Remove a namespace URI in the element's name 21 | elem.tag = etree.QName(elem).localname 22 | 23 | # Remove unused namespace declarations 24 | etree.cleanup_namespaces(root) 25 | 26 | return root 27 | 28 | 29 | class XMLStringableElement: 30 | def __init__(self, e): 31 | self._element = e 32 | def __getattr__(self, k): 33 | return getattr(self._element, k) 34 | def __str__(self): 35 | return getattr(self._element, 'text', '') or '' 36 | 37 | def StringifiableObject(s): 38 | 'create pass-through wrapper to stringify with s.text if available' 39 | if not hasattr(s, 'text'): 40 | return s 41 | return XMLStringableElement(s) 42 | 43 | 44 | @defop('xml-xpath', 0, 1) 45 | def op_xml_xpath(aipl, v:str, *args) -> List['XmlElement']: 46 | "Return a vector of XMLElements from parsing entries in value." 47 | xml = _xml(v) 48 | for arg in args: 49 | for entry in xml.xpath(arg): 50 | yield StringifiableObject(entry) 51 | 52 | 53 | @defop('xml-xpaths', 0, 0.5) 54 | def op_xml_xpaths(aipl, v:str, **kwargs) -> List['XmlElement']: 55 | "Return a vector of XMLElements from parsing entries in value; kwargs become column_name=xpath." 
class InnerPythonException(AIPLException):
    '''A nice error message when inner Python exec/eval raises.

    Constructed with three positional args: the original exception, an
    iterable of traceback frames (each exposing `.lineno` and `.name`),
    and the source text of the inner code.  If other error handling has
    attached a `command` attribute, its opname and line number are used
    to head the message and to offset the reported line numbers.
    '''
    def __str__(self):
        exc, tb, codestr = self.args
        r = []
        if hasattr(self, 'command'):  # added by other error handling
            linenum = self.command.linenum
            r.append(f'In "!{self.command.opname}" (line {self.command.linenum}):')
        else:
            linenum = 0

        # Split once instead of once per frame.
        codelines = codestr.splitlines()

        for frame in tb:
            r.append(f'Line ~{frame.lineno+linenum}, in {frame.name}')
            # A frame whose line number is outside the inner code has no
            # matching source line here; skip it so that formatting the
            # error can never itself raise IndexError (or quote a wrong line).
            if 0 < frame.lineno <= len(codelines):
                r.append(' ' + codelines[frame.lineno-1])

        r.append(f'{type(exc).__name__}: {exc}')

        return '\n'.join(r)
def import_submodules(pkgname):
    '''Import every module below the package named *pkgname*.

    The package itself is imported first, then every module and
    sub-package found under its `__path__`, recursively.  Passing
    `prefix` to `walk_packages` makes it yield (and descend into)
    fully-qualified names; without it, nested sub-packages are looked up
    as top-level names and their contents are silently skipped.
    '''
    import pkgutil
    import importlib

    m = importlib.import_module(pkgname)
    for module in pkgutil.walk_packages(m.__path__, prefix=m.__name__ + '.'):
        importlib.import_module(module.name)
@defop('sql', 0, 1.5)
def sql(aipl, q:str, db:str):
    'Run query *q* against the sqlite database file *db* and return the result rows as a list of dicts keyed by column name.'
    import sqlite3
    from contextlib import closing

    # closing() guarantees the connection is released even if the query
    # raises; the original left every connection open.
    with closing(sqlite3.connect(db)) as con:
        con.row_factory = sqlite3.Row
        return [dict(r) for r in con.execute(q).fetchall()]
Here are the first few rows from each of the tables: 46 | 47 | - customers 48 | customerid,name,address,citystatezip,birthdate,phone 49 | 1001,Jack Quinn,201 E Park St,"Los Angeles, CA 91343",1960-05-14,805-287-8515 50 | 1002,David Powell,224C Tysens Ln,"Staten Island, NY 10306",1978-04-04,516-768-1652 51 | 1003,Carrie Green,1608 W 53rd Way,"Tampa, FL 33614",1969-01-21,727-209-0470 52 | 53 | - orders 54 | orderid,customerid,ordered,shipped,items,total 55 | 1001,4308,2017-01-31 00:32:19,2017-01-31 07:15:00,,25.52 56 | 1002,11683,2017-01-31 00:58:31,2017-01-31 18:00:00,,35.33 57 | 1003,5676,2017-01-31 01:34:40,2017-01-31 09:00:00,,30.79 58 | 59 | - products 60 | sku,desc,wholesale_cost 61 | DLI0002,Smoked Whitefish Sandwich,9.33 62 | PET0005,"Vegan Cat Food, Turkey & Chicken",4.35 63 | HOM0018,Power Radio (red),21.81 64 | 65 | - orders_items 66 | orderid,sku,qty,unit_price 67 | 1001,COL0820,1,25.52 68 | 1002,TOY8907,1,12.92 69 | 1002,KIT5813,1,7.99 70 | 71 | The schema of the sqlite database exactly matches the schema above. 72 | 73 | Here is a database puzzle to be solved using the above schema. 74 | 75 | """ 76 | {_} 77 | """ 78 | 79 | Give only a SQLite SELECT query to answer the question. 
@defop('lettertypes', 0, 1.5, outcols='letters digits')
def op_letters(aipl, v:str) -> List[dict]:
    'For each whitespace-separated word in *v*, yield a mapping that counts its ASCII letters and its digits.'
    def _classify(ch):
        # Name of the counter this character belongs to, or None for neither.
        if ch in string.ascii_letters:
            return 'letters'
        if ch in string.digits:
            return 'digits'
        return None

    for token in v.split():
        # A key only appears once a character of that kind has been seen.
        tally = defaultdict(int)
        for kind in map(_classify, token):
            if kind is not None:
                tally[kind] += 1
        yield tally
75 | r = aipl.run_test('!lettertypes', '1abc cd23 de53') 76 | t = r[0].value 77 | assert set(t.colnames) == set(['digits', 'letters']) 78 | assert t[0]['digits'] == 1 and t[0]['letters'] == 3 79 | -------------------------------------------------------------------------------- /about/vision.md: -------------------------------------------------------------------------------- 1 | 2 | # A Toolmaker's Vision 3 | 4 | With a simple framework, a common connection interface, and the right set of components, the work becomes relatively easy: 5 | 6 | - the work being done is only the *essence* of the work to be done 7 | - no unnecessary complexity 8 | - no impedance mismatch between components 9 | 10 | - the work is more than easy--it is delightful 11 | - some of this is just raw "oh thank god yes this is what software should be like" 12 | - some is a surprising depth, an invitation to explore that will often be rewarded 13 | - some is a nostalgic [feeling of the computer](bluebird.sh/feeling) 14 | 15 | - the work is so easy and delightful that it becomes playful. 16 | 17 | Not just can more work get done faster, but a *whole new level* of possibility opens up. 18 | 19 | Like the piano, or the typewriter, or the spreadsheet. 20 | 21 | ## The ladder of computing 22 | 23 | The progression of computational tools goes from calculators, to spreadsheets, to notebooks, to scripts, to programs, to systems. 24 | Each level gives you more power and flexibility, and also needs more mana and a higher skill level to use. 25 | The lower you go, the more it's geared towards an individual user; the higher you go, the more towards users operating as part of a larger organization. 26 | 27 | In the realm of AI, ChatGPT is a calculator: you can run only 1 calculation at a time. 28 | If you have a one-off question for GPT, you can just open the website and type it in, and they handle some niceties for you. 
29 | 30 | But if you keep coming back and pasting in a prompt, or you want to run the same prompt with madlibs or a mail merge or across a range of temperatures, or you have to fetch a page from the web, or you have to split the text up so it fits in the context window...you're going to want to use the API (or maybe another LLM). 31 | 32 | But you have to write code to use the API. If your use case is very simple or prescribed, someone may have written the code such that you can use it as an existing program or service. But for anything requiring even a bit of customization outside of that, you would have to at least use a notebook (which aren't pure text and can be unwieldy), or graduate to a script. 33 | 34 | Python has grown into a huge language, and is no longer at the 'script' level for data processing tasks (though it is easier than doing it in Rust!). Even if the libraries to do what you want already exist, you still need a fair amount of programming experience and skill to make it happen. 35 | 36 | AIPL is intended to be at the script level for data processing and AI. 37 | -------------------------------------------------------------------------------- /aipl/ops/json.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Provides !json which converts Table into json blob, 3 | and !json_parse which converts json blob into row. 4 | ''' 5 | 6 | import json 7 | 8 | from aipl import defop, Table, Column 9 | 10 | 11 | class _jsonEncoder(json.JSONEncoder): 12 | def default(self, obj): 13 | return str(obj) 14 | 15 | 16 | @defop('json', 100, 0) 17 | def op_json(aipl, t:Table, indent:int = None) -> str: 18 | 'Convert Table into a json blob.' 
19 | jsonenc = _jsonEncoder(indent=indent) 20 | return jsonenc.encode(t._asdict()) 21 | 22 | 23 | def _json_find(v:dict|list|int|float|str, args): 24 | if not args: 25 | yield v 26 | elif isinstance(v, (str, int, float)): 27 | yield v 28 | elif isinstance(v, (list, tuple)): 29 | for item in v: 30 | yield from _json_find(item, args) 31 | elif isinstance(v, dict): 32 | for k, item in v.items(): 33 | if args and k != args[0]: 34 | continue 35 | yield from _json_find(item, args[1:]) 36 | else: 37 | raise 'error' 38 | 39 | 40 | class FlatteningDict(dict): 41 | def __init__(self, d:dict): 42 | for k, v in d.items(): 43 | self[k] = v 44 | 45 | def __setitem__(self, k, v): 46 | if isinstance(v, dict): 47 | for newk, newv in v.items(): 48 | self[k+'_'+newk] = newv # should recurse 49 | else: 50 | super().__setitem__(k, v) 51 | 52 | def test_flattening_dict(): 53 | r = FlatteningDict(dict(a=dict(b=1, c=2), d=4, e=dict(f=dict(g=5)))) 54 | assert r == dict(a_b=1, a_c=2, d=4, e_f_g=5) 55 | 56 | def pyobj_to_table(r) -> Table|dict|int|float|str: 57 | if r is None: 58 | return None 59 | elif isinstance(r, (list, tuple)): 60 | keys = set() 61 | ret = Table() 62 | for inobj in r: 63 | outobj = pyobj_to_table(inobj) 64 | assert isinstance(outobj, dict) 65 | ret.rows.append(outobj) 66 | keys |= set(outobj.keys()) 67 | 68 | for k in keys: 69 | ret.add_column(Column(k, k)) 70 | return ret 71 | elif isinstance(r, dict): 72 | # = {'__parent': parent_row} if parent_row is not None else {} 73 | return FlatteningDict({k:pyobj_to_table(v) for k, v in r.items()}) 74 | else: 75 | assert isinstance(r, (str, int, float)), type(r) 76 | return r 77 | 78 | 79 | @defop('json-parse', 0, 1.5) 80 | def op_json_parse(aipl, v:str, **kwargs) -> Table: 81 | 'Convert a json blob into a Table.' 
def _vd_singlestep(aipl, cmd:Command, *inputs:List[LazyRow]):
    '''Show the current input rows in a VisiData sheet before *cmd* runs.

    The sheet's help text is set to display the pending command, and a
    `Q` command is added whose handler raises UserAbort.
    '''
    import visidata

    @visidata.VisiData.api
    def uberquit(vd):
        raise UserAbort('user abort')

    rows = [row._asdict() for row in inputs]
    sheet = visidata.PyobjSheet('current_input', source=rows)
    sheet.help = '{sheet.recentcmd}'

    # Rebuild a readable "!opname args kwargs" rendering of the command.
    positional = ' '.join(map(str, cmd.args))
    named = ' '.join(f'{key}={val}' for key, val in cmd.kwargs.items())
    pieces = [cmd.opname, positional, named]
    sheet.recentcmd = f'[line {cmd.linenum}] !' + ' '.join(pieces)

    sheet.addCommand('Q', 'quit-really', 'uberquit()')
    visidata.vd.run(sheet)
43 | stderr_rich(t, *args, **kwargs) 44 | 45 | def install_rich(aipl, *args): 46 | import rich 47 | AIPL.pre_command = lambda aipl, cmd, t=Table(), *args: stderr_rich(t, cmd) 48 | 49 | 50 | def _rich_table(t:Table, console, console_options): 51 | import rich 52 | import rich.table 53 | 54 | table = rich.table.Table(show_header=True, 55 | row_styles=['', 'bold'], 56 | header_style="bold magenta") 57 | colnames = [] 58 | for col in t.columns: 59 | if col.hidden: 60 | if col is not t.current_col: 61 | continue 62 | colname = col.name 63 | 64 | colnames.append(colname) 65 | table.add_column(colname) 66 | 67 | maxrows = 3 68 | maxstrlen = 280 69 | for i, row in enumerate(t): 70 | if i >= maxrows: 71 | table.add_row('[... %s more rows ...]' % (len(t) - maxrows)) 72 | break 73 | rowdata = [] 74 | for colname in colnames: 75 | cell = row[colname] 76 | if not isinstance(cell, (Table, str)): 77 | cell = str(cell) 78 | if isinstance(cell, str) and len(cell) > maxstrlen: 79 | cell = cell[:maxstrlen] + ' [...]' 80 | rowdata.append(cell) 81 | table.add_row(*rowdata) 82 | return [table] 83 | 84 | 85 | def _rich_command(cmd:Command, console, console_options): 86 | return [str(cmd)] 87 | 88 | 89 | Table.__rich_console__ = _rich_table 90 | Command.__rich_console__ = _rich_command 91 | AIPL.step_rich = install_rich 92 | AIPL.step_vd = _vd_singlestep 93 | -------------------------------------------------------------------------------- /aipl/ops/metrics.py: -------------------------------------------------------------------------------- 1 | from aipl.table import Table 2 | from aipl import defop, LazyRow 3 | import numpy as np 4 | 5 | def _is_int(val): 6 | try: 7 | int(val) 8 | return True 9 | except ValueError: 10 | return False 11 | 12 | def _to_np_int_array(t:Table, colname:str) -> np.array: 13 | column = [int(row[colname]) if _is_int(row[colname]) else np.nan for row in t] 14 | return np.array(column) 15 | 16 | def _true_positives(predictions:np.array, true_values:np.array) -> 
float: 17 | return ((predictions == 1) & (true_values == 1)).sum() 18 | 19 | def _true_negatives(predictions:np.array, true_values:np.array) -> float: 20 | return ((predictions == 0) & (true_values == 0)).sum() 21 | 22 | def _false_positives(predictions:np.array, true_values:np.array) -> float: 23 | return ((predictions == 1) & (true_values == 0)).sum() 24 | 25 | def _false_negatives(predictions:np.array, true_values:np.array) -> float: 26 | return ((predictions == 0) & (true_values == 1)).sum() 27 | 28 | def _recall(predictions:np.array, true_values:np.array) -> float: 29 | N = true_values.shape[0] 30 | return (true_values == predictions).sum() / N 31 | 32 | def _precision(predictions:np.array, true_values:np.array) -> float: 33 | TP = _true_positives(predictions, true_values) 34 | FP = _false_positives(predictions, true_values) 35 | return TP / (TP+FP) 36 | 37 | def _balanced_accuracy(predictions:np.array, true_values:np.array, add_one_smoothing:bool) -> float: 38 | TP = _true_positives(predictions, true_values) 39 | TN = _true_negatives(predictions, true_values) 40 | FP = _false_positives(predictions, true_values) 41 | FN = _false_negatives(predictions, true_values) 42 | if add_one_smoothing: 43 | true_positive_rate = (TP + 1) / (TP + FN + 1) 44 | true_negative_rate = (TN + 1) / (TN + FP + 1) 45 | else: 46 | true_positive_rate = TP / (TP + FN) 47 | true_negative_rate = TN / (TN + FP) 48 | return (true_positive_rate + true_negative_rate) / 2 49 | 50 | @defop('metrics-accuracy', 1.5, 0) 51 | def op_accuracy(aipl, t:Table, predictions_colname:str, true_values_colname:str, add_one_smoothing:bool=None) -> float: 52 | true_values = _to_np_int_array(t, true_values_colname) 53 | predictions = _to_np_int_array(t, predictions_colname) 54 | return _balanced_accuracy(predictions, true_values, add_one_smoothing=='True') 55 | 56 | @defop('metrics-precision', 1.5, 0) 57 | def op_precision(aipl, t:Table, predictions_colname:str, true_values_colname:str, 
class Database:
    '''Small convenience wrapper around one sqlite3 database file.

    The connection is opened lazily on first use of `.con`.  Used as a
    context manager, pending changes are committed on a clean exit.
    '''
    def __init__(self, dbfn):
        self.dbfn = dbfn
        self.tables = {}  # tablename -> { colname -> { .type:str, ... } }

    @cached_property
    def con(self):
        'The sqlite3 connection, created on first access; rows are built by dict_factory.'
        con = sqlite3.connect(self.dbfn)
        con.row_factory = dict_factory
        return con

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        'Commit if the block exited without an exception; never suppress exceptions.'
        if tb is None:
            self.con.commit()
        return False

    def get_table_info(self, tblname:str):
        'Return {colname: column-info row} for *tblname* (cached once found), or {} if the table has no columns/does not exist.'
        if tblname not in self.tables:
            tinfo = self.query(f'PRAGMA table_info("{tblname}")')
            if not tinfo:
                return {}

            self.tables[tblname] = {c['name']:c for c in tinfo}

        return self.tables[tblname]

    def insert(self, tblname, **kwargs):
        'Insert one row given as column=value kwargs, creating the table from the value types if needed; commit and return kwargs.'
        if tblname not in self.tables:
            fieldstr = ', '.join(f'"{k}" {sqlite_type(v)}' for k,v in kwargs.items())
            self.con.execute(f'CREATE TABLE IF NOT EXISTS "{tblname}" ({fieldstr})')

        fieldnames = ','.join(f'"{x}"' for x in kwargs)
        valholders = ','.join(['?']*len(kwargs))
        self.con.execute(f'INSERT INTO "{tblname}" ({fieldnames}) VALUES ({valholders})', tuple(pyobj_to_sqlite(v) for v in kwargs.values()))
        self.con.commit()
        return kwargs

    def table(self, tblname):
        'Return all rows of *tblname* as produced by query().'
        return self.query(f'SELECT * FROM "{tblname}"')

    def select(self, tblname, **kwargs):
        '''Return rows of *tblname* whose columns equal the given kwargs.

        With no kwargs every row is returned.  Values are converted back
        with sqlite_to_pyobj according to each column's declared type.
        Returns [] if the table is unknown.
        '''
        tinfo = self.get_table_info(tblname)
        if not tinfo:
            return []

        qstr = f'SELECT * FROM "{tblname}"'
        if kwargs:
            # Only add WHERE when there is something to filter on; an empty
            # clause is a syntax error that query() would turn into [].
            qstr += ' WHERE ' + ' AND '.join(f'"{k}"=?' for k in kwargs)
        results = self.query(qstr, *tuple(kwargs.values()))

        return [AttrDict((k, sqlite_to_pyobj(v, tinfo[k]['type']))
                    for k, v in row.items()
                ) for row in results]

    def query(self, qstr, *args):
        'Run *qstr* with positional parameters *args* and return all rows; on sqlite3.OperationalError print it to stderr and return [].'
        try:
            cur = self.con.cursor()
            res = cur.execute(qstr, args)
            return res.fetchall()
        except sqlite3.OperationalError as e:
            # Deliberately best-effort: e.g. a missing table reads as "no rows".
            print(e, file=sys.stderr)
            return []

    def sql(self, qstr):
        'Execute *qstr* directly on the connection and return the cursor; errors propagate.'
        return self.con.execute(qstr)
def embedding_openai(aipl, v:str, **kwargs) -> dict:
    '''Get an OpenAI [text embedding](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) for string *v*: a measure of text-relatedness, to be used with e.g. !cluster.

    All keyword arguments are forwarded to `openai.Embedding.create`.
    Returns a dict with the requested `model`, the `used_tokens` count
    reported by the API, and the `embedding` of the first result.
    Raises Exception for empty input and AIPLException when the required
    environment variables are missing.
    '''
    import openai

    if not v:
        raise Exception('no content for embedding')

    required_envvars = ('OPENAI_API_KEY', 'OPENAI_API_ORG')
    if not all(name in os.environ for name in required_envvars):
        raise AIPLException('''OPENAI_API_KEY and OPENAI_API_ORG envvars must be set for !llm''')

    response = openai.Embedding.create(input=v, **kwargs)

    token_count = response['usage']['total_tokens']
    stderr(f'Used {token_count} tokens')

    vector = response['data'][0]['embedding']
    return {
        'model': kwargs.get('model'),
        'used_tokens': token_count,
        'embedding': vector,
    }
import json
import sys
import folium
import itertools

# Marker colours; each paragraph that has locations takes the next one.
colors = itertools.cycle(['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue', 'darkpurple', 'pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray'])

# Load locations data from JSON file
with open('locations.json', 'r') as f:
    data = json.load(f)

# Initialize map centered around (0, 0)
m = folium.Map(location=[0, 0], zoom_start=4)

# Parse and add each location to the map
for item in data:
    for subitem in item["_"]:
        if subitem["_"]:
            paragraph = subitem["paragraph"]
            color = next(colors)
            for loc in subitem["_"]:
                geocoded = loc["geocoded"]
                location_name = loc["location"]
                try:
                    lat, lon = [float(coord) for coord in geocoded.split(',')]
                except (AttributeError, ValueError):
                    # Missing, non-numeric, or not exactly "lat, lon": report and move on.
                    # (sys was previously not imported, so this path raised NameError.)
                    print("skipping", location_name, geocoded, file=sys.stderr)
                    continue
                marker = folium.Marker([lat, lon], icon=folium.Icon(color=color, icon="cloud"))
                marker.add_child(folium.Tooltip(text=location_name, permanent=True))
                marker.add_child(folium.Popup(paragraph, max_width=500))
                marker.add_to(m)

# Save the map to an HTML file
m.save('map.html')
help='sqlite database for caching operators') 19 | parser.add_argument('--no-cache', action='store_const', dest='cachedbfn', const='', help='sqlite database for caching operators') 20 | parser.add_argument('--output-db', '-o', action='store', default='aipl-cache.sqlite', dest='outdbfn', help='sqlite database accessible to !db operators') 21 | parser.add_argument('--split', '--separator', '-s', action='store', default='\n', dest='separator', help='separator to split input on') 22 | parser.add_argument('script_or_global', nargs='*', help='scripts to run, or k=v global parameters') 23 | return parser.parse_args(args) 24 | 25 | 26 | 27 | def main(): 28 | 29 | args = parse_args(None) 30 | global_parameters = {} 31 | scripts = [] 32 | inputs = [] 33 | 34 | for arg in args.script_or_global: 35 | if '=' in arg: 36 | key, value = arg.split('=', maxsplit=1) 37 | global_parameters[key] = value 38 | else: 39 | scripts.append(arg) 40 | 41 | if not scripts: # nothing to run -> REPL 42 | args.interactive = True 43 | 44 | aipl = AIPL(**vars(args)) 45 | 46 | # dup stdin/stdout if necessary 47 | 48 | if not sys.stdin.isatty(): 49 | try: 50 | fin = open('/dev/tty') 51 | aipl.stdin = open(os.dup(0)) 52 | os.dup2(fin.fileno(), 0) 53 | stdin_contents = aipl.stdin.read() 54 | fin.close() 55 | except OSError as e: 56 | aipl.stdin = sys.stdin 57 | stdin_contents = '' 58 | else: 59 | aipl.stdin = sys.stdin 60 | stdin_contents = '' 61 | 62 | if not sys.stdout.isatty(): 63 | try: 64 | fout = open('/dev/tty', mode='w') 65 | aipl.stdout = open(os.dup(1), 'w') # for dumping to stdout from interface 66 | os.dup2(fout.fileno(), 1) 67 | fout.close() # close file descriptors for original stdin/stdout 68 | except OSError as e: 69 | aipl.stdout = sys.stdout 70 | else: 71 | aipl.stdout = sys.stdout 72 | 73 | aipl.globals.update(global_parameters) 74 | 75 | # add input from stdin 76 | input_text = stdin_contents.strip() 77 | 78 | if args.separator: 79 | inputlines = input_text.split(args.separator) 80 | 
else: 81 | inputlines = [input_text] 82 | 83 | inputs.append(aipl.new_input(*inputlines)) 84 | 85 | try: 86 | for fn in scripts: 87 | inputs = aipl.run(open(fn).read(), inputs) 88 | 89 | if not scripts or args.interactive: 90 | repl(aipl, inputs) 91 | except UserAbort as e: 92 | print(f'aborted', e, file=sys.stderr) 93 | sys.exit(2) 94 | except AIPLException as e: 95 | print(e, file=sys.stderr) 96 | sys.exit(1) 97 | finally: 98 | if aipl.cost_usd: 99 | print(f'total cost: ${aipl.cost_usd:.02f}', file=sys.stderr) 100 | -------------------------------------------------------------------------------- /aipl/test_parse.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import textwrap 3 | 4 | from .parser import parse 5 | 6 | def test_single_line(): 7 | assert ops(parse("!one !two !three\n")) == ["one", "two", "three"] 8 | 9 | def test_simple_varname(): 10 | command = parse("!split>output\n") 11 | assert command[0].opname == "split" 12 | assert command[0].kwargs == {} 13 | assert command[0].args == [] 14 | assert command[0].varnames == ["output"] 15 | 16 | def test_varname_afterwards(): 17 | command = parse("!op arg >var") 18 | assert command[0].opname == "op" 19 | assert command[0].args == ["arg"] 20 | assert command[0].varnames == ["var"] 21 | 22 | def test_global(): 23 | command = parse("!op arg >>global_name") 24 | assert command[0].opname == "op" 25 | assert command[0].args == ["arg"] 26 | assert command[0].varnames == [] 27 | assert command[0].globals == ["global_name"] 28 | 29 | def test_split_newlines(): 30 | command = parse("!split sep=\\n\n") 31 | assert command[0].opname == "split" 32 | assert command[0].kwargs == {"sep": "\n"} 33 | 34 | def test_trailing_empty(): 35 | commands = parse("!split\n\n!ravel\n") 36 | 37 | assert ops(commands) == ["split", "ravel"] 38 | 39 | assert commands[0].kwargs == {} 40 | assert commands[1].kwargs == {} 41 | 42 | def test_no_final_newline(): 43 | commands = 
parse("!split") 44 | assert ops(commands) == ["split"] 45 | 46 | 47 | def test_no_final_newline_prompt(): 48 | commands = parse("!split\nsome text") 49 | assert commands[0].opname == "split" 50 | assert commands[0].prompt == "some text" 51 | 52 | 53 | def test_random_spaces(): 54 | commands = parse("!a !b \n c d\n \n d\n e\n") 55 | assert ops(commands) == ["a", "b"] 56 | assert commands[0].args == [] 57 | assert commands[0].kwargs == {} 58 | assert commands[1].prompt == "c d\n\nd\n e" 59 | 60 | 61 | def test_args(): 62 | commands = parse("!fn arg1 arg2 arg3") 63 | assert commands[0].args == ["arg1", "arg2", "arg3"] 64 | 65 | 66 | def test_args_with_kwargs(): 67 | commands = parse("!fn arg1 key=abc arg2 key2=def arg3") 68 | assert commands[0].args == ["arg1", "arg2", "arg3"] 69 | assert commands[0].kwargs == {"key": "abc", "key2": "def"} 70 | 71 | 72 | def test_nested_parse(): 73 | commands = parse(textwrap.dedent(''' 74 | !!def split-join 75 | !split 76 | 77 | !join 78 | 79 | !split-join 80 | ''')) 81 | 82 | assert ops(commands) == ["def", "split_join"] 83 | assert commands[0].prompt == "!split\n\n!join" 84 | 85 | def test_quoted(): 86 | commands = parse(r'!fn "arg1" "\"\n"') 87 | assert commands[0].args == ["arg1", '"\n'] 88 | 89 | def test_single_quoted(): 90 | commands = parse(r"!fn 'arg1' '\'\n'") 91 | assert commands[0].args == ["arg1", "'\n"] 92 | 93 | def test_numbers(): 94 | commands = parse("!fn 1 2.0 3.0e10 -3 -2e-7") 95 | assert commands[0].args == [1, 2.0, 3.0e10, -3, -2e-7] 96 | 97 | def test_input_cols(): 98 | commands = parse("!split >`, e.g. `!join>>foo`. It can be referred by name anywhere in the latter part of the script. 22 | 23 | A single `>`, e.g. `>bar`, is used to assign a name to the bottommost column(s) of scalars, which are then available for formatting as `{bar}` in arguments and elsewhere. 24 | 25 | ## Tacit dataflow 26 | 27 | This completely tacit dataflow is great for unary (and nonary) operators. 
 28 | 29 | For binary operators, the second or "right" operand can be passed as a special argument, e.g. `!cross <>`. 30 | 31 | Alternatively, the text on the lines below the operator, commonly called "the prompt", will be passed as the second operand. For unary operators, if there is any non-whitespace text in the prompt, the prompt will override the default input and be passed as the first operand instead. The result of this operator becomes the input, so the previous result must be named or it will be lost forever!! 32 | 33 | A lone `<` as an argument signifies that everything until the end of the line is taken to be the prompt. 34 | 35 | If `<` is at the end of a !command line, then a prompt is expected, and the REPL will read text until EOF. 36 | (In non-REPL mode, `<\n!` would force the input operand to be an empty string.) 37 | 38 | ## Tacit looping 39 | 40 | The input may have as many as 99 dimensions, but operand(s) can only have 0 or 1 dimensions (until actual matrix operations are implemented, but then the limit would be 2). 41 | 42 | Each operator must specify the dimensionality of its operands (using defop kwargs `rankin` and `rankin2`). 43 | 44 | ### Unary operators 45 | When a unary operator is applied to an input with higher dimensionality, the operator will be applied recursively to each of the input's values. 46 | The result will have the same "outer" structure as the input, while the "innermost" values will be the output values of the operator. 47 | 48 | Each row containing the input value with the lowest dimensionality will be augmented with the output of the operator applied to it. 49 | 50 | A column will be added to the tables containing those rows, such that the row and table values will now be these most recent results. 51 | 52 | ### Binary operators 53 | 54 | The left operand is the operator's "prompt" in the script or the current "input". The right operand must be a scalar or a toplevel output (for now).
 55 | If needed, looping over the right operand must be done manually by the operator. 56 | 57 | ## Rows and columns 58 | 59 | All scalars and vectors are actually projections of "rows" and "tables", respectively. The "value" of a row is a (boxed) scalar or another table. A "simple row" has a scalar value. The value of a table is a vector of the values of its rows. A simple table has a value that is a vector of scalars. 60 | 61 | A simple row is like `0-plus` or `0.5` dimension. A simple table is like `1-plus` or `1.5` dimension. 62 | 63 | An opaque row can have other potential data besides its value. 64 | A column knows how to get a particular projection of data from an opaque row. 65 | 66 | A table is a list of opaque rows and a list of columns. 67 | The table can generate a list of virtual rows, one for each opaque row, which appear as mappings from column names to values. 68 | 69 | A row can be part of many tables. 70 | Each column is on only one table, but a specific opaque row can be part of many tables. 71 | 72 | The opaque row contains all the data, so both columns and tables can be lightweight objects. 73 | If rows are augmented but never need to be copied, then generating both column- and row-wise subsets of tables is a lightweight process. 74 | 75 | ## Tacit context of previous results 76 | 77 | [TBW] 78 | -------------------------------------------------------------------------------- /about/23-faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Q: why would I not just write some code? 4 | 5 | First of all, AIPL is code. In fact it's entirely based on Python, and allows you to start writing code by using the `!python` operator. 6 | However, there are reasons why you might not want to just write Python code in the first place. 7 | 8 | The biggest being: you shouldn't over-engineer prototypes.
9 | 10 | Yes, if you were going to put some AI workflow into production, for "real" users, you would probably want to write some "real" code! 11 | You might need to consider factors like implementing real-time responses, handling large data volumes, or incorporating custom code within a loop. But that's going to cost real time, real money, and real skill. 12 | 13 | Before reaching that stage, you need to know how your idea can be done with AI in its current form. 14 | You may need to explore which of the available models might be better or cheaper, figure out how exactly you would have to organize the pipeline so that you can get the results you need, and engineer the literal prompts themselves. 15 | You might even have to scrap the idea altogether if you can't get GPT (or whatever LLM) to respond accurately--and if that's the case, you want to find that out quickly, before investing any real resources. 16 | 17 | You want something quick-and-dirty to experiment with. You want to be able to whip up a prototype in a couple hours. 18 | 19 | But you need something bigger than prompting directly to ChatGPT within the browser. It's fine for testing one thing, and you can do the pre- and post-processing yourself by hand. For anything greater than N=1, though, you're already wanting something more reproducible. 20 | 21 | For instance, here's a script to summarize any number of webpages: https://github.com/saulpw/aipl/blob/develop/examples/summarize.aipl 22 | 23 | To do this in Python would involve being explicit about iteration, caching, error-handling, and the result would be a more unwieldy script, with the requisite quoting and/or escaping, code out-of-order and perhaps code scattered across multiple files, even the boilerplate--these things add friction for someone who knows Python, and make it impossible for a non-coder. 24 | 25 | At the tiny sets (N=10 or N=100) we use to validate our ideas, we want our focus to be on the experiments themselves as much as possible. 
 26 | 27 | There's a progression of computational tools: from calculators, to spreadsheets, to notebooks, to scripts, to programs, to systems. Each level gives you more power and flexibility, but requires more attention and skill. 28 | 29 | In this context, ChatGPT is only a calculator, while Python is used to create programs and systems. Python notebooks are useful but have their quirks, and don't scale well without explicit adjustments. 30 | 31 | AIPL fills the gap between notebooks and programs, functioning as a platform for scripts. Scripts are less flexible than full programs but are easier to write, modify, and maintain. They are self-contained in a single text file, making them easy to share and understand. AIPL scripts provide a clear flow of operations and include required content inline. 32 | 33 | It's like asking, why would I write a bash script, why would I not just write some code? And sure! You might want to do that. But maybe you start with `cut | sort | uniq | sort -n` and see if that gets the job done in a fraction of the time. 34 | 35 | ## Q: What about the name "AIPL"? 36 | 37 | "AIPL" pays homage to APL, the original array language by Ken Iverson in the 1960s, which inspired some [core design decisions in AIPL](23-design.md). 38 | Though AIPL is a generic pipeline language and not at all limited to AI, its first use case was for LLM experiments, and some features (like inline prompts) are particularly useful in the LLM world. 39 | Also when GPT-4 suggested "Array-Inspired Pipeline Language" (I fed it the README and asked for a list of 5 backronyms), it pretty much nailed it, and that sealed the deal. 40 | 41 | ## Q: What's the basic concept of AIPL? 42 | 43 | AIPL is "just" a thin layer over Python, offering various operators for data processing and calling LLMs. New operators are regularly added. Users can even create their own commands using a function decorator (@defop(...)).
44 | 45 | The role of AIPL is to execute these Python code snippets consistently across all data in predictable ways. It handles input, output, caching, logging, error handling, and the parallelism. And it turns out that infrastructure is 90% of the work of building a pipeline. 46 | 47 | Users are then able to focus on the literal essence of their work: finding the data, arranging the appropriate operators in the correct order with appropriate parameters and prompts. We'll take care of the rest. 48 | 49 | AIPL can be used for prototypes up to about a million rows, and the resulting .aipl script is the ultimate reference of the 'secret sauce'. 50 | 51 | Then if/when the need arises for scaling or for a real-time usage pattern, converting the AIPL prototype into a "real" pipeline requires less effort than it would have taken to develop the initial implementation. It is a low-risk process, especially if users write their own runner in Python using another library. [porting cookbooks for $] 52 | -------------------------------------------------------------------------------- /aipl/parser.py: -------------------------------------------------------------------------------- 1 | from typing import List, Callable 2 | import textwrap 3 | import sys 4 | from dataclasses import dataclass 5 | import ast 6 | from collections import defaultdict 7 | from lark import Lark, Transformer, Discard, Token 8 | 9 | aipl_grammar = Lark(r''' 10 | start: line* 11 | 12 | ws: [ _WS ] 13 | _WS: /[ \t]+/ 14 | 15 | line: commands prompt | _EMPTY_LINE 16 | 17 | commands: (command)+ 18 | command: command_sign OPNAME arg_list ws 19 | 20 | OPNAME: IDENTIFIER 21 | 22 | ?command_sign: /!!?/ 23 | 24 | _EMPTY_LINE: "\n" 25 | 26 | varname: ">" IDENTIFIER 27 | globalname: ">>" IDENTIFIER 28 | input_col: "<" IDENTIFIER 29 | input_table: "<<" IDENTIFIER 30 | 31 | arg_list: arg* 32 | 33 | arg: ws (KEY "=" literal | literal | varname | globalname | input_col | input_table) 34 | 35 | ?literal: BARE_STRING | 
ESCAPED_STRING 36 | BARE_STRING: /[^ \t\n!"'><]\S*/ 37 | 38 | ESCAPED_STRING: /(["']).*?(? {self.opname} (line {self.linenum-1})' # first line is implicit !!python 68 | 69 | 70 | class ToAst(Transformer): 71 | def line(self, tree): 72 | if len(tree) == 0: 73 | return tree 74 | (commands, prompt) = tree 75 | if prompt: 76 | commands[-1].prompt = prompt 77 | return commands 78 | 79 | def commands(self, tree): 80 | return list(tree) 81 | 82 | def start(self, tree): 83 | output = [] 84 | for line in tree: 85 | output.extend(line) 86 | return output 87 | 88 | def command(self, tree): 89 | command_sign, opname, arguments = tree 90 | 91 | return Command( 92 | opname=opname, 93 | op=None, 94 | linenum=command_sign.line, 95 | immediate=command_sign.value == '!!', 96 | varnames=arguments['varnames'], 97 | globals=arguments['globalnames'], 98 | input_cols=arguments['input_cols'], 99 | input_tables=arguments['input_tables'], 100 | args=arguments['args'], 101 | kwargs=dict(arguments['kwargs']), 102 | prompt=None, 103 | ) 104 | 105 | def OPNAME(self, token): 106 | return clean_to_id(token.value) 107 | 108 | def command_prompt(self, tree): 109 | command, prompt = tree 110 | if prompt: 111 | command.kwargs['prompt'] = prompt 112 | return command 113 | 114 | def arg_list(self, arg_list): 115 | arguments = defaultdict(list) 116 | 117 | for key, arg in arg_list: 118 | arguments[key].append(arg) 119 | 120 | return arguments 121 | 122 | def varname(self, tree): 123 | return ('varnames', tree[0]) 124 | 125 | def globalname(self, tree): 126 | return ('globalnames', tree[0]) 127 | 128 | def input_table(self, tree): 129 | return ("input_tables", tree[0]) 130 | 131 | def input_col(self, tree): 132 | return ("input_cols", tree[0]) 133 | 134 | def arg(self, tree): 135 | if isinstance(tree[0], tuple): 136 | return tree[0] 137 | 138 | if isinstance(tree[0], Token) and tree[0].type == 'KEY': 139 | return ('kwargs', (clean_to_id(tree[0].value), tree[1])) 140 | 141 | return ('args', tree[0]) 142 | 
143 | def prompt(self, lines): 144 | prompt = textwrap.dedent(''.join(token.value for token in lines)).strip() 145 | if not prompt: 146 | return None 147 | return prompt 148 | 149 | def ws(self, tree): 150 | return Discard 151 | 152 | def BARE_STRING(self, token): 153 | return trynum(token.value) 154 | 155 | def IDENTIFIER(self, token): 156 | return token.value 157 | 158 | def ESCAPED_STRING(self, token): 159 | return ast.literal_eval(token.value) 160 | 161 | def parse(program_text): 162 | parse_tree = aipl_grammar.parse(program_text + "\n") 163 | return ToAst().transform(parse_tree) 164 | 165 | 166 | def trynum(x:str) -> int|float|str: 167 | try: 168 | return int(x) 169 | except Exception: 170 | try: 171 | return float(x) 172 | except Exception: 173 | return x.replace('\\n', '\n').replace('\\t', '\t').replace('\\\\', '\\') 174 | 175 | 176 | def clean_to_id(s:str) -> str: 177 | return s.replace('-', '_').strip('!') 178 | 179 | 180 | if __name__ == '__main__': 181 | for file in sys.argv[1:]: 182 | print("Parsing: ", file) 183 | # prepend `!!python` to the input to correctly handle any leading python code 184 | # see also: AIPL.run() method in interpreter.py 185 | parse_tree = aipl_grammar.parse('!!python\n' + open(file).read()) 186 | print("Parse tree: ", parse_tree.pretty()) 187 | for command in ToAst().transform(parse_tree): 188 | print(command) 189 | -------------------------------------------------------------------------------- /aipl/clients.py: -------------------------------------------------------------------------------- 1 | from aipl import defop, expensive, stderr, AIPLException 2 | import openai 3 | import os 4 | 5 | # from the horse's mouth, 2023-05-30 6 | openai_pricing = { 7 | "gpt-4": 0.06, 8 | "gpt-4-32k": 0.12, 9 | "gpt-3.5-turbo": 0.002, 10 | "gpt-3.5-turbo-16k": 0.002, 11 | "text-ada-001": 0.0016, 12 | "text-babbage-001": 0.0024, 13 | "text-curie-001": 0.0120, 14 | "text-davinci-003": 0.1200 15 | } 16 | 17 | # base price covers the first 25 
tokens, then it's the per-token price (2023-06-06) 18 | gooseai_models = { 19 | "fairseq-13b": { 20 | "pricing": { 21 | "base": 0.001250, 22 | "token": 0.000036 23 | }, 24 | "encoding": "" 25 | }, 26 | "gpt-neo-20b": { 27 | "pricing": { 28 | "base": 0.002650, 29 | "token": 0.000063 30 | }, 31 | "encoding": "gpt2" 32 | } 33 | } 34 | 35 | def _parse_msg(s:str): 36 | if s.startswith('@@@s'): 37 | return dict(role='system', content=s) 38 | elif s.startswith('@@@a'): 39 | return dict(role='assistant', content=s) 40 | else: # if s.startswith('@@@u'): 41 | return dict(role='user', content=s) 42 | 43 | 44 | def count_tokens(s:str, model:str=''): 45 | try: 46 | import tiktoken 47 | enc = tiktoken.encoding_for_model(model) 48 | return len(enc.encode(s)) 49 | except ModuleNotFoundError as e: 50 | # stderr(str(e)) 51 | return len(s)//4 52 | except KeyError as e: 53 | # just estimate 54 | return len(s)//4 55 | 56 | 57 | class StandardClient: 58 | def compute_cost(self, aipl, resp, model): 59 | if self.client_type == 'openai': 60 | used = resp['usage']['total_tokens'] 61 | result = resp['choices'][0]['message']['content'] 62 | cost = openai_pricing[model]*used/1000 63 | if aipl: # makes it easier to run unit tests 64 | aipl.cost_usd += cost 65 | 66 | stderr(f'Used {used} tokens (estimate {len(result)//4} tokens). Cost: ${cost:.03f}') 67 | elif self.client_type == 'selfhosted': 68 | stderr('Used TODO tokens. Cost: $¯\\_(ツ)_/¯') 69 | 70 | def completion(self, aipl, v:str, **kwargs) -> str: 71 | 'Send chat messages to GPT. Lines beginning with @@@s or @@@a are sent as system or assistant messages respectively (default user). Passes all [named args](https://platform.openai.com/docs/guides/chat/introduction) directly to API.' 
72 | model = kwargs.get('model') or self.default_model 73 | temperature = kwargs.get('temperature') or 0 74 | params = dict( 75 | temperature=float(temperature), 76 | top_p=1, 77 | frequency_penalty=0, 78 | presence_penalty=0, 79 | model=model 80 | ) 81 | params.update(kwargs) 82 | 83 | # TODO: there must be a less hacky way of doing these 84 | params['temperature'] = float(params['temperature']) 85 | if 'client' in params: 86 | del params['client'] 87 | 88 | role = 'user' 89 | def _get_role_msg(s:str): 90 | if s.startswith('@@@s'): 91 | return 'system', s[4:] 92 | elif s.startswith('@@@a'): 93 | return 'assistant', s[4:] 94 | elif s.startswith('@@@u'): 95 | return 'user', s[4:] 96 | else: 97 | return role, s 98 | 99 | msgs = [] 100 | for m in v.splitlines(): 101 | role, msg = _get_role_msg(m) 102 | if msgs and msgs[-1]['role'] == role: 103 | msgs[-1]['content'] += '\n' + msg 104 | else: 105 | msgs.append(dict(role=role, content=msg)) 106 | 107 | resp = openai.ChatCompletion.create( 108 | messages=msgs, 109 | **params 110 | ) 111 | try: 112 | result = resp['choices'][0]['message']['content'] 113 | except: 114 | raise AIPLException(resp) 115 | self.compute_cost(aipl, resp, model) 116 | 117 | return result 118 | 119 | 120 | class GooseClient(StandardClient): 121 | def __init__(self): 122 | if 'GOOSE_AI_KEY' not in os.environ: 123 | raise AIPLException(f'''GOOSE_AI_KEY envvar must be set to use gooseai client type''') 124 | self.client_type = 'gooseai' 125 | self.default_model = 'gpt-neo-20b' 126 | openai.api_key = os.environ['GOOSE_AI_KEY'] 127 | openai.api_base = "https://api.goose.ai/v1" 128 | 129 | def completion(self, aipl, v, **kwargs): 130 | import requests 131 | 132 | model = kwargs.get('model') or self.default_model 133 | if 'GOOSE_AI_KEY' not in os.environ: 134 | raise AIPLException(f'''GOOSE_AI_KEY envvar must be set for !llm to use {model}''') 135 | headers = { 136 | 'Content-Type': 'application/json', 137 | 'Authorization': f'Bearer 
{os.environ["GOOSE_AI_KEY"]}' 138 | } 139 | params = dict( 140 | temperature=0 141 | ) 142 | params.update(**kwargs) 143 | # TODO: GooseAI supports multiple prompt completions in parallel 144 | data = {'prompt': v, **params} 145 | r = requests.post(f'https://api.goose.ai/v1/engines/{model}/completions', headers=headers, json=data) 146 | j = r.json() 147 | if 'error' in j: 148 | raise AIPLException(f'''GooseAI returned an error: {j["error"]}''') 149 | 150 | response = j['choices'][0]['text'] 151 | # Only output tokens are charged 152 | used = count_tokens(response, gooseai_models[model]['encoding']) 153 | # GooseAI's base cost provides the first 25 tokens, then each token after is charged at the token rate 154 | cost = gooseai_models[model]['pricing']['token'] * max(0, used-25) + gooseai_models[model]['pricing']['base'] 155 | if aipl: 156 | aipl.cost_usd += cost 157 | stderr(f'Used {used} tokens (estimate {len(v)//4} tokens). Cost: ${cost:.03f}') 158 | return response 159 | 160 | 161 | class OpenAIClient(StandardClient): 162 | def __init__(self): 163 | if 'OPENAI_API_KEY' not in os.environ or 'OPENAI_API_ORG' not in os.environ: 164 | raise AIPLException('''OPENAI_API_KEY and OPENAI_API_ORG envvars must be set for openai client type''') 165 | self.client_type = 'openai' 166 | self.default_model = 'gpt-3.5-turbo' 167 | 168 | 169 | class SelfHostedChatClient(StandardClient): 170 | def __init__(self): 171 | if 'LLM_CLIENT_ENDPOINT' not in os.environ: 172 | raise AIPLException('''LLM_CLIENT_ENDPOINT envvar must be set for selfhosted client type''') 173 | openai.api_base = os.environ['LLM_CLIENT_ENDPOINT'] 174 | self.client_type = 'selfhosted' 175 | if 'DEFAULT_SELFHOSTED_MODEL' in os.environ: 176 | self.default_model = os.environ['DEFAULT_SELFHOSTED_MODEL'] 177 | 178 | 179 | if __name__ == "__main__": 180 | max_tokens = 10 181 | prompt = '''A lesser-known robot character from sci-fi is''' 182 | 183 | print('openai\n', prompt, OpenAIClient().completion(None, prompt, 
max_tokens=max_tokens)) 184 | print('gooseai\n', prompt, GooseClient().completion(None, prompt, max_tokens=max_tokens)) 185 | print('selfhosted\n', prompt, SelfHostedChatClient().completion(None, prompt, max_tokens=max_tokens)) 186 | -------------------------------------------------------------------------------- /about/23-design.md: -------------------------------------------------------------------------------- 1 | 2 | # Design 3 | 4 | AIPL is intended as a simple platform for quick proof of concept AI-based data pipelines to be implemented and tested. 5 | 6 | ## Why? 7 | 8 | The recent developments in LLMs and AI are a whole new level of capabilities (and costs). 9 | I wanted to see what all the fuss was about, so I tried to do some basic things with [langchain](https://github.com/hwchase17/langchain) but it was this big complicated system. 10 | So instead I implemented some small workflows myself with raw Python, and it turned out that AI is actually pretty straightforward. 11 | This is a small hackable platform that makes it easy to experiment and get small-scale results. 12 | It's called AIPL. 13 | 14 | ### Emphasize the Dataflow 15 | 16 | An AIPL script represents the essence of a data pipeline, with only the high-level operations and their parameters and prompts. 17 | No boilerplate or quoting or complicated syntax. 18 | Not even much of a language--just commands executed in order. 19 | This keeps the focus on data flow and the high-level operations--the actual links in the chain. 20 | It can be expanded or optimized or parallelized as needed. 21 | 22 | ### Very Little Overhead 23 | 24 | AIPL is array-oriented and concatenative, drawing inspiration from APL and Forth, both of which have powerful operators and very simple syntax. 25 | Passing data implicitly between operators allows for efficient representation of data flows, and avoids [one of the hardest problems in computer science](https://www.namingthings.co/). 
 26 | And the implicit looping of array languages makes it easier to scale interactivity. 27 | 28 | ### Take Advantage of Python Ecosystem 29 | 30 | AIPL is also intended to be practical (if only at small scale), so operators are easy to write using the existing cadre of Python libraries, and allow options and parameters to be passed to them verbatim. 31 | 32 | ### Keep It Simple 33 | 34 | The implementation is intentionally homespun, to remove layers of abstraction and reduce the friction of setup and operation. 35 | It doesn't parallelize anything yet but it still should be able to handle hundreds of items even as it is, enough to prove a concept. 36 | I expect it to be straightforward to scale it to mag 5 (up to a million items) if something takes off. 37 | 38 | ### Learn and Explore 39 | 40 | At the very least, AIPL should be a useful tool to learn, explore, and prototype small-scale data pipelines that have expensive operations like API calls and LLM generation. 41 | 42 | ## What is "implicit looping"? 43 | 44 | It's a concept borrowed from APL. 45 | 46 | Yes, APL, that language from the 60s that looks like this: 47 | 48 | ``` 49 | avg ← +⌿ ÷ ≢ 50 | ``` 51 | 52 | Now before you run away screaming, there are 3 big ideas in APL, which are why Iverson won the Turing Award in 1979: 53 | 54 | 1. implicit looping and tacit programming 55 | 56 | - brilliant, removes a large amount of code. beyond just the loops too 57 | 58 | 2. symbols 59 | 60 | APL uses a special set of non-text symbols, a custom alphabet that nearly predates ASCII itself. 61 | This is why it looks like alien gibberish to the uninitiated, and why APL has all but died out. 62 | [Iverson's paper and talk for the Turing Award are entitled [Notation as a Tool of Thought](https://dl.acm.org/doi/pdf/10.1145/358896.358899), 63 | so "notation" is ironically the focus *and* the fallacy of APL.] 64 | 65 | The symbology is math-based (as APL is a language for teaching and doing linear algebra), and is elegantly designed.
but the idea is unfortunately a non-starter for modern adoption. 66 | 67 | People think in *words* or word-like chunks, and while letters of the alphabet make up the words, they are only symbols, and they carry zero semantic content. 68 | Learning a new alphabet (and one with combinatoric semantics) is a huge barrier to learning a new language. 69 | 70 | Now, I agree with Iverson's fundamental premise, that a sub-verbal facility with a core set of operations, is a tremendous tool for thought. 71 | But the actual terse and non-verbal notation ultimately prevented APL from being in common use. 72 | 73 | 3. vocabulary 74 | 75 | APL defines an elegant core set of operators that are just the right level of abstraction for math, particularly linear algebra. 76 | This is why APL-like languages are still used in the finance world: you can get a lot done quickly, and with very little code, when you know the domain and the operators are at the right level of abstraction and you can fit them in your head. 77 | 78 | The real art of software is in developing a set of legos that are easily explainable and interoperate well together, conforming to an intuitive yet precisely-defined connection spec. 79 | When done well, these legos are composable without anything else necessary to bind one's input to the other's output. Then tacit programming becomes not just possible but desired. 80 | 81 | --- 82 | 83 | So AIPL borrows implicit looping and tacit programming from APL, and lets go of its alien symbology. 84 | AIPL also borrows some of APL's vocabulary, but since data pipelines are a much different domain than math (and much more has been developed in the data domain over the past 50 years), we need to develop a different set of operators. 85 | 86 | So AIPL is also a *vocabulary discovery platform*. 87 | It is easy to add new operations in Python. 88 | AIPL is really just a skin over Python; a notebook in script form. 
89 | 90 | ## For "port-able" prototypes 91 | 92 | Once you have the operators in the right order and with the right parameters, it's "just" engineering to call them at the right time, with the right batch size, in the right format, caching the results, etc. 93 | You have a "script", like for a movie. 94 | 95 | You still have to do all the engineering; you have to put it into production. 96 | But with the script, you know what's required, and you can start to plan out the process of development. 97 | 98 | You can *port* it. 99 | 100 | Don't over-engineer your experiments and your prototypes. 101 | Just put legos together in a logical order and see how the whole chain works. 102 | Tune, iterate, and discover quickly if your idea is viable or not. 103 | 104 | # The data table 105 | 106 | Operators take 0, 1, or 2 "operands with dimensionality", and any number of scalar (int/float/str) parameters. 107 | 108 | These "operands with dimensionality" are used like arrays in traditional array languages. 109 | However, those have multidimensional arrays of scalars, whereas AIPL operands are more like nestable database tables. 110 | 111 | These tables have: 112 | 113 | - a list of "rows" 114 | - named columns that can be reordered and removed without updating each row 115 | - homogenous types within a column (possibly NULL) 116 | - heterogenous types within a row 117 | 118 | Every operator consumes 0, 1, or 2 operands and produces exactly 1 operand. 119 | (Some operators have only side-effects and don't actually do anything to the data; these take 1 operand and return the same.) 120 | 121 | The simplest operator implementation takes 0/1/2 tables and returns a table. 122 | The return table may be one of the unmodified input operands, otherwise it must be a new table. 123 | The rows may be referenced for efficiency. 
# Sentinel object; Table.add_column() asserts that a new column does not
# yield it for the first existing row.
# NOTE(review): presumably marks a value still being computed -- confirm with callers.
UNWORKING = object()

# Name under which the current (rightmost) column can always be addressed.
CURRENT_COLNAME = '_'


class Row(dict):
    'Plain dict subclass used as the backing store of one table row.'
    pass


class Column:
    'Named accessor that extracts one value from a row mapping.'
    def __init__(self, key, name=''):
        # *key* is either a single row key or a list/tuple path of keys
        # (see get_value); *name* defaults to the key itself.
        self.name = name or key
        self.key = key

    @property
    def hidden(self) -> bool:
        'True if the column name starts with an underscore.'
        return self.name.startswith('_')

    def get_value(self, row:Row):
        'Return the value of this column in *row*, or None if it is absent.'
        if isinstance(self.key, (list, tuple)):
            # Walk the key path through nested mappings; stop at the first gap.
            obj = row
            for k in self.key:
                obj = obj.get(k)
                if obj is None:
                    return None
            return obj

        return row.get(self.key)

    def __str__(self):
        return f'[Column {self.name}]'

    def __repr__(self):
        return f"<Column {self.name}>"

    def deepname(self, table):
        'Return the column name, suffixed with ":<nested column names>" if the first row of *table* holds a Table in this column.'
        if table.rows:
            r = self.get_value(table.rows[0])
            if isinstance(r, Table):
                return f'{self.name}:{r.deepcolnames}'

        return self.name or self.key


class SubColumn(Column):
    'Use for tables that have nested rows from other tables in the row dict at *self.key*'
    def __init__(self, key, origcol):
        super().__init__(key, origcol.name)
        self.origcol = origcol

    def get_value(self, row:dict):
        'Fetch the nested row stored at *self.key* and delegate to the original column.'
        return self.origcol.get_value(row[self.key])


class LazyRow(Mapping):
    'Read-only view of one raw row, interpreted through the columns of its table.'
    def __init__(self, table:'Table', row:Row):
        self._row = row
        self._table = table

    def __copy__(self):
        # A new view over the same table and the same underlying row object.
        return LazyRow(self._table, self._row)

    def __len__(self):
        'Number of columns in the owning table.'
        return len(self._table.columns)

    def __iter__(self):
        # Iterating a row means iterating the nested Table held in its
        # current column -- not its keys, unlike a regular Mapping.
        assert isinstance(self.value, Table), f"can't iterate {type(self.value).__name__}"
        return iter(self.value)

    def __getitem__(self, k):
        'Look up column *k* in this row, falling back to parent rows; raise KeyError if no table in the chain has it.'
        obj = self
        while True:
            c = obj._table.get_column(k)
            if c:
                return c.get_value(obj._row)

            obj = obj.parent_row

            if obj is None:
                raise KeyError(k)

    @property
    def value(self):
        'Value of this row in the rightmost column of its table.'
        return self._table.columns[-1].get_value(self._row)

    def items(self):
        return self._asdict().items()

    def keys(self):
        return self._asdict().keys()

    def _asdict(self, named_only=False):
        'if named_only=False, add current_col as "{CURRENT_COLNAME}" if it is hidden. otherwise ignore it too'
        d = {}

        for c in self._table.columns:
            if c.hidden:
                # Hidden columns are skipped, except the current column,
                # which is exposed under CURRENT_COLNAME unless named_only.
                if named_only or c is not self._table.current_col:
                    continue

                k = CURRENT_COLNAME
            else:
                k = c.name

            v = c.get_value(self._row)

            if v is None:
                continue
            elif isinstance(v, Table):
                if v.rank == 0:
                    v = v.scalar
                else:
                    v = [r._asdict() for r in v]
            elif not isinstance(v, (int, float, str)):
                v = str(v)

            # Delete first so a re-assigned key moves to the end of the dict.
            if k in d:
                del d[k]
            d[k] = v

        return d

    @property
    def parent_row(self) -> 'LazyRow':
        "The object stored under '__parent' in the raw row, or None."
        return self._row.get('__parent', None)

    def __repr__(self):
        return f"<LazyRow {self._asdict()}>"


class Table:
    'List of raw rows plus the Columns used to read them; alternatively a wrapper around a single scalar.'
    def __init__(self, rows:'List[Mapping|LazyRow]|None'=None, parent:'Table|None'=None):
        '''Build a table from *rows*.

        A list or tuple is appended row by row; any other value is stored as
        the table's scalar.  Omitting *rows* gives an empty table.
        '''
        self.rows = []  # list of dict
        self.columns = []  # list of Column
        self.parent = parent
        self.scalar = None

        if rows is None:  # None instead of a shared mutable [] default
            rows = []

        if isinstance(rows, (list, tuple)):  # should be sequence-but-not-string
            for row in rows:
                self.append(row)
        else:
            self.scalar = rows

    def __len__(self):
        return len(self.rows)

    def __bool__(self):
        return len(self.rows) > 0

    def __copy__(self) -> 'Table':
        'Returns structural copy of table with all columns and no rows.'
        ret = Table()

        for c in self.columns:
            ret.add_column(copy(c))

        ret.rows = []
        ret.scalar = self.scalar
        return ret

    def axis(self, rank:int=0):
        'Descend through the first row\'s rightmost value while this table\'s rank exceeds *rank*; otherwise return self.'
        # NOTE(review): assumes the first row's rightmost value is itself a
        # Table whenever self.rank > rank -- confirm against callers.
        if self.rank > rank:
            firstrowval = self.columns[-1].get_value(self.rows[0])
            return firstrowval.axis(rank)

        return self

    @property
    def values(self):
        'List of row values (rightmost column), or [scalar] for a scalar table.'
        if self.scalar is not None:
            return [self.scalar]
        return [r.value for r in self]

    @property
    def shape(self) -> List[int]:
        'Row counts per nesting level, judged from the first row only; [] for a scalar table.'
        if self.scalar is not None:
            return []
        dims = [len(self.rows)]
        if self.rows:
            if self.columns:
                firstrowval = self.current_col.get_value(self.rows[0])
                if isinstance(firstrowval, Table):
                    dims += firstrowval.shape
        return dims

    @property
    def rank(self) -> int:
        'Number of dimensions, i.e. len(shape).'
        return len(self.shape)

    @property
    def colnames(self):
        return [c.name for c in self.columns]

    @property
    def colkeys(self):
        return [c.key for c in self.columns]

    @property
    def current_col(self) -> Column:
        'The rightmost column.'
        return self.columns[-1]

    @property
    def deepcolnames(self) -> str:
        'Comma-separated deep names of the visible columns plus the current column, or "no cols".'
        return ','.join(f'{c.deepname(self)}' for c in self.columns if not c.hidden or c is self.current_col) or "no cols"

    def __getitem__(self, k:int) -> LazyRow:
        if k >= len(self.rows):
            raise IndexError('table index out of range')
        return LazyRow(self, self.rows[k])

    def _asdict(self):
        'Return the scalar, or a list with one dict per row.'
        if self.scalar is not None:
            return self.scalar
        return [r._asdict() for r in self]

    def __repr__(self):
        if self.scalar is not None:
            return str(self.scalar)

        shapestr = 'x'.join(map(str, self.shape))
        contentstr = ''
        if self.rows:
            contentstr += strify(self[0], maxlen=20)
        if len(self.rows) > 1:
            contentstr += ' ...'
        # Show shape, column names and a preview of the first row.
        return f'<Table [{shapestr} {self.deepcolnames}] {contentstr}>'

    def __iter__(self):
        'Yield the scalar once for a scalar table, else a LazyRow per row.'
        if self.scalar is not None:
            yield self.scalar
        else:
            for r in self.rows:
                yield LazyRow(self, r)

    def add_new_columns(self, row:Row):
        "Add a Column for every key of *row* that does not start with '__'."
        for k in row.keys():
            if not k.startswith('__'):
                self.add_column(Column(k))

    def add_column(self, col:Column):
        'Register *col* unless a column of that name already exists.'
        assert not col.name.startswith('__')
        if self.rows:
            assert col.get_value(self.rows[0]) is not UNWORKING
        if col.name in self.colnames:
            return

        # Columns named '_cost...' go first so they never become the
        # current (rightmost) column.
        if col.name.startswith('_cost'):
            self.columns.insert(0, col)
        else:
            self.columns.append(col)

    def get_column(self, name:str) -> Column:
        'Return the column called *name* (CURRENT_COLNAME means the rightmost one), or None.'
        if name == CURRENT_COLNAME:
            return self.columns[-1]

        for c in self.columns:
            if c.name == name:
                return c

        return None

    def append(self, row:dict):
        'Append *row*: a LazyRow contributes only its raw row; a Mapping also adds its keys as columns.  Raise TypeError otherwise.'
        if isinstance(row, LazyRow):
            self.rows.append(row._row)
        elif isinstance(row, Mapping):
            self.rows.append(row)
            self.add_new_columns(row)
        else:
            raise TypeError(f"row must be Mapping or LazyRow not {type(row)}")
8 | For this little experiment, I wanted to see how good GPT is at solving the 8 puzzles from [Hanukkah of Data](https://hanukkah.bluebird.sh/5783). 9 | Now, I've already tried pasting one of them into the ChatGPT web interface, so I have an idea how this could work. 10 | (First, if you haven't yet, [install AIPL]()). 11 | 12 | # read Hannukah of Data puzzle from the web 13 | !read 14 | https://hanukkah.bluebird.sh/5783/1 15 | !extract-text 16 | !print 17 | 18 | Okay so first off, there's no boilerplate here. An AIPL script is just a list of commands (called "operators" hereafter), in order, one after the other. 19 | Each of these operates on the input, and generates an output which becomes the next input. 20 | 21 | Here's the toplevel syntax of AIPL: 22 | 23 | - Lines that start with `#` are comments, and ignored. 24 | - Lines that start with `!` are AIPL command lines, which contain one or more operators and their arguments. 25 | - All operators start with `!`. 26 | - All lines after a command line, if there are any, are the "verbatim input", which is passed verbatim to the operator instead of the previous input. 27 | 28 | So in this case, `!read` is the operator that reads a URL or file into memory, and it's passed the URL of the first puzzle. 29 | `!extract-text` takes HTML and, um, extracts the text from it. `!print` prints it to stdout. 30 | 31 | We can now run this script: 32 | 33 | aipl hod5783.aipl 34 | 35 | and it should work, no questions asked. 36 | 37 | ## inspecting the pipeline 38 | 39 | If you want to see what is happening at each step, you can use `--step rich`: 40 | 41 | aipl hod5783.aipl --step rich 42 | 43 | And then before every command, it will dump the input table using the [rich]() text library. 44 | 45 | ## going bigger 46 | 47 | Okay, that's pretty cool, but ultimately we're going to want to do this for all 8 puzzles. 
48 | 49 | !split 50 | 1 2 3 4 5 6 7 8 51 | 52 | !format 53 | https://hanukkah.bluebird.sh/5783/{_} 54 | 55 | !read 56 | !extract-text 57 | !print 58 | 59 | The `!split` operator splits its input into a list, just like in Python. 60 | 61 | `!format` formats its input using [Python string formatting](), and can refer to previous elements by their names (discussed later), or the immediate previous input with `_`. 62 | 63 | Now `!read` takes that formatted link (since it has no verbatim input anymore), and then `!extract-text` and `!print` work as before. If we run it again, we now get the text of all 8 puzzles. 64 | 65 | ## array languages 66 | 67 | So, uh, that was kind of too easy. What's going on here? Where's the loop? Is this even programming, bro? 68 | 69 | Okay, so, the thing about array languages is that they automatically iterate over their input. It's called "[loopless programming]()". 70 | Think of the input as an N-dimensional (jagged) list: a list of elements, each of which may also be a list, etc. 71 | 72 | The scalar operators we've seen so far--which take a scalar value, usually a string, and return a scalar value, also usually a string--loop over the "last" dimension, or the deepest list. 73 | After a scalar operator is applied, its output has the same structure as the input. 74 | 75 | The `!split` operator, on the other hand, is not a scalar operator. It takes a string, but it returns a list of strings--so the input grows by one dimension. 76 | 77 | In array-land the number of dimensions of an operand is called its "rank", with rank of 0 meaning "scalar". 78 | So in our above example, the `!split` takes its verbatim input (a 1-dimensional list of 1 string), and splits 79 | it into a 2-dimensional list: a list of 1 element, which is a list of 8 strings. 80 | Every subsequent operator just operates over all the scalar elements in the list of lists. 81 | 82 | ## using GPT 83 | 84 | Okay, so we've extracted the text, now what?
Well, let's see if GPT can solve the puzzle: 85 | 86 | !format 87 | I have a database with 4 tables (field names given inside parens): 88 | 89 | - customers (customerid,name,address,citystatezip,birthdate,phone) 90 | - orders (orderid,customerid,ordered,shipped,items,total) 91 | - products (sku,desc,wholesale_cost) 92 | - orders_items (orderid,sku,qty,unit_price) 93 | 94 | Here is a database puzzle to be solved using the above schema: 95 | 96 | """ 97 | {_} 98 | """ 99 | 100 | Give only a SQLite SELECT query to answer the question. 101 | 102 | !llm model=gpt-3.5-turbo 103 | 104 | !print 105 | 106 | I wrote a prompt and inserted the extracted text with heavy delimiters, as recommended by the prompt experts. (Who knows if this does anything? I sure don't.) But we're clearly asking for a SQL answer from GPT. 107 | 108 | Note here that AIPL operators can take arguments, both positional and keyword, like in Python. 109 | These arguments don't have to be quoted--only if they have spaces or quotes or a few other characters (which can be escaped like in C or Python as usual). 110 | 111 | To run this script, you'll need to make sure you have the `OPENAI_API_KEY` and `OPENAI_API_ORG` environment variables set. 112 | 113 | Okay, so if you run this script, you can see the output the GPT-3.5 gives. Well that's nice, but what if we want to save it? What we want is to do this instead of `!print`: 114 | 115 | !save hod-{puznum}.sql 116 | 117 | In addition to `!format` formatting its "verbatim" input, arguments are also automatically formatted. 118 | So where could `puznum` come from? 119 | 120 | ## context stays available 121 | 122 | Now here's where AIPL is different than other array languages. 123 | 124 | All the way at the beginning of the script, we had that `!split` which gave us the list of puzzle numbers. 
125 | If we change that to this: 126 | 127 | !split>puznum 128 | 129 | Then the list of puzzle numbers will be named `puznum`, and be carried forward as additional context to future results. 130 | So even though the puzzle number gets converted to a URL, which gets converted to HTML, then to text, etc., the earlier named contexts are still available for use in formatting. 131 | 132 | You can see this if you view the intermediate outputs with `--step rich`: 133 | 134 | ______ 135 | 136 | ## expensive operations are cached 137 | 138 | Note that the second time through, it ran a lot faster! 139 | This is because AIPL automatically caches the results of expensive operations in a sqlite db called `aipl-cache.sqlite`--you can use [VisiData](https://visidata.org) or another tool to inspect it. 140 | Since the LLM prompt hasn't changed, AIPL uses the previously cached output, to save both time and money. 141 | (Most of the time, in development, you are going to be trying things over and over, so this is a great convenience. 142 | When you want to deliberately not use the cache, you can use the `--no-cache` CLI flag.) 143 | 144 | ## inputs are actually tables 145 | 146 | You may have already noticed that the operands actually look like whole *tables*, instead of lists. 147 | This is because under the hood, they are tables. 148 | For purposes of looping like an array language, it's the rightmost or most-recently-added column 149 | which is automatically looped into and over. 150 | 151 | But the other columns are still available, and certain operators besides `!format` can take advantage of them. 152 | Like if we put `!json` before the `!save`: 153 | 154 | !json 155 | !save hod-solutions.json 156 | 157 | `!json` converts the entire table to one single JSON blob, including the immediate output, and all previously named columns. 158 | 159 | ## cross-joining 160 | 161 | Okay, so GPT-3.5 isn't so great at solving the puzzle (at least with the prompt we've given it).
162 | Maybe GPT-4 would do better? 163 | 164 | In this script, we could manually replace the model, and run it again. But what if we wanted to run a new prompt with both models and compare the results? Or on 10 different models? 165 | 166 | With the `!cross` operator (and one more language feature), we can. Let's put this at the top of the script: 167 | 168 | !split>model>>models 169 | gpt-3.5-turbo 170 | gpt-4.0 171 | 172 | And replace the `!llm` with this: 173 | 174 | !cross <>models` to remember the whole table for later. (`>model` names the column of values, so we can refer to it in arg formatting). 180 | 181 | Then we go about our usual business constructing this main table. 182 | Just before running `!llm`, we use `!cross` to do the cross-join, and we use `<op 3 | aipl.operators.values() 4 | !format 5 | - `!{op.opname}` (in={op.rankin} out={op.rankout}) 6 | {op.__doc__} 7 | !sort 8 | !join sep=\n 9 | 10 | !read>summarize_aipl 11 | examples/summarize.aipl 12 | 13 | !unbox>>summarize_aipl 14 | 15 | !aipl-ops>opdocs 16 | !unbox>>opdocs 17 | 18 | !sh 19 | aipl --help 20 | 21 | !format>opdocs 22 | {opdocs} 23 | 24 | 25 | !format 26 | 27 | # AIPL (Array-Inspired Pipeline Language) 28 | 29 | [![Tests](https://github.com/saulpw/aipl/workflows/aipl-testing/badge.svg)](https://github.com/saulpw/aipl/actions/workflows/main.yml) 30 | [![discord](https://img.shields.io/discord/880915750007750737?label=discord)](https://visidata.org/chat) 31 | 32 | A tiny DSL to make it easier to explore and experiment with AI pipelines. 
33 | 34 | ## Features 35 | 36 | - array language semantics (implicit looping over input) 37 | - tacit dataflow (output from previous command becomes input to next command) 38 | - entire pipeline defined in same file and in execution order for better understanding 39 | - including inline prompt templates 40 | - persistent cache of expensive operations into a sqlite db 41 | 42 | ### `summarize.aipl` 43 | 44 | Here's a prime example, a multi-level summarizer in the "map-reduce" style of langchain: 45 | 46 | ``` 47 | {summarize_aipl} 48 | ``` 49 | 50 | ## Usage 51 | 52 | ``` 53 | {stdout} 54 | ``` 55 | 56 | ## Command Syntax 57 | 58 | This is the basic syntax: 59 | 60 | - comments start with `#` as the first character of a line, and ignore the whole line. 61 | - commands start with `!` as the first character of a line. 62 | - everything else is given as the verbatim input to the previous `!` command. 63 | 64 | Commands can take positional and/or keyword arguments, separated by whitespace. 65 | 66 | - `!cmd arg1 key=value arg2` 67 | 68 | Keyword arguments have an `=` between the key and the value, and non-keyword arguments are those without a `=` in them. 69 | 70 | - `!cmd` will call the Python function registered to the `cmd` operator with the arguments given, as an operator on the current value. 71 | 72 | - Any text following the command line is dedented (and stripped) and added verbatim as a `prompt=` keyword argument. 73 | - Argument values may include Python formatting like `{{input}}` which will be replaced by values from the current row (falling back to parent rows, and ultimately the provided globals). 74 | - Prompt values, on the other hand, are not automatically formatted. `!format` go over every leaf row and return the formatted prompt as its output. 75 | - !literal will set its prompt as the toplevel input, without formatting. 76 | 77 | The AIPL syntax will continue to evolve and be clarified over time as it's used and developed. 
78 | 79 | Notes: 80 | 81 | - an AIPL source file documents an entire pipeline from newline-delimited inputs on stdin (or via `!literal`) to the end of the pipeline (often `!print`). 82 | - commands always run consecutively and across all inputs. 83 | - the initial input is a single string (read from stdin). 84 | 85 | ## List of operators 86 | 87 | {opdocs} 88 | 89 | 90 | ## Defining a new operator 91 | 92 | It's pretty easy to define a new operator that can be used right away. 93 | For instance, here's how the `!join` operator might be defined: 94 | 95 | ``` 96 | @defop('join', rankin=1, rankout=0) 97 | def op_join(aipl:AIPL, v:List[str], sep=' ') -> str: 98 | 'Concatenate text values with *sep* into a single string.' 99 | return sep.join(v) 100 | ``` 101 | 102 | - `@defop(...)` registers the decorated function as the named operator. 103 | - `rankin`/`rankout` indicate what the function takes as input, and what it returns: 104 | - `0`: a scalar (number or string) 105 | - `0.5`: a whole row (a mapping of key/value pairs) 106 | - `1`: a vector of scalar values (e.g. `List[str]` as above) 107 | - `1.5`: a whole Table (array of rows) 108 | - `None`: nothing (the operator is an input "source" if rankin is None; it is a pass-through if rankout is None) 109 | - `arity` is how many operands it takes (only `0` and `1` supported currently) 110 | 111 | The join operator is `rankin=1 rankout=0` which means that it takes a list of strings and outputs a single string. 112 | 113 | - Add the `@expensive` decorator to operators that actually go to the network or use an LLM; this will persistently cache the results in a local sqlite database. 114 | - running the same inputs through a pipeline multiple times won't keep refetching the same data impolitely, and won't run up a large bill during development. 115 | 116 | ## Design 117 | 118 | AIPL is intended as a simple platform for quick proof-of-concept data pipelines to be implemented and tested.
119 | 120 | ## Why? 121 | 122 | The recent developments in LLMs and AI are a whole new level of capabilities (and costs). 123 | I wanted to see what all the fuss was about, so I tried to do some basic things with [langchain](https://github.com/hwchase17/langchain) but it was this big complicated system. 124 | So instead I implemented some small workflows myself with raw Python, and it turned out that AI is actually pretty straightforward. 125 | This is just a small hackable platform that makes it easy to experiment and get small-scale results. 126 | For now it's called AIPL. 127 | 128 | ### Emphasize the Dataflow 129 | 130 | An AIPL script represents the essence of a data pipeline, with only the high-level operations and their parameters and prompts. 131 | No boilerplate or quoting or complicated syntax. 132 | Not even much of a language--just commands executed in order. 133 | This keeps the focus on data flow and the high-level operations--the actual links in the chain. 134 | It can be expanded or optimized or parallelized as needed. 135 | 136 | ### Very Little Overhead 137 | 138 | AIPL is array-oriented and concatenative, drawing inspiration from APL and Forth, both of which have powerful operators and very simple syntax. 139 | Passing data implicitly between operators allows for efficient representation of data flows, and avoids [one of the hardest problems in computer science](https://www.namingthings.co/). 140 | And the implicit looping of array languages makes it easier to scale interactivity. 141 | 142 | ### Take Advantage of Python Ecosystem 143 | 144 | AIPL is also intended to be practical (if only at small scale), so operators are easy to write using the existing cadre of Python libraries, and allow options and parameters passed to them verbatim. 145 | 146 | ### Keep It Simple 147 | 148 | The implementation is intentionally homespun, to remove layers of abstraction and reduce the friction of setup and operation. 
149 | It doesn't parallelize anything yet but it still should be able to handle hundreds of items even as it is, enough to prove a concept. 150 | I expect it to be straightforward to scale it to mag 5 (up to a million items) if something takes off. 151 | 152 | ### Learn and Explore 153 | 154 | At the very least, AIPL should be a useful tool to learn, explore, and prototype small-scale data pipelines that have expensive operations like API calls and LLM generation. 155 | 156 | 157 | # Architecture 158 | 159 | The fundamental data structure is a Table: an array of hashmaps ("rows"), with named Columns that key into each Row to get its value. 160 | 161 | A value can be a string or a number or another Table. 162 | 163 | The value of a row is the value in the rightmost column of its table. 164 | The rightmost column of a table is a vector of values representing the whole table. 165 | 166 | A simple vector has only strings or numbers. 167 | A simple table has a simple rightmost value vector and is Rank 0. 168 | Each nesting of tables in the rightmost value vector increases its Rank by 1. 169 | 170 | ## operators 171 | Each operator consumes 0 or 1 or 2 operands (its `arity`), and produces one result, which becomes the operand for the next operator. 172 | 173 | Each operator has an "in rank" and an "out rank", which is the rank of the operands they input and output. 174 | 175 | By default, each operator is applied across the deepest nested table. 176 | The result of each operator is then placed in the deepest nested table (or its parent). 
177 | 178 | ### rankin=0: one scalar at a time 179 | 180 | With `rankin=0` and `rankout` of: 181 | 182 | - -1: no change (like 'print') 183 | - 0: scalar operation (like 'translate') 184 | - 0.5: scalar to simple row (like 'url-split') 185 | - 1: scalar to simple vector (like 'split-text') 186 | - 1.5: scalar to table (like 'extract-links') 187 | 188 | ### rankin=0.5: consume whole row 189 | 190 | With `rankin=0.5`, and `rankout` of: 191 | 192 | - -1: no change to row (like 'dbinsert') 193 | - 0: add a new value to row (like 'pyexpr') 194 | - 0.5: replace or remove row (like 'filter') 195 | - 1: transform whole vector (like 'sort' or 'normalize') 196 | - 1.5: row to table 197 | 198 | ### rankin=1: consume the rightmost column 199 | 200 | With `rankin=1`, and `rankout` of: 201 | 202 | - -1: no change to row (like 'dbinsert') 203 | - 0: reduce to scalar (like 'join') 204 | - 0.5: reduce to simple row (like 'stats') 205 | - 1: transform whole vector (like 'normalize'); or return None to remove column 206 | - 1.5: vector to table 207 | 208 | ### rankin=1.5: consume whole table 209 | 210 | With `rankin=1.5`, and `rankout` of: 211 | 212 | - -1: no change to table 213 | - 0: reduce table to scalar 214 | - 0.5: reduce table to single row (like 'collapse') 215 | - 1: reduce table to single vector ?? 216 | - 1.5: replace table with returned table (like 'sort') 217 | 218 | ## arguments and formatting 219 | 220 | In addition to operands, operators also take parameters, both positional and named (`args` and `kwargs` in Python). 221 | These cannot have spaces, but they can have Python format strings like `{{input}}`. 222 | 223 | The identifiers available to Python format strings come from a chain of contexts: 224 | 225 | - column names in the current table are replaced with the value in the current row (for rankin=0 or 0.5).
226 | - from each nested table, in priority from innermost to outermost 227 | - rows will also defer to their "parent" row if they don't have the column 228 | 229 | # Future 230 | 231 | ## new operators 232 | 233 | - `!dbtable`: use entire table as input 234 | - `!dbquery`: sql template -> table 235 | 236 | ## single-step debugging 237 | 238 | - show results of each step in e.g. VisiData 239 | - output as Pandas dataframe 240 | 241 | ## simple website scraping 242 | 243 | - recursively apply `!extract-links` and `!fetch-url` to scrape an entire website 244 | - need operators to remove already-scraped urls and urls outside a particular domain/urlbase 245 | 246 | ## License 247 | 248 | I don't know yet. 249 | 250 | You can use this and play with it, and if you want to do anything more serious with it, please get in touch. 251 | The [rest](https://bluebird.sh) [of my](https://xd.saul.pw) [work](https://visidata.org) is fiercely open source, but I also appreciate a good capitalist scheme. 252 | Come chat with me on Discord [saul.pw/chat](https://saul.pw/chat) or Mastodon [@saulpw@fosstodon.org](https://fosstodon.org/@saulpw) and let's jam. 253 | 254 | If you want to get updates about what I'm playing with, you can [sign up for my AI mailing list](https://landing.mailerlite.com/webforms/landing/y9b3w8). 255 | 256 | !print 257 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AIPL (Array-Inspired Pipeline Language) 2 | 3 | [![Tests](https://github.com/saulpw/aipl/workflows/aipl-testing/badge.svg)](https://github.com/saulpw/aipl/actions/workflows/main.yml) 4 | [![discord](https://img.shields.io/discord/880915750007750737?label=discord)](https://visidata.org/chat) 5 | 6 | A tiny DSL to make it easier to explore and experiment with AI pipelines.
7 | 8 | ## Features 9 | 10 | - array language semantics (implicit looping over input) 11 | - tacit dataflow (output from previous command becomes input to next command) 12 | - entire pipeline defined in same file and in execution order for better understanding 13 | - including inline prompt templates 14 | - persistent cache of expensive operations into a sqlite db 15 | 16 | ### `summarize.aipl` 17 | 18 | Here's a prime example, a multi-level summarizer in the "map-reduce" style of langchain: 19 | 20 | ``` 21 | #!/usr/bin/env bin/aipl 22 | 23 | # fetch url, split webpage into chunks, summarize each chunk, then summarize the summaries. 24 | 25 | # the inputs are urls 26 | !read 27 | 28 | # extract text from html 29 | !extract-text 30 | 31 | # split into chunks of lines that can fit in the context window 32 | !split maxsize=8000 sep=\n 33 | 34 | # have GPT summary each chunk 35 | !format 36 | 37 | Please read the following section of a webpage (500-1000 words) and provide a 38 | concise and precise summary in a few sentences, optimized for keywords and main 39 | content topics. Write only the summary, and do not include phrases like "the 40 | article" or "this webpage" or "this section" or "the author". Ensure the tone 41 | is precise and concise, and provide an overview of the entire section: 42 | 43 | """ 44 | {_} 45 | """ 46 | 47 | !llm model=gpt-3.5-turbo 48 | 49 | # join the section summaries together 50 | !join sep=\n- 51 | 52 | # have GPT summarize the combined summaries 53 | 54 | !format 55 | 56 | Based on the summaries of each section provided, create a one-paragraph summary 57 | of approximately 100 words. Begin with a topic sentence that introduces the 58 | overall content topic, followed by several sentences describing the most 59 | relevant subsections. Provide an overview of all section summaries and include 60 | a conclusion or recommendations only if they are present in the original 61 | webpage. 
Maintain a precise and concise tone, and make the overview coherent 62 | and readable, while preserving important keywords and main content topics. 63 | Remove all unnecessary text like "The document" and "the author". 64 | 65 | """ 66 | {_} 67 | """ 68 | 69 | !llm model=gpt-3.5-turbo 70 | 71 | !print 72 | 73 | ``` 74 | 75 | ## Usage 76 | 77 | ``` 78 | usage: aipl [-h] [--debug] [--test] [--interactive] [--step STEP] [--step-breakpoint] [--step-rich] [--step-vd] [--dry-run] [--cache-db CACHEDBFN] [--no-cache] 79 | [--output-db OUTDBFN] [--split SEPARATOR] 80 | [script_or_global ...] 81 | 82 | AIPL interpreter 83 | 84 | positional arguments: 85 | script_or_global scripts to run, or k=v global parameters 86 | 87 | options: 88 | -h, --help show this help message and exit 89 | --debug, -d abort on exception 90 | --test, -t enable test mode 91 | --interactive, -i interactive REPL 92 | --step STEP call aipl.step_(cmd, input) before each step 93 | --step-breakpoint, -x 94 | breakpoint() before each step 95 | --step-rich, -v output rich table before each step 96 | --step-vd, --vd open VisiData with input before each step 97 | --dry-run, -n do not execute @expensive operations 98 | --cache-db CACHEDBFN, -c CACHEDBFN 99 | sqlite database for caching operators 100 | --no-cache sqlite database for caching operators 101 | --output-db OUTDBFN, -o OUTDBFN 102 | sqlite database accessible to !db operators 103 | --split SEPARATOR, --separator SEPARATOR, -s SEPARATOR 104 | separator to split input on 105 | 106 | ``` 107 | 108 | ## Command Syntax 109 | 110 | This is the basic syntax: 111 | 112 | - comments start with `#` as the first character of a line, and ignore the whole line. 113 | - commands start with `!` as the first character of a line. 114 | - everything else is part of the prompt template for the previous `!` command. 115 | 116 | Commands can take positional and/or keyword arguments, separated by whitespace. 
117 | 118 | - `!cmd arg1 key=value arg2` 119 | 120 | Keyword arguments have an `=` between the key and the value, and non-keyword arguments are those without a `=` in them. 121 | 122 | - `!cmd` will call the Python function registered to the `cmd` operator with the arguments given, as an operator on the current value. 123 | 124 | - Any text following the command line is dedented (and stripped) and added verbatim as a `prompt=` keyword argument. 125 | - Argument values may include Python formatting like `{input}` which will be replaced by values from the current row (falling back to parent rows, and ultimately the provided globals). 126 | - Prompt values, on the other hand, are not automatically formatted. `!format` goes over every leaf row and returns the formatted prompt as its output. 127 | - `!literal` will set its prompt as the toplevel input, without formatting. 128 | 129 | The AIPL syntax will continue to evolve and be clarified over time as it's used and developed. 130 | 131 | Notes: 132 | 133 | - an AIPL source file documents an entire pipeline from newline-delimited inputs on stdin (or via `!literal`) to the end of the pipeline (often `!print`). 134 | - commands always run consecutively and across all inputs. 135 | - the initial input is a single string (read from stdin). 136 | 137 | ## List of operators 138 | 139 | - `!abort` (in=None out=None) 140 | Abort the current chain. 141 | - `!cluster` (in=1 out=1) 142 | Cluster rows by embedding into n clusters; add label column. 143 | - `!columns` (in=1.5 out=1.5) 144 | Create new table containing only these columns. 145 | - `!comment` (in=None out=None) 146 | Do nothing (ignoring args and prompt). 147 | - `!cross` (in=0.5 out=1.5) 148 | Construct cross-product of current input with given global table 149 | - `!global` (in=100 out=1.5) 150 | Save toplevel input into globals. 151 | - `!unbox` (in=1.5 out=1.5) 152 | None 153 | - `!csv-parse` (in=None out=1.5) 154 | Converts a .csv into a table of rows. 
155 | - `!dbopen` (in=None out=0) 156 | Open connection to database. 157 | - `!dbquery` (in=0.5 out=1.5) 158 | Query database table. 159 | - `!dbdrop` (in=None out=None) 160 | Drop database table. 161 | - `!dbinsert` (in=0.5 out=None) 162 | Insert each row into database table. 163 | - `!option` (in=None out=None) 164 | Set option=value. 165 | - `!debug` (in=None out=None) 166 | set debug flag and call breakpoint() before each command 167 | - `!def` (in=0 out=None) 168 | Define composite operator from cmds in prompt (must be indented). 169 | - `!extract-text-all` (in=0 out=0) 170 | Extract all text from HTML 171 | - `!extract-text` (in=0 out=0) 172 | Extract meaningful text from HTML 173 | - `!extract-links` (in=0 out=1.5) 174 | Extract (linktext, title, href) from tags in HTML 175 | - `!filter` (in=1.5 out=1.5) 176 | Return copy of table, keeping only rows whose value is Truthy. 177 | - `!format` (in=0.5 out=0) 178 | Format prompt text (right operand) as a Python string template, substituting values from row (left operand) and global context. 179 | - `!groupby` (in=1.5 out=1.5) 180 | Group rows into tables, by set of columns given as args. 181 | - `!require-input` (in=100 out=100) 182 | Ensure there is any input at all; if not, display the prompt and read input from the user. 183 | - `!join` (in=1 out=0) 184 | Join inputs with sep into a single output scalar. 185 | - `!json` (in=100 out=0) 186 | Convert Table into a json blob. 187 | - `!json-parse` (in=0 out=1.5) 188 | Convert a json blob into a Table. 189 | - `!literal` (in=None out=0) 190 | Set prompt as top-level input, without formatting. 191 | - `!llm` (in=0 out=0) 192 | Send chat messages to `model` (default: gpt-3.5-turbo). Lines beginning with @@@s or @@@a are sent as system or assistant messages respectively (default user). Passes all named args directly to API. 
193 | - `!llm-embedding` (in=0 out=0.5) 194 | Get a [text embedding](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) for a string from `model`: a measure of text-relatedness, to be used with e.g. !cluster. 195 | - `!match` (in=0 out=0) 196 | Return a bool with whether value matched regex. Used with !filter. 197 | - `!metrics-accuracy` (in=1.5 out=0) 198 | None 199 | - `!metrics-precision` (in=1.5 out=0) 200 | None 201 | - `!metrics-recall` (in=1.5 out=0) 202 | None 203 | - `!name` (in=1.5 out=1.5) 204 | Rename current input column to given name. 205 | - `!nop` (in=None out=None) 206 | No operation. 207 | - `!pdf-extract` (in=0 out=0) 208 | Extract contents of pdf to value. 209 | - `!print` (in=0 out=None) 210 | Print to stdout. 211 | - `!python` (in=None out=None) 212 | exec() Python toplevel statements. 213 | - `!python-expr` (in=0.5 out=0) 214 | Add columns for Python expressions. 215 | - `!python-input` (in=0 out=1.5) 216 | eval() Python expression and use as toplevel input table. 217 | - `!ravel` (in=100 out=1.5) 218 | All of the leaf scalars in the value column become a single 1-D array. 219 | - `!read` (in=0 out=0) 220 | Return contents of local filename. 221 | - `!read-bytes` (in=0 out=0) 222 | Return contents of URL or local filename as bytes. 223 | - `!ref` (in=1.5 out=1.5) 224 | Move column on table to end of columns list (becoming the new .value) 225 | - `!regex-capture` (in=0 out=0.5) 226 | Capture from prompt regex into named matching groups. 227 | - `!regex-translate` (in=0 out=0) 228 | Translate input according to regex translation rules in prompt, one per line, with regex and output separated by whitespace: 229 | Dr\.? Doctor 230 | Jr\.? Junior 231 | 232 | - `!replace` (in=0 out=0) 233 | Replace `find` in all leaf values with `repl`. 234 | - `!sample` (in=1.5 out=1.5) 235 | Sample n random rows from the input table. 236 | - `!save` (in=0 out=None) 237 | Save to given filename. 
238 | - `!sh` (in=0 out=1.5) 239 | Run the command described by args. Return (retcode, stderr, stdout) columns. 240 | - `!shtty` (in=None out=0.5) 241 | Run the command described by args. Return (retcode, stderr, stdout) columns. 242 | - `!sort` (in=1.5 out=1.5) 243 | Sort the table by the given columns. 244 | - `!grade-up` (in=1.5 out=1) 245 | Assign ranks to unique elements in an array, incrementally increasing each by its corresponding rank value. 246 | - `!split` (in=0 out=1) 247 | Split text into chunks based on sep, keeping each chunk below maxsize. 248 | - `!split-into` (in=0 out=0.5) 249 | Split text by sep into the given column names. 250 | - `!take` (in=1.5 out=1.5) 251 | Return a table with first n rows of `t` 252 | - `!test-input` (in=100 out=1.5) 253 | In test mode, replace input with prompt. 254 | - `!test-equal` (in=0 out=None) 255 | In test mode, error if value is not equal to prompt. 256 | - `!test-json` (in=100 out=None) 257 | Error if value Column is not equal to json blob in prompt. 258 | - `!url-split` (in=0 out=0.5) 259 | Split url into components (scheme, netloc, path, params, query, fragment). 260 | - `!url-defrag` (in=0 out=0) 261 | Remove fragment from url. 262 | - `!xml-xpath` (in=0 out=1) 263 | Return a vector of XMLElements from parsing entries in value. 264 | - `!xml-xpaths` (in=0 out=0.5) 265 | Return a vector of XMLElements from parsing entries in value; kwargs become column_name=xpath. 266 | - `!aipl-ops` (in=0 out=0) 267 | None 268 | 269 | 270 | ## Defining a new operator 271 | 272 | It's pretty easy to define a new operator that can be used right away. 273 | For instance, here's how the `!join` operator might be defined: 274 | 275 | ``` 276 | @defop('join', rankin=1, rankout=0) 277 | def op_join(aipl:AIPL, v:List[str], sep=' ') -> str: 278 | 'Concatenate text values with *sep* into a single string.' 279 | return sep.join(v) 280 | ``` 281 | 282 | - `@defop(...)` registers the decorated function as the named operator. 
283 | - `rankin`/`rankout` indicate what the function takes as input, and what it returns: 284 | - `0`: a scalar (number or string) 285 | - `0.5`: a whole row (a mapping of key/value pairs) 286 | - `1`: a vector of scalar values (e.g. `List[str]` as above) 287 | - `1.5`: a whole Table (an array of rows) 288 | - `None`: nothing (the operator is an input "source" if rankin is None; it is a pass-through if rankout is None) 289 | - `arity` is how many operands it takes (only `0` and `1` supported currently) 290 | 291 | The join operator is `rankin=1 rankout=0` which means that it takes a list of strings and outputs a single string. 292 | 293 | - Add the `@expensive` decorator to operators that actually go to the network or use an LLM; this will persistently cache the results in a local sqlite database. 294 | - running the same inputs through a pipeline multiple times won't keep refetching the same data impolitely, and won't run up a large bill during development. 295 | 296 | # Architecture 297 | 298 | The fundamental data structure is a Table: an array of hashmaps ("rows"), with named Columns that key into each Row to get its value. 299 | 300 | A value can be a string or a number or another Table. 301 | 302 | The value of a row is the value in the rightmost column of its table. 303 | The rightmost column of a table is a vector of values representing the whole table. 304 | 305 | A simple vector has only strings or numbers. 306 | A simple table has a simple rightmost value vector and is Rank 0. 307 | Each nesting of tables in the rightmost value vector increases its Rank by 1. 308 | 309 | ## operators 310 | Each operator consumes 0 or 1 or 2 operands (its `arity`), and produces one result, which becomes the operand for the next operator. 311 | 312 | Each operator has an "in rank" and an "out rank": the rank of the operands it takes as input, and the rank of the result it outputs. 313 | 314 | By default, each operator is applied across the deepest nested table. 
315 | The result of each operator is then placed in the deepest nested table (or its parent). 316 | 317 | ### rankin=0: one scalar at a time 318 | 319 | With `rankin=0` and `rankout` of: 320 | 321 | - -1: no change (like 'print') 322 | - 0: scalar operation (like 'translate') 323 | - 0.5: scalar to simple row (like 'url-split') 324 | - 1: scalar to simple vector (like 'split-text') 325 | - 1.5: scalar to table (like 'extract-links') 326 | 327 | ### rankin=0.5: consume whole row 328 | 329 | With `rankin=0.5`, and `rankout` of: 330 | 331 | - -1: no change to row (like 'dbinsert') 332 | - 0: add a new value to row (like 'pyexpr') 333 | - 0.5: replace or remove row (like 'filter') 334 | - 1: transform whole vector (like 'sort' or 'normalize') 335 | - 1.5: row to table 336 | 337 | ### rankin=1: consume the rightmost column 338 | 339 | With `rankin=1`, and `rankout` of: 340 | 341 | - -1: no change to row (like 'dbinsert') 342 | - 0: reduce to scalar (like 'join') 343 | - 0.5: reduce to simple row (like 'stats') 344 | - 1: transform whole vector (like 'normalize'); or return None to remove column 345 | - 1.5: vector to table 346 | 347 | ### rankin=1.5: consume whole table 348 | 349 | With `rankin=1.5`, and `rankout` of: 350 | 351 | - -1: no change to table 352 | - 0: reduce table to scalar 353 | - 0.5: reduce table to single row (like 'collapse') 354 | - 1: reduce table to single vector ?? 355 | - 1.5: replace table with returned table (like 'sort') 356 | 357 | ## arguments and formatting 358 | 359 | In addition to operands, operators also take parameters, both positional and named (`args` and `kwargs` in Python). 360 | These cannot have spaces, but they can have Python format strings like `{input}`. 361 | 362 | The identifiers available to Python format strings come from a chain of contexts: 363 | 364 | - column names in the current table are replaced with the value in the current row (for rankin=0 or 0.5). 
365 | - from each nested table, in priority from innermost to outermost 366 | - rows will also defer to their "parent" row if they don't have the column 367 | 368 | ## More information 369 | 370 | Come chat with us on Discord [bluebird.sh/chat](https://bluebird.sh/chat) or Mastodon [@saulpw@fosstodon.org](https://fosstodon.org/@saulpw). 371 | 372 | If you want to get updates about what I'm playing with, you can [sign up for my AI mailing list](https://landing.mailerlite.com/webforms/landing/y9b3w8). 373 | 374 | ## License 375 | 376 | Licensed under MIT. 377 | 378 | -------------------------------------------------------------------------------- /aipl/interpreter.py: -------------------------------------------------------------------------------- 1 | from typing import List, Mapping, Callable 2 | from copy import copy 3 | from dataclasses import dataclass 4 | from functools import wraps 5 | from itertools import cycle 6 | import time 7 | import inspect 8 | 9 | from aipl import Error, AIPLException, InnerPythonException 10 | from .table import Table, LazyRow, Column 11 | from .db import Database 12 | from .utils import stderr, fmtargs, fmtkwargs, AttrDict 13 | from .parser import clean_to_id, Command 14 | from . import parser 15 | 16 | 17 | Scalar = int|float|str 18 | 19 | 20 | class UserAbort(BaseException): 21 | 'UserAbort not caught by internal error handling; will always exit.' 
def rank(v):
    'Return the rank of `v`: the rank of its value for a LazyRow, `.rank` for a Table, 0 for anything else.'
    if isinstance(v, LazyRow):
        return rank(v.value)
    if isinstance(v, Table):
        return v.rank
    return 0


class AIPL:
    'Interpreter: holds the operator/alias registries (class-wide) and per-run tables, globals, options and databases.'

    operators = {}  # opname:str -> func(aipl, ..., *args, *kwargs)
    aliases = {}  # aliasname:str -> builtinopname:str
    next_unique_key:int = 0
    cost_usd:float = 0.0

    def __init__(self, **kwargs):
        'Create an interpreter; `kwargs` become `self.options` (outdbfn, cachedbfn, step, debug, test are read in this file).'
        self.tables = {}  # named tables
        self.globals = dict(  # base context, imports go into here for later use in the whole script
            aipl=self,
            defop=defop,
            stderr=stderr,
            Table=Table,
        )
        self.options = AttrDict(kwargs)
        self.forced_input = None  # via !test-input
        self.output_db = Database(self.options.outdbfn)
        self.cache_db = None
        if self.options.cachedbfn:
            self.cache_db = Database(self.options.cachedbfn)

    @property
    def unique_key(self) -> str:
        'Return a fresh column key of the form `_N`; every access advances the counter.'
        r = self.next_unique_key
        self.next_unique_key += 1
        return f'_{r}'

    def step_breakpoint(self, cmd:Command, *inputs:List[Table]):
        'Step function for `--step breakpoint`: drop into the debugger before the command.'
        breakpoint()

    def get_op(self, opname:str):
        'Return the operator registered as `opname` (following aliases), or None if there is none.'
        while opname in self.aliases:
            opname = self.aliases[opname].opname

        return self.operators.get(opname, None)

    def parse(self, source:str) -> List[Command]:
        'Generate list of Commands from source text. Immediate commands are run right away and are not returned.'

        ast = parser.parse(source)

        commands = []
        for command in ast:
            command.op = self.get_op(command.opname)

            if not command.op:
                raise AIPLException(
                    f'[line {command.linenum}] no such operator "!{command.opname}"')

            if command.immediate:
                result = self.run_cmdlist([command], [])
                if isinstance(result, Error):
                    if isinstance(result.exception, InnerPythonException):
                        result.exception.command = command
                    raise result.exception

                if command.varnames:
                    # the result of an immediate command is published as a global
                    last_variable = command.varnames[-1]
                    self.globals[last_variable] = result
                    stderr(f'(global) {last_variable} = result of {command.line}')
            else:
                commands.append(command)
        return commands

    def new_input(self, *inputlines):
        'Return a one-column Table with one row per input line, under a fresh unique key.'
        argkey = self.unique_key
        return Table([{argkey:line} for line in inputlines])

    def run_test(self, script:str, *inputlines):
        'Run `script` over `inputlines` and return the last resulting table.'
        inputs = [self.new_input(*inputlines)]
        return self.run(script, inputs)[-1]

    def run(self, script:str, inputs:list[Table]=None):
        'Parse and run `script` against `inputs`; return the final list of tables.'
        # lines before first cmdline are Python, to be executed immediately.
        # also add nop at end to do final single-steps.
        cmds = self.parse('!!python\n' + script + '\n!nop')

        return self.run_cmdlist(cmds, inputs)

    def pre_command(self, cmd:Command, t:Table|None=None, *args):
        'Hook called before each command; writes the first operand (an empty Table if none) and the command to stderr.'
        if t is None:
            t = Table()  # fresh per call, rather than one instance shared through the default argument
        stderr(t, str(cmd))

    def run_cmdlist(self, cmds:List[Command], inputs:List[Table]):
        '''Run `cmds` in order, feeding each command the latest table in `inputs`.

        Returns the resulting list of tables.  `inputs` may be None (treated as
        no input).  Errors raised while evaluating a command are re-raised with
        the line number and operator name prefixed.
        '''
        if inputs is None:
            inputs = []  # run() defaults to None; the code below appends to and iterates over inputs

        last_cmd = None
        for cmd in cmds:
            last_cmd = cmd
            if self.forced_input is not None:
                inputs.append(self.forced_input)
                self.forced_input = None

            input_tables = [self.tables[arg] for arg in cmd.input_tables]

            operands = [inputs[-1]] if inputs else []
            if cmd.prompt is not None:
                input_tables.append(Table(cmd.prompt))

            if input_tables:
                # explicit input tables (and the prompt) fill the trailing operand slots
                operands[cmd.op.arity-len(input_tables):] = input_tables

            for input_col_name in cmd.input_cols:
                t = operands[-1]
                col = t.get_column(input_col_name)
                if col not in t.columns:
                    raise AIPLException(f'no such column {input_col_name!r}')
                # move the named column to the end of the column list
                t.columns.remove(col)
                t.add_column(col)

            self.pre_command(cmd, *operands)

            if self.options.step:
                for stepfuncname in self.options.step.split(','):
                    stepfunc = getattr(self, 'step_'+stepfuncname, None)
                    if stepfunc:
                        stepfunc(cmd, *operands)
                    else:
                        stderr(f'no aipl.step_{stepfuncname}!')

            try:
                annotated_result = self.eval_op(cmd, *operands, contexts=[self.globals, self.tables])
                result = annotated_result['result']
                if cmd.op.rankout is None:
                    continue  # just keep former inputs
                elif isinstance(result, Table):
                    inputs = [result]
                else:
                    k = cmd.varnames[-1] if cmd.varnames else self.unique_key
                    inputs = [Table([{k:result}])]

                for g in cmd.globals:
                    self.tables[g] = inputs[-1]

            except AIPLException as e:
                raise AIPLException(f'AIPL Error (line {cmd.linenum} !{cmd.opname}): {e}') from e
            except Exception as e:
                raise Exception(f'AIPL Error (line {cmd.linenum} !{cmd.opname}): {e}') from e

        for result in inputs:
            if isinstance(result, Error):
                # attribute the error to the last command run (this previously referenced an undefined name)
                if last_cmd is not None and isinstance(result.exception, InnerPythonException):
                    result.exception.command = last_cmd
                raise result.exception

        return inputs

    def call_cmd(self, cmd:Command, contexts:List[Mapping], *inputs, newkey=''):
        '''Call cmd.op once on `inputs`, with cmd args/kwargs formatted against `contexts`.

        Returns dict(result, cost_usd, cost_ms).  If the operator raises and
        neither the debug nor the test option is set, an Error object is
        returned instead of the dict.
        '''
        operands = [prep_input(arg, rankin)
                    for arg, rankin in zip(inputs, [cmd.op.rankin, cmd.op.rankin2])]
        args = fmtargs(cmd.args, contexts)
        kwargs = fmtkwargs(cmd.kwargs, contexts)

        try:
            if self.options.step and 'break' in self.options.step.split(','):
                breakpoint()
            start_t = time.time()
            ret = cmd.op(self, *operands, *args, **kwargs)
        except Exception as e:
            if self.options.debug or self.options.test:
                raise
            return Error(cmd.linenum, cmd.opname, e)

        end_t = time.time()

        if cmd.op.rankout is not None and cmd.varnames:
            varname = cmd.varnames[-1]
        else:
            varname = newkey or self.unique_key

        result = prep_output(self,
                             inputs[0] if inputs else None,
                             ret,
                             cmd.op.rankout,
                             cmd.op.outcols.split(),
                             varname)

        annotated_ret = dict(result=result, cost_usd=self.cost_usd, cost_ms=int((end_t-start_t)*1000))
        self.cost_usd = 0  # cost accumulated during this call has been reported; start over for the next one
        return annotated_ret

    def eval_op(self, cmd:Command, *operands:List[Table|LazyRow], contexts=None, newkey='') -> dict:
        'Recursively evaluate cmd.op(t) with cmd args formatted with contexts.  Return dict(result:Table, cost_usd:float, cost_ms:int)'
        if contexts is None:
            contexts = []

        if cmd.op.arity == 0:
            return self.call_cmd(cmd, contexts, newkey=newkey)

        if len(operands) < cmd.op.arity:
            # pad missing operands with empty tables
            operands = list(operands) + [Table() for i in range(cmd.op.arity-len(operands))]

        t = operands[0]
        if rank(t) <= cmd.op.rankin:
            return self.call_cmd(cmd, contexts, *operands, newkey=newkey)

        if isinstance(t, Table):
            ret = copy(t)
        else:
            ret = copy(t.value)

        # !op>var1>var2 names the deepest column "var2" and the column one-level up (for rankout==1) "var1"
        if cmd.op.rankout is not None and len(cmd.varnames) > cmd.op.rankout and rank(t) == int(cmd.op.rankin+1):
            newkey = cmd.varnames[0] or self.unique_key
        else:
            newkey = newkey or self.unique_key

        start_t = time.time()
        cost_usd = 0
        x = None  # stays None when `t` has no rows, so the column bookkeeping below still works
        for row in t:
            annotated_x = self.eval_op(cmd, row, *operands[1:], contexts=contexts+[row], newkey=newkey)
            x = annotated_x['result']

            if x is None:
                continue

            subresult = update_dict(row._row, x, newkey)
            cost_usd += annotated_x['cost_usd']
            subresult.setdefault('_costs', Table()).append(dict(usd=annotated_x['cost_usd'], ms=annotated_x['cost_ms']))
            ret.rows.append(subresult)

        ret.add_column(Column('_costs'))

        # columns are taken from the last row's result
        if isinstance(x, Mapping):
            for k in x.keys():
                ret.add_column(Column(k, k))
        else:
            ret.add_column(Column(newkey))

        end_t = time.time()

        return dict(result=ret, cost_usd=cost_usd, cost_ms=int((end_t-start_t)*1000))


def update_dict(d:dict, elem, key:str='') -> dict:
    'Update d with elem if elem is dict, otherwise add d[key]=elem.  Return d.'
    if isinstance(elem, dict):
        d.update(elem)
    else:
        d[key] = elem
    return d


def prep_input(operand:LazyRow|Table|Error, rankin:int|float) -> Scalar|List[Scalar]|Table|LazyRow:
    '''Convert `operand` into the form an operator with the given `rankin` receives.

    Error operands are passed through unchanged; rankin None yields None.
    rankin 0 gives a scalar, 0.5 the LazyRow itself, 1 the list of values,
    and >=1.5 the Table.  For rankin 1 and >=1.5, an operand that is neither
    a LazyRow nor a Table falls through and yields None.
    '''
    if isinstance(operand, Error):
        return operand

    if rankin is None:
        return None
    if rankin == 0:
        if isinstance(operand, Table) and operand.rank == 0:
            return operand.scalar
        elif isinstance(operand, LazyRow):
            return operand.value
        else:
            assert False, type(operand)
    elif rankin == 0.5:
        assert isinstance(operand, LazyRow)
        return operand
    elif rankin == 1:
        if isinstance(operand, LazyRow):
            assert operand.value.rank == 1
            return operand.value.values
        elif isinstance(operand, Table):
            assert operand.rank == 1
            return operand.values
    elif rankin >= 1.5:
        if isinstance(operand, LazyRow):
            #assert operand.value.rank == 1
            return operand.value
        elif isinstance(operand, Table):
            return operand
    else:
        raise Exception("Unexpected rankin")


def ziplift(a:Table, b:Table):
    'Yield item pairs from `a` and `b`, with the number of elements from the shorter extended (lifted) to match the number of elements from the longer.'

    ita = iter(a)
    itb = iter(b)
    if len(a) > len(b):
        itb = cycle(itb)
    elif len(a) < len(b):
        ita = cycle(ita)
    return zip(ita, itb)


def prep_output(aipl,
                in_row:LazyRow|Table,
                out:Scalar|List[Scalar]|LazyRow|Table,
                rankout:int|float,
                outcols:List[str],
                varname:str) -> Scalar|List[Scalar]|Table|LazyRow:
    '''Convert an operator's return value `out` into the form implied by `rankout`.

    - None: returns None.
    - 0 / 0.5: returns `out` as is (rank 0 asserts it is not a Table, LazyRow or dict).
    - 1: returns a one-column Table named `varname`, each row linked to its
      input row through '__parent' (no link when `in_row` is None).
    - >=1.5: returns `out` if it is already a Table; otherwise builds a Table
      from the iterable, with columns from `outcols` or else from the dict
      keys seen, in first-seen order.
    '''
    if rankout is None:
        return None

    if rankout == 0:
        assert not isinstance(out, (Table, LazyRow, dict))
        return out

    elif rankout == 0.5:
        return out

    elif rankout == 1:
        ret = Table()
        if in_row is None:
            # source operator (no input): nothing to link the rows to
            ret.rows = [{varname:v} for v in out]
        elif isinstance(in_row, LazyRow):
            ret.rows = [{'__parent': in_row, varname:v} for v in out]
        elif isinstance(in_row, Table):
            out = list(out)
            ret.rows = [{'__parent': parent_row, varname:v} for parent_row, v in ziplift(in_row, out)]
        else:
            assert False, 'unknown type for in_row'
        ret.add_column(Column(varname))
        return ret

    elif rankout >= 1.5:
        if isinstance(out, Table):
            return out

        if in_row is None or isinstance(in_row, Table):
            parent_table = None
            parent_row = None
        elif isinstance(in_row, LazyRow):
            parent_table = in_row._table
            parent_row = in_row
        else:
            raise Exception(f'unknown type for in_row: {type(in_row)}')

        rows = []
        all_keys = {}  # dict used as an ordered set, so column order does not depend on hash order
        for v in out:
            row = {'__parent': parent_row} if parent_row is not None else {}
            if isinstance(v, dict):
                all_keys.update(dict.fromkeys(v))
                row.update(v)
            else:
                row[varname] = v
            rows.append(row)

        ret = Table(rows, parent=parent_table)
        if outcols:
            for k in outcols:
                ret.add_column(Column(k))
        elif all_keys:  # we have to figure out the keys, for better or worse
            for k in all_keys:
                ret.add_column(Column(k))

        return ret

    else:
        raise Exception("Unexpected rankout")


# string mnemonics accepted by defop() for rankin/rankout/rankin2
ranktypes = dict(
    none = None,
    all = 100,
    scalar = 0,
    row = 0.5,
    vector = 1,
    table = 1.5,
)


def defop(operation:str|Callable|None=None,
          rankin:None|int|float|str=0,
          rankout:None|int|float|str=0,
          *,
          rankin2:None|int|float|str=None,
          outcols:str='',
          preprompt=lambda x: x,
          opname:str|None=None):
    '''
    Define a new operator.

    Can be used as a decorator:

        @defop('op_name', rankin='vector')
        def myop(...):

    Or just as a function:
        defop(function, rankout='vector')
        defop(function, opname='alternative_name')

    aipl will be passed to the function if the first argument is called
    'aipl'.

    The operator is registered in AIPL.operators under `opname`, else the
    `operation` string, else the function's name (cleaned with clean_to_id).
    The decorated function itself is returned unchanged.
    '''
    # replace string mnemonic with 'actual' rank; this must happen before
    # arity is derived so that rankin='none' behaves like rankin=None
    rankin = ranktypes.get(rankin, rankin)
    rankout = ranktypes.get(rankout, rankout)
    rankin2 = ranktypes.get(rankin2, rankin2)

    # arity implied by rankin
    if rankin is None:
        arity = 0
    elif rankin2 is None:
        arity = 1
    else:
        arity = 2

    def _decorator(f):
        if opname:
            name = opname
        elif isinstance(operation, str):
            name = operation
        else:
            name = getattr(f, '__name__', None) or str(f)
        name = clean_to_id(name)
        AIPL.operators[name] = Operator(
            rankin = rankin,
            rankout = rankout,
            rankin2 = rankin2,
            arity = arity,
            outcols = outcols,
            opname = name,  # the registered name, not the (usually None) opname argument
            preprompt = preprompt,
            func = f)
        return f

    if callable(operation):
        return _decorator(operation)
    else:
        return _decorator


@dataclass
class Operator:
    'A registered operator: the wrapped function plus its rank/arity metadata.'
    rankin: int  # rank of the first operand (None for source operators)
    rankout: int  # rank of the result (None: result is not used as the next input)
    rankin2: int|None  # rank of the second operand, if any
    arity: int  # number of operands: 0, 1 or 2
    outcols: str  # whitespace-separated output column names
    opname: str  # name the operator is registered under
    preprompt: Callable
    func: Callable  # the Python function implementing the operator

    def __call__(self, aipl, *args, **kwargs):
        'Call the wrapped function, passing `aipl` first only if its first parameter is named "aipl".'
        if self._needs_aipl:
            r = self.func(aipl, *args, **kwargs)
        else:
            r = self.func(*args, **kwargs)

        return r

    @property
    def needs_prompt(self):
        'True if the wrapped function has a parameter named "prompt".'
        try:
            return 'prompt' in inspect.signature(self.func).parameters
        except (ValueError, TypeError):
            return False

    @property
    def _needs_aipl(self):
        'True if the first parameter of the wrapped function is named "aipl".'
        try:
            params = inspect.signature(self.func).parameters
        except (ValueError, TypeError):
            return False
        # next(..., None) copes with functions that take no parameters at all
        return next(iter(params), None) == 'aipl'


def alias(alias_name:str, builtin_name:str, dialect:str=''):
    'Create an alias `alias_name` for the op `builtin_name`'
    # NOTE(review): the duplicate check uses the raw name, but the alias is stored under
    # clean_to_id(alias_name), so names altered by cleaning are not guarded -- confirm intent.
    assert alias_name not in AIPL.aliases
    if builtin_name not in AIPL.operators:
        raise AIPLException(f"{builtin_name} is not a valid operator for alias")
    AIPL.aliases[clean_to_id(alias_name)] = AttrDict(opname=builtin_name, dialect=dialect)