├── tests
    ├── __init__.py
    ├── test-emptiness.aipl
    ├── test_colon_rejoin.aipl
    ├── match-filter.aipl
    ├── test-def.aipl
    ├── input_cols.aipl
    ├── op-globals.aipl
    ├── globals.aipl
    ├── test_scripts.py
    ├── toplevel-ravel.aipl
    ├── test-named-ravel.aipl
    └── test-xml.aipl
├── aipl
    ├── ops
    │   ├── __init__.py
    │   ├── sleep.py
    │   ├── print.py
    │   ├── comment.py
    │   ├── abort.py
    │   ├── literal.py
    │   ├── replace.py
    │   ├── unbox.py
    │   ├── save.py
    │   ├── match.py
    │   ├── table.py
    │   ├── nop.py
    │   ├── pdf.py
    │   ├── dedup.py
    │   ├── name.py
    │   ├── format.py
    │   ├── csv.py
    │   ├── join.py
    │   ├── ref.py
    │   ├── input.py
    │   ├── take.py
    │   ├── sample.py
    │   ├── cluster.py
    │   ├── url.py
    │   ├── filter.py
    │   ├── db.py
    │   ├── sh.py
    │   ├── def.py
    │   ├── split.py
    │   ├── ravel.py
    │   ├── columns.py
    │   ├── cross.py
    │   ├── test.py
    │   ├── regex.py
    │   ├── read.py
    │   ├── groupby.py
    │   ├── sort.py
    │   ├── python.py
    │   ├── extract.py
    │   ├── xml.py
    │   ├── json.py
    │   ├── debug.py
    │   ├── metrics.py
    │   └── llm.py
    ├── __main__.py
    ├── test_format.aipl
    ├── test_db.py
    ├── repl.py
    ├── utils.py
    ├── caching.py
    ├── __init__.py
    ├── test_core.py
    ├── db.py
    ├── main.py
    ├── test_parse.py
    ├── parser.py
    ├── clients.py
    ├── table.py
    └── interpreter.py
├── requirements-geo.txt
├── examples
    ├── benchmarks
    │   ├── models.txt
    │   ├── tasks.txt
    │   ├── bigbench-binary-classification.aipl
    │   └── bigbench-binary-classification-local.aipl
    ├── random-link.aipl
    ├── rewrite.aipl
    ├── git-commit.aipl
    ├── rowan
    │   └── load-json-v4.aipl
    ├── saulpw
    │   └── crossword-log.aipl
    ├── cluster.aipl
    ├── summarize.aipl
    ├── nyt-cooking.aipl
    ├── hanukkah-of-data-5783.aipl
    └── wiki-to-map.aipl
├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── main.yml
├── .gitignore
├── bin
    └── aipl
├── pytest.ini
├── about
    ├── 23-design-log.md
    ├── README.md
    ├── roadmap.md
    ├── vision.md
    ├── 23-faq.md
    └── 23-design.md
├── conftest.py
├── tools
    ├── vscode
    │   ├── README.md
    │   ├── language-configuration.json
    │   ├── package.json
    │   └── syntaxes
    │   │   └── aipl.tmLanguage.json
    ├── README.md
    ├── translate-dialect.py
    └── aipl.vim
├── requirements.txt
├── pyproject.toml
├── setup.py
├── LICENSE.mit
├── docs
    ├── writing-operators.md
    ├── operators.md
    └── tutorial.md
├── README.aipl
└── README.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/aipl/ops/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements-geo.txt:
--------------------------------------------------------------------------------
1 | folium==0.14


--------------------------------------------------------------------------------
/examples/benchmarks/models.txt:
--------------------------------------------------------------------------------
1 | gpt-3.5-turbo


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: saulpw
2 | patreon: saulpw
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.log
3 | aipl-cache.sqlite
4 | wip/
5 | tags
6 | 


--------------------------------------------------------------------------------
/bin/aipl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Command-line launcher for AIPL.
3 | from aipl import main
4 | # Hand control to the `main` callable exported by the aipl package.
5 | main()
6 | 


--------------------------------------------------------------------------------
/aipl/__main__.py:
--------------------------------------------------------------------------------
1 | from . import main
2 | # Entry point for `python -m aipl`: only runs when executed as the main module.
3 | if __name__ == '__main__':
4 |     main()
5 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | python_files = test_*.py ops/*.py
3 | python_functions = test_
4 | 
5 | 


--------------------------------------------------------------------------------
/tests/test-emptiness.aipl:
--------------------------------------------------------------------------------
1 | !literal
2 | # test a whole lotta nothing
3 | !extract-links
4 | !name url
5 | 


--------------------------------------------------------------------------------
/about/23-design-log.md:
--------------------------------------------------------------------------------
1 | 
2 | ## The prompt is used as verbatim input (except for whitespace dedent)
3 | 
4 | ## 
5 | 


--------------------------------------------------------------------------------
/tests/test_colon_rejoin.aipl:
--------------------------------------------------------------------------------
1 | !test-input
2 | a
3 | b
4 | c
5 | !split
6 | !join sep=:
7 | !test-equal
8 | a:b:c
9 | 


--------------------------------------------------------------------------------
/examples/benchmarks/tasks.txt:
--------------------------------------------------------------------------------
1 | causal_judgment
2 | strategyqa
3 | moral_permissibility
4 | anachronisms
5 | mathematical_induction


--------------------------------------------------------------------------------
/tests/match-filter.aipl:
--------------------------------------------------------------------------------
1 | !test-input
2 | ab zh cd zq azzz z
3 | !split>keep
4 | !match ^z
5 | !filter
6 | !join
7 | !test-equal
8 | zh zq z
9 | 


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | 
3 | from aipl import AIPL
4 | 
5 | @pytest.fixture()
6 | def aipl():
7 |     r = AIPL(debug=True, test=True)
8 |     return r
9 | 


--------------------------------------------------------------------------------
/examples/random-link.aipl:
--------------------------------------------------------------------------------
1 | # pick a random link from the given webpage(s)
2 | 
3 | !name baseurl
4 | !read
5 | !extract-links {baseurl}
6 | !sample 1
7 | !print
8 | 


--------------------------------------------------------------------------------
/aipl/ops/sleep.py:
--------------------------------------------------------------------------------
1 | import time
2 | 
3 | from aipl import defop
4 | 
5 | @defop('sleep', 0, 0)
6 | def _(aipl, n:float) -> float:
7 |     time.sleep(n)  # block the calling thread for n seconds
8 |     return n  # the duration is passed through as the result
9 | 


--------------------------------------------------------------------------------
/aipl/ops/print.py:
--------------------------------------------------------------------------------
1 | from aipl import defop
2 | 
3 | @defop('print', 0, None)
4 | def op_print(aipl, v:str):
5 |     'Print the value, followed by a newline, to the interpreter stdout (`aipl.stdout`).'
6 |     print(v, file=aipl.stdout)
7 | 
8 | 


--------------------------------------------------------------------------------
/aipl/ops/comment.py:
--------------------------------------------------------------------------------
1 | from aipl import defop
2 | 
3 | 
4 | @defop('comment', None, None)
5 | def op_comment(aipl, *args, **kwargs):
6 |     'Accept any arguments and do nothing (args and prompt are ignored).'
7 |     pass
8 | 


--------------------------------------------------------------------------------
/aipl/ops/abort.py:
--------------------------------------------------------------------------------
1 | from aipl import defop, UserAbort
2 | 
3 | 
4 | @defop('abort', None, None)
5 | def op_abort(aipl, *args):
6 |     "Abort the current chain by raising UserAbort with the given args."
7 |     raise UserAbort(*args)
8 | 


--------------------------------------------------------------------------------
/aipl/ops/literal.py:
--------------------------------------------------------------------------------
1 | from aipl import defop
2 | 
3 | @defop('literal', 0, 0)
4 | def op_literal(aipl, prompt=''):
5 |     'Return the prompt exactly as given (no formatting) so it becomes the top-level input.'
6 |     return prompt
7 | 


--------------------------------------------------------------------------------
/tests/test-def.aipl:
--------------------------------------------------------------------------------
 1 | 
 2 | !!def first-3-words
 3 |  !split
 4 |  !take 3
 5 |  !join
 6 | 
 7 | !test-input
 8 | now is the time for all good men
 9 | 
10 | !first-3-words
11 | 
12 | !test-equal
13 | now is the
14 | 


--------------------------------------------------------------------------------
/tests/input_cols.aipl:
--------------------------------------------------------------------------------
1 | !split sep=: >abc << a:b:c
2 | !format>dundered << _{_}_
3 | 
4 | !join sep='' <abc
5 | !test-equal << abc
6 | # Should work (but doesn't):
7 | # !join sep='' <dundered
8 | # !test-equal << _a__b__c_
9 | 


--------------------------------------------------------------------------------
/aipl/ops/replace.py:
--------------------------------------------------------------------------------
1 | from aipl import defop
2 | 
3 | @defop('replace', 0, 0)
4 | def op_replace(aipl, s:str, find:str, repl:str) -> str:
5 |     'Replace every occurrence of `find` with `repl` in each leaf value `s`.'
6 |     return s.replace(find, repl)
7 | 
8 | 


--------------------------------------------------------------------------------
/tests/op-globals.aipl:
--------------------------------------------------------------------------------
 1 | import string
 2 | 
 3 | @defop('capwords', 0)
 4 | def _(aipl, v):
 5 |     return string.capwords(v)
 6 | 
 7 | !split
 8 | abc def ghi
 9 | 
10 | !capwords
11 | !join sep=,
12 | 
13 | !test-equal
14 | Abc,Def,Ghi
15 | 


--------------------------------------------------------------------------------
/aipl/ops/unbox.py:
--------------------------------------------------------------------------------
1 | from aipl import defop, Table
2 | 
3 | 
4 | @defop('unbox', 1.5, 1.5)
5 | def op_unbox(aipl, t:Table) -> Table:
6 |     'Return the value of the only row of a single-row table (remove outermost layer).'
7 |     assert len(t) == 1  # anything other than exactly one row is rejected
8 |     return Table(t[0].value)  # wrap that row's value in a new Table
9 | 


--------------------------------------------------------------------------------
/aipl/ops/save.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop
 2 | 
 3 | @defop('save', 0, None)
 4 | def op_save(aipl, v:str, filename=''):
 5 |     'Write the value to `filename`, replacing any existing contents.'
 6 |     assert '{' not in filename, filename  # NOTE(review): presumably catches an unsubstituted {placeholder} in the name
 7 |     with open(filename, 'w') as fp:
 8 |         fp.write(v)
 9 | 
10 | 


--------------------------------------------------------------------------------
/aipl/ops/match.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop
 2 | 
 3 | @defop('match', 0, 0)
 4 | def op_match(aipl, v:str, regex:str) -> bool:
 5 |     'Return a bool with whether value matched regex. Used with !filter.'
 6 |     import re
 7 |     m = re.search(regex, v)
 8 |     return m is not None
 9 | 
10 | 


--------------------------------------------------------------------------------
/aipl/ops/table.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop, Table, alias
 2 | 
 3 | 
 4 | @defop('table', 100, 1.5)
 5 | def op_table(aipl, t:Table, tname:str) -> Table:
 6 |     'Store the toplevel input table in `aipl.tables` under the name `tname`; return it unchanged.'
 7 |     aipl.tables[tname] = t
 8 |     return t
 9 | 
10 | # NOTE(review): presumably registers 'global' as another name for the 'table' operator
11 | alias('global', 'table')
12 | 


--------------------------------------------------------------------------------
/about/README.md:
--------------------------------------------------------------------------------
1 | # About AIPL
2 | 
3 | AIPL is a pseudo-computer language (skin on top of Python) that makes it easy to develop prototypes for data processing tasks, with language models as first class citizens.
4 | 
5 | - [Announcement](23-announcement.md)
6 | - [Design](23-design.md)
7 | - [FAQ](23-faq.md)
8 | 


--------------------------------------------------------------------------------
/aipl/ops/nop.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop, alias
 2 | 
 3 | 
 4 | @defop('nop', None, None)
 5 | def op_nop(aipl):
 6 |     'No operation: takes no operands and returns nothing.'
 7 |     pass
 8 | 
 9 | 
10 | #@defop('identity', 0, 0)
11 | #def op_identity(aipl, v):
12 | #    return v
13 | alias('identity', 'nop')  # functionally equivalent in AIPL
14 | 


--------------------------------------------------------------------------------
/tools/vscode/README.md:
--------------------------------------------------------------------------------
1 | Loosely based on this [Redshift syntax highlighter](https://github.com/ronsoak/vsc_redshift_extension). This is rudimentary and far from complete.
2 | 
3 | ## installing
4 | 
5 | On a Unix system, copy the vscode folder to `~/.vscode/extensions/aipl-syntax` (or whatever you want to name the new dir).
6 | 


--------------------------------------------------------------------------------
/aipl/ops/pdf.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop
 2 | 
 3 | 
 4 | @defop('pdf-extract', 0, 0)
 5 | def op_pdf_extract(aipl, pdfdata:bytes) -> str:
 6 |     'Extract contents of pdf to value.'
 7 |     from pdfminer.high_level import extract_text
 8 |     from io import BytesIO
 9 |     s = BytesIO(pdfdata)
10 |     return extract_text(s)
11 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | openai
 2 | scikit-learn
 3 | numpy
 4 | trafilatura  # for html extraction
 5 | beautifulsoup4  # for xml/html extraction
 6 | pdfminer  # for pdf extraction
 7 | tiktoken  # for cost-estimation
 8 | lxml      # for xml/html extraction
 9 | lark      # for parser
10 | rich      # for --step-rich table display
11 | 


--------------------------------------------------------------------------------
/tests/globals.aipl:
--------------------------------------------------------------------------------
 1 | !format>>feigenbaum
 2 | 4.66920
 3 | !!python
 4 | from aipl import defop
 5 | from aipl.table import Table
 6 | @defop('test', 1.5, 0)
 7 | def op_test(aipl, t:Table) -> str:
 8 | 	return '42'
 9 | 
10 | !test
11 | !format>answer
12 | {feigenbaum[0].value}
13 | !test-json
14 | [{"answer": "4.66920"}]
15 | !nop
16 | 


--------------------------------------------------------------------------------
/aipl/ops/dedup.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop
 2 | 
 3 | @defop('dedup', 1, 1)
 4 | def _(aipl, v:list) -> list:
 5 |     'Deduplicate a list of scalars.'
 6 |     return list(set(v))
 7 | 
 8 | 
 9 | def test_dedup(aipl):
10 |     r = aipl.run_test('!split !dedup !sort !join', 'a b a b d c c c a b')
11 |     assert r[0].value == 'a b c d'
12 | 


--------------------------------------------------------------------------------
/examples/rewrite.aipl:
--------------------------------------------------------------------------------
 1 | # Rewrite the contents of each given file.
 2 | 
 3 | !name filename
 4 | !read-file
 5 | !split sep=\n\n maxsize=3000
 6 | !format
 7 | Rewrite this content, keeping the structure the same as a drop-in replacement:
 8 | 
 9 | """
10 | {_}
11 | """
12 | 
13 | !llm model=gpt-3.5-turbo
14 | !join
15 | !save {filename}.out
16 | 


--------------------------------------------------------------------------------
/aipl/ops/name.py:
--------------------------------------------------------------------------------
 1 | from copy import copy
 2 | 
 3 | from aipl import defop, Table
 4 | 
 5 | 
 6 | @defop('name', 1.5, 1.5)
 7 | def op_name(aipl, t:Table, name) -> Table:
 8 |     'Rename current input column to given name.'
 9 |     ret = copy(t)
10 |     ret.rows = copy(t.rows)
11 |     c = ret.current_col
12 |     c.name = name
13 |     return ret
14 | 


--------------------------------------------------------------------------------
/tools/README.md:
--------------------------------------------------------------------------------
 1 | ## Vim Syntax Highlighting
 2 | 
 3 |     mkdir -p ~/.vim/syntax
 4 |     cp tools/aipl.vim ~/.vim/syntax/aipl.vim
 5 |     mkdir -p ~/.vim/ftdetect
 6 |     cat > ~/.vim/ftdetect/aipl.vim
 7 | 
 8 |     au BufRead,BufNewFile *.aipl set filetype=aipl
 9 | 
10 | Reference: https://vim.fandom.com/wiki/Creating_your_own_syntax_files#Install_the_syntax_file
11 | 


--------------------------------------------------------------------------------
/tests/test_scripts.py:
--------------------------------------------------------------------------------
 1 | from io import StringIO
 2 | from glob import glob
 3 | import pytest
 4 | from aipl.interpreter import AIPL
 5 | 
 6 | @pytest.mark.parametrize("input_file", glob("tests/*.aipl"))
 7 | def test_script(aipl, input_file):
 8 |     aipl.stdout = StringIO()
 9 |     aipl.options.test = True
10 | 
11 |     with open(input_file) as fh:
12 |         aipl.run_test(fh.read(), '')
13 | 


--------------------------------------------------------------------------------
/aipl/test_format.aipl:
--------------------------------------------------------------------------------
 1 | # test: prompt is dedented
 2 | 
 3 | !format>golden
 4 |  1
 5 |  2
 6 |  # literal line that starts with '#'
 7 |  3
 8 | 
 9 | !format
10 | 
11 |    1
12 | # comments are not part of the prompt and can be interspersed
13 |    2
14 |    # literal line that starts with '#'
15 |    3
16 | 
17 | # leading and trailing newlines are also stripped
18 | 
19 | !test-equal {golden}
20 | 


--------------------------------------------------------------------------------
/aipl/ops/format.py:
--------------------------------------------------------------------------------
1 | from collections import ChainMap
2 | 
3 | from aipl import defop, LazyRow
4 | 
5 | @defop('format', 0.5, 0, rankin2=0)
6 | def op_format(aipl, row:LazyRow, prompt:str='') -> str:
7 |     'Format prompt text (right operand) as a Python string template; names are looked up in the row (left operand) first, then aipl.tables, then aipl.globals.'
8 |     return prompt.format_map(ChainMap(row, aipl.tables, aipl.globals))
9 | 


--------------------------------------------------------------------------------
/aipl/ops/csv.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop
 2 | from typing import List
 3 | 
 4 | # assumes header row
 5 | @defop('csv-parse', None, 1.5)
 6 | def op_csv_parse(aipl, fname:str) -> List[dict]:
 7 |     'Converts a .csv into a table of rows.'
 8 |     import csv
 9 |     with open(fname, newline='') as csvfile:
10 |         reader = csv.DictReader(csvfile)
11 |         for row in reader:
12 |             yield row
13 | 


--------------------------------------------------------------------------------
/aipl/ops/join.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop
 2 | 
 3 | from typing import List
 4 | 
 5 | @defop('join', 1, 0)
 6 | def op_join(aipl, v:List[str], sep=' ') -> str:
 7 |     'Concatenate the input strings, separated by `sep`, into a single output scalar.'
 8 |     return sep.join(v)
 9 | 
10 | def test_toplevel_join(aipl):
11 |     t = aipl.run_test('!join', 'now', 'is', 'the')
12 |     assert len(t.rows) == 1
13 |     assert t[0].value == 'now is the'
14 | 
15 | 


--------------------------------------------------------------------------------
/tests/toplevel-ravel.aipl:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | !test-input
 4 | 
 5 | pizza toppings:tomato,cheese,onion
 6 | sex:m,f,y,n
 7 | 
 8 | !split sep=\n
 9 | !split-into sep=: category members
10 | !split>member sep=,
11 | !ravel
12 | 
13 | !format << {category}: {member}
14 | !join sep=\n
15 | !test-equal
16 | pizza toppings: tomato
17 | pizza toppings: cheese
18 | pizza toppings: onion
19 | sex: m
20 | sex: f
21 | sex: y
22 | sex: n
23 | 


--------------------------------------------------------------------------------
/aipl/ops/ref.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop, Table, AIPLException
 2 | 
 3 | 
 4 | @defop('ref', 1.5, 1.5)
 5 | def op_ref(aipl, t:Table, name):
 6 |     'Move column on table to end of columns list (becoming the new .value); raise AIPLException if it is not among the columns.'
 7 |     col = t.get_column(name)
 8 |     if col not in t.columns:
 9 |         raise AIPLException(f'no such column {name}')
10 |     # the table is modified in place: detach the column, then add it back
11 |     t.columns.remove(col)
12 |     t.add_column(col)
13 |     return t
14 | 


--------------------------------------------------------------------------------
/aipl/test_db.py:
--------------------------------------------------------------------------------
 1 | from . import Database
 2 | 
 3 | 
 4 | def test_db():
 5 |     import tempfile  # a temporary file backs the database for this test
 6 |     with tempfile.NamedTemporaryFile() as f:
 7 |         with Database(f.name) as db:
 8 |             db.insert('people', id=10, name='James Jones')
 9 |             db.insert('people', id=11, name='Maria Garcia')
10 |             db.insert('people', id=12, name='Michael Smith')
11 |         # open the same file again and check what the first handle inserted
12 |         db = Database(f.name)
13 |         assert len(db.table('people')) == 3
14 |         assert db.query('SELECT * FROM people WHERE id=?', 12)[0].name == 'Michael Smith'
15 | 


--------------------------------------------------------------------------------
/examples/git-commit.aipl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bin/aipl
 2 | 
 3 | !format
 4 | 
 5 | gpt-3.5-turbo
 6 | #gpt-4
 7 | 
 8 | !split>model
 9 | 
10 | !ravel
11 | 
12 | # write commit msg for git diff
13 | !sh git diff --cached
14 | !format
15 | Add a commit subject and message that explains the following commit.
16 | Keep same exact commit format, as it will be piped directly into git.
17 | Be terse.
18 | 
19 | """
20 | {_}
21 | """
22 | 
23 | !llm model={model}
24 | !print
25 | 
26 | !format
27 | {_}
28 | 
29 | [Commit message generated by {model}]
30 | 
31 | !join sep=\n\n
32 | 
33 | !shtty git commit -v -m {_} --edit
34 | 


--------------------------------------------------------------------------------
/aipl/ops/input.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !require-input prompts the user for input, if none provided in the script.
 3 | '''
 4 | 
 5 | import sys
 6 | 
 7 | from aipl import defop, Table
 8 | 
 9 | 
10 | @defop('require-input', 100, 100)
11 | def op_require_input(aipl, t:'Table', prompt=''):
12 |     'Ensure there is any input at all; if not, display the prompt and read input from the user.'
13 |     if len(t.rows) == 0 or not t[0].value:
14 |         print(prompt, file=sys.stderr)
15 |         print('Ctrl+D to end input', file=sys.stderr)
16 |         return Table([{'input':sys.stdin.read().strip('\n')}])
17 |     return t
18 | 


--------------------------------------------------------------------------------
/aipl/ops/take.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !take `n` returns a copy of an input `Table` with only the first `n` rows.
 3 | '''
 4 | 
 5 | from copy import copy
 6 | 
 7 | from aipl import defop
 8 | from aipl.table import Table
 9 | 
10 | @defop('take', 1.5, 1.5)
11 | def op_take(aipl, t:Table, n=1) -> Table:
12 |     'Return a table with first n rows of `t`'
13 |     ret = copy(t)
14 |     ret.rows = t.rows[:n]
15 |     return ret
16 | 
17 | 
18 | def test_take(aipl):
19 |     r = aipl.run_test('!take 2', '1 2 3', '4 5 6', '7 8 9')
20 |     assert len(r.rows) == 2
21 |     assert r[0].value == '1 2 3'
22 |     assert r[1].value == '4 5 6'
23 | 


--------------------------------------------------------------------------------
/aipl/ops/sample.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !sample <n>
 3 | 
 4 | Replace input table with table of n sampled rows.
 5 | '''
 6 | 
 7 | from aipl import defop, Table
 8 | 
 9 | 
10 | @defop('sample', 1.5, 1.5)
11 | def op_sample(aipl, t:Table, n:int=1) -> Table:
12 |     'Sample n random rows from the input table.'
13 |     import random
14 |     return Table(random.sample(t.rows, n), parent=t)
15 | 
16 | 
17 | def test_sample(aipl):
18 |     r = aipl.run_test('!split !sample 2', 'a b c', 'd e f')
19 |     assert len(r[0].value) == 2
20 |     for row in r[0].value:
21 |         assert row.value in 'abc'
22 |     for row in r[1].value:
23 |         assert row.value in 'def'
24 | 


--------------------------------------------------------------------------------
/aipl/ops/cluster.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !cluster will partition input vectors into n clusters,
 3 | using k-means clustering which minimises within cluster
 4 | variances.
 5 | '''
 6 | 
 7 | from typing import List
 8 | 
 9 | from aipl import defop
10 | 
11 | @defop('cluster', 1, 1)
12 | def op_cluster(aipl, v:List[List[float]], n=10):
13 |     'Cluster the input vectors into n clusters with k-means; return one integer label per input.'
14 |     import numpy as np
15 |     from sklearn.cluster import KMeans
16 |     # stack the input vectors into a single array, one row per input
17 |     matrix = np.vstack(v)
18 |     kmeans = KMeans(n_clusters=n, init='k-means++', random_state=42, n_init='auto')
19 |     kmeans.fit(matrix)
20 |     # convert the fitted labels to plain Python ints
21 |     return [int(x) for x in kmeans.labels_]
22 | 


--------------------------------------------------------------------------------
/aipl/ops/url.py:
--------------------------------------------------------------------------------
 1 | from urllib.parse import urlparse, urlunparse
 2 | 
 3 | from aipl import defop
 4 | 
 5 | 
 6 | @defop('url-split', 0, 0.5)
 7 | def op_url_split(aipl, url:str) -> dict:
 8 |     'Split url into components (scheme, netloc, path, params, query, fragment).'
 9 |     r = urlparse(url)
10 |     return dict(scheme=r.scheme,
11 |                netloc=r.netloc,
12 |                path=r.path,
13 |                params=r.params,
14 |                query=r.query,
15 |                fragment=r.fragment)
16 | 
17 | 
18 | @defop('url-defrag', 0, 0)
19 | def op_url_defrag(aipl, url:str) -> str:
20 |     'Return url reassembled with its fragment set to the empty string.'
21 |     return urlunparse(urlparse(url)._replace(fragment=''))
22 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "aipl"
 3 | version = "0.1.0"
 4 | description = "Array-Inspired Pipeline Language"
 5 | authors = ["Saul Pwanson <code@saul.pw>"]
 6 | license = "Proprietary"
 7 | readme = "README.md"
 8 | 
 9 | [tool.poetry.dependencies]
10 | python = "^3.8"
11 | openai = "^0.27.6"
12 | scikit-learn = "^1.2.2"
13 | numpy = "^1.24.3"
14 | trafilatura = "^1.6.0"
15 | beautifulsoup4 = "^4.12.2"
16 | lxml = "^4.9.2"
17 | lark = "^1.1.5"
18 | rich = "^13.4.1"
19 | 
20 | [tool.poetry.group.dev.dependencies]
21 | pytest = "^7.3.1"
22 | 
23 | [build-system]
24 | requires = ["poetry-core"]
25 | build-backend = "poetry.core.masonry.api"
26 | 
27 | [tool.poetry.scripts]
28 | aipl = "aipl.main:main"
29 | 


--------------------------------------------------------------------------------
/aipl/ops/filter.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !filter returns the table, containing only the rows
 3 | that were truthy in the value column.
 4 | The value column is then discarded.
 5 | '''
 6 | from copy import copy
 7 | 
 8 | from aipl import defop
 9 | from aipl.table import Table
10 | 
11 | @defop('filter', 1.5, 1.5)
12 | def op_filter(aipl, t:Table) -> Table:
13 |     'Return copy of table, keeping only rows whose value is Truthy.'
14 |     ret = copy(t)
15 |     ret.rows = [r._row for r in t if r.value]
16 |     ret.columns = ret.columns[:-1]  # discard bool column
17 |     return ret
18 | 
19 | def test_filter(aipl):
20 |     r = aipl.run_test('!match ^c !filter', 'a b c', 'b c d', 'c d e')
21 |     assert len(r.rows) == 1
22 |     assert r[0].value == 'c d e'
23 | 


--------------------------------------------------------------------------------
/tools/vscode/language-configuration.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "comments": {
 3 |         // symbol used for single line comment. Remove this entry if your language does not support line comments
 4 |         "lineComment": "#",
 5 |     },
 6 |     // symbols used as brackets
 7 |     "brackets": [
 8 |         ["{", "}"],
 9 |         ["[", "]"],
10 |         ["(", ")"]
11 |     ],
12 |     // symbols that are auto closed when typing
13 |     "autoClosingPairs": [
14 |         ["{", "}"],
15 |         ["[", "]"],
16 |         ["(", ")"],
17 |         ["\"", "\""],
18 |         ["'", "'"]
19 |     ],
20 |     // symbols that can be used to surround a selection
21 |     "surroundingPairs": [
22 |         ["{", "}"],
23 |         ["[", "]"],
24 |         ["(", ")"]
25 |     ]
26 | }
27 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: aipl-testing
 2 | on:
 3 |   pull_request:
 4 |     branches:
 5 |       - develop
 6 |   push:
 7 |     branches:
 8 |       - develop
 9 | 
10 | jobs:
11 |   run-tests:
12 | 
13 |     strategy:
14 |       matrix:
15 |         python-version: ["3.10", "3.11"]
16 | 
17 |     runs-on: ubuntu-latest
18 |     steps:
19 |     - uses: actions/checkout@v2
20 | 
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v2
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 | 
26 |     - name: Install
27 |       run: |
28 |         pip3 install .
29 |         pip3 install pytest
30 | 
31 |     - name: Ensure the exe starts up
32 |       run: aipl -h
33 | 
34 |     - name: Run pytests
35 |       run: pytest .
36 | 


--------------------------------------------------------------------------------
/tools/vscode/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "vsc-aipl-extension",
 3 |     "displayName": "vsc_aipl_extension",
 4 |     "description": "A VSC extension to support basic AIPL syntax highlighting",
 5 |     "publisher": "0",
 6 |     "version": "0.0.1",
 7 |     "engines": {
 8 |         "vscode": "^1.35.0"
 9 |     },
10 |     "categories": [
11 |         "Programming Languages"
12 |     ],
13 |     "contributes": {
14 |         "languages": [{
15 |             "id": "aipl", 
16 |             "aliases": ["AIPL", "aipl"],
17 |             "extensions": [".aipl"],
18 |             "configuration": "./language-configuration.json"
19 |         }],
20 |         "grammars": [{
21 |             "language": "aipl",
22 |             "scopeName": "source.aipl",
23 |             "path": "./syntaxes/aipl.tmLanguage.json"
24 |         }]
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/examples/rowan/load-json-v4.aipl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bin/aipl
 2 | 
 3 | # loads a JSON file from a hardcoded URL and accesses a sample from a list inside (AIPL idiomatic), filtering on string matching
 4 | 
 5 | # allow user to select if they want to filter by ironic=True/1 or False/0
 6 | !require-input
 7 | ironic? 1 or 0
 8 | !split>choice sep=\n
 9 | 
10 | # load the JSON
11 | !format
12 | https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/irony_identification/task.json
13 | !fetch-url
14 | 
15 | # access 'examples' list in JSON blob
16 | !json-parse examples=examples
17 | 
18 | # map each element to its 'input' attribute
19 | !format>statement
20 | {input}
21 | 
22 | # !ravel
23 | !format
24 | {target_scores_ironic}: {statement}
25 | 
26 | # filter to only ironic or non-ironic statements
27 | !match {choice}:
28 | !filter
29 | !print
30 | 


--------------------------------------------------------------------------------
/aipl/ops/db.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop, Database
 2 | 
 3 | 
 4 | @defop('dbopen', None, 0)
 5 | def op_dbopen(aipl, url:str):
 6 |     'Open connection to database.'
 7 |     return Database(url)
 8 | 
 9 | @defop('dbquery', 0.5, 1.5)
10 | def op_dbquery(aipl, row:'LazyRow', dbname:str, tblname:str, *colnames, **kwargs):
11 |     'Query database table.'
12 |     for r in aipl.globals[dbname].select(tblname, **kwargs):
13 |         yield {colname:r[colname] for colname in colnames}
14 | 
15 | 
@defop('dbdrop', None, None)
def op_dbdrop(aipl, tblname:str):
    'Drop the named table from the output database, if it exists.'
    # NOTE: tblname is interpolated into the SQL text verbatim.
    statement = 'DROP TABLE IF EXISTS ' + f'{tblname}'
    aipl.output_db.sql(statement)
20 | 
21 | 
@defop('dbinsert', 0.5, None)
def op_dbinsert(aipl, row, tblname:str, **kwargs):
    'Insert the fields of row, plus any extra key=value args, into the output database table.'
    fields = row._asdict()
    # unpack both mappings separately so a key present in both still raises TypeError
    aipl.output_db.insert(tblname, **fields, **kwargs)
26 | 


--------------------------------------------------------------------------------
/examples/saulpw/crossword-log.aipl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env aipl
 2 | 
 3 | # Usage:
 4 | #  $0 book="Book Title" attempted_date="2022-2023"
 5 | 
 6 | !regex-capture
 7 |    (?P<A1_D1>\w+) ?(?P<note>.*)?
 8 | 
 9 | !regex-translate>rating
10 | # rating:
11 | # -2: hated
12 | # -1: didn't like
13 | #  0: attempted, unfinished
14 | # +1: completed
15 | #  2: liked
16 |   \* 0
17 |   \+ +2
18 |   \- -1
19 |   ^$ +1
20 | 
21 | !ravel
22 | 
23 | !!python
24 | from dateutil.parser import parse
25 | def date(s):
26 |     return parse(s)
27 | 
28 | !!dbopen>gxd gxd.sqlite
29 | !dbquery gxd puzzles xdid A1_D1={A1_D1}
30 | 
31 | !python-expr>dotw date('{xdid}'[3:]).strftime('%A')
32 | 
33 | !format>attempted_date
34 | {attempted_date}
35 | !format>book
36 | {book}
37 | 
38 | !columns  attempted_date dotw xdid A1_D1 rating book
39 | !dbdrop attempted_puzzles
40 | !dbinsert attempted_puzzles
41 | 


--------------------------------------------------------------------------------
/tests/test-named-ravel.aipl:
--------------------------------------------------------------------------------
 1 | !test-input
 2 | 
 3 | a b c
 4 | d e f g
 5 | 
 6 | # for an op with rankout=1, !op>var1>var2 will name the deepest (scalar) 'inner column' var2, and the outer column that contains the vector var1.
 7 | 
 8 | !split>line sep=\n
 9 | 
10 | # make sure columns are named correctly
11 | # and that named columns remain visible to a top-level json
12 | 
13 | !test-json
14 | [{"_": [{"line": "a b c"}, {"line": "d e f g"}]}]
15 | 
16 | !split>chars>char
17 | 
18 | !test-json
19 | 
20 | [{
21 |     "_": [
22 |        {"line": "a b c",   "chars": [{"char": "a"}, {"char": "b"}, {"char": "c"}]},
23 |        {"line": "d e f g", "chars": [{"char": "d"}, {"char": "e"}, {"char": "f"}, {"char": "g"}]}
24 |      ]
25 | }]
26 | 
27 | !ravel>letter
28 | 
29 | !join>out sep=.
30 | !columns out
31 | 
32 | !test-json
33 | [{
34 |     "out": "a.b.c.d.e.f.g"
35 | }]
36 | 
37 | !print
38 | 


--------------------------------------------------------------------------------
/tools/translate-dialect.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | from aipl import AIPL
 5 | 
 6 | 
 7 | def dialectize(cmd:Command) -> str:
 8 |     ret = f'!{cmd.opname}'
 9 |     if cmd.args:
10 |         ret += ' ' + ' '.join(str(arg) for arg in cmd.args)
11 |     if cmd.kwargs:
12 |         ret += ' ' + ' '.join(f'{k}={v}' for k,v in cmd.kwargs.items())
13 | 
14 |     ret += '\n'
15 |     if cmd.prompt:
16 |         ret += cmd.prompt + '\n\n'
17 |     return ret
18 | 
19 | 
def main(*args):
    '''Rewrite each file named in args in place, as re-rendered AIPL commands.

    Each file is read and parsed completely before it is reopened for
    writing, so a read or parse error leaves the original file untouched
    instead of truncated.
    '''
    aipl = AIPL()
    for fn in args:
        with open(fn) as infp:
            code = infp.read()

        cmds = aipl.parse(code)
        # XXX: need to handle comments and outputting a particular dialect
        # each rendered command is followed by one extra newline, as print() would add
        output = ''.join(dialectize(cmd) + '\n' for cmd in cmds)

        with open(fn, 'w') as outfp:
            outfp.write(output)
29 | 
30 | 
31 | main(*sys.argv[1:])
32 | 


--------------------------------------------------------------------------------
/aipl/ops/sh.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop, Table
 2 | 
 3 | 
 4 | @defop('sh', 0, 1.5)
 5 | def op_sh(aipl, cmdline:str, **kwargs) -> dict:
 6 |     'Run the command described by args.  Return (retcode, stderr, stdout) columns.'
 7 |     import subprocess
 8 |     r = subprocess.run(cmdline, shell=True, text=True,
 9 | #                       stdin=subprocess.PIPE,
10 |                        stdout=subprocess.PIPE,
11 |                        stderr=subprocess.PIPE)
12 |     return Table([dict(retcode=r.returncode,
13 |                 stderr=r.stderr,
14 |                 stdout=r.stdout)])
15 | 
@defop('shtty', None, 0.5)
def op_shtty(aipl, _:'LazyRow', *args) -> dict:
    'Run the command given by args without a shell, leaving stdout uncaptured.  Return (retcode, stderr) columns.'
    import subprocess
    # only stderr is piped; stdout is not redirected
    proc = subprocess.run(args, text=True, stderr=subprocess.PIPE)
    return {'retcode': proc.returncode, 'stderr': proc.stderr}
24 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | from setuptools import setup, find_packages
 4 | from pathlib import Path
 5 | 
 6 | 
 7 | def readme():
 8 |     with open('README.md') as f:
 9 |         return f.read()
10 | 
def requirements():
    'Return the lines of requirements.txt (current directory) as a list, without line endings.'
    with open('requirements.txt') as fp:
        contents = fp.read()
    return contents.splitlines()
13 | 
14 | 
# Package metadata.  README.md and requirements.txt are read from the current
# directory at call time via the helpers above.
setup(
    name="AIPL",
    version="0.1",
    description="A tiny DSL to make it easier to explore and experiment with AI pipelines.",
    long_description=readme(),
    long_description_content_type="text/markdown",
    python_requires=">=3.10",
    # NOTE(review): "aipl" is listed here as a module and is presumably also
    # found as a package by find_packages() below -- confirm both are needed.
    py_modules=["aipl"],
    scripts=['bin/aipl'],
    install_requires=requirements(),
    packages=find_packages(),
    author="Saul Pwanson",
    url="https://github.com/saulpw/aipl",
    classifiers=[
        "Development Status :: 4 - Beta",
        "Programming Language :: Python :: 3",
    ],
    keywords="GPT aipl visidata array",
)
34 | 


--------------------------------------------------------------------------------
/aipl/ops/def.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !!def <opname>
 3 |  !op1
 4 |  !op2
 5 | 
 6 | Create a new op named <opname> that runs the AIPL in the prompt when invoked.
 7 | '''
 8 | 
 9 | from aipl import defop, Table
10 | 
11 | 
@defop('def', 0, None)  # immediate
def op_def(aipl, prompt, opname):
    'Register a composite operator named opname that runs the cmds parsed from prompt (must be indented).'
    cmds = aipl.parse(prompt)

    # The composite takes its input rank from the first command and its
    # output rank from the last one.
    head_op = cmds[0].op
    tail_op = cmds[-1].op

    @defop(opname, rankin=head_op.rankin, rankout=tail_op.rankout)
    def new_operator(aipl, *args, **kwargs):
        if cmds[0].op.rankin is None:
            # first command takes no operand: start from an empty input
            t = aipl.new_input()
            extra = args
        else:
            # first positional arg is the operand; the rest are passed through
            t = aipl.new_input(args[0])
            extra = args[1:]
        results = aipl.run_cmdlist(cmds, [t], *extra)
        return results[-1][0].value
28 | 
29 | 
def test_def(aipl):
    'A composite of !split then !join should round-trip each input string.'
    script = '''
!!def split-join
 !split
 !join

!split-join
'''
    inputs = ('a b c', 'd e f')
    r = aipl.run_test(script, *inputs)
    first, second = r[0], r[1]
    assert first.value == 'a b c'
    assert second.value == 'd e f'
40 | 


--------------------------------------------------------------------------------
/aipl/ops/split.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop
 2 | 
 3 | from typing import List
 4 | 
 5 | @defop('split', 0, 1)
 6 | def op_split(aipl, v: str, sep:str=None, maxsize:int=0, trim=False) -> List[str]:
 7 |     'Split text into chunks based on sep, keeping each chunk below maxsize.'
 8 |     if trim:
 9 |         v = v.strip()
10 |     win = []
11 |     tot = 0
12 |     for i, unit in enumerate(v.split(sep)):
13 |         n = len(unit)
14 |         if tot+n > int(maxsize):
15 |             if win:
16 |                 yield (sep or ' ').join(win)
17 |                 win = []
18 |                 tot = 0
19 | 
20 |         win.append(unit)
21 |         tot += n
22 | 
23 |     if win:
24 |         yield (sep or ' ').join(win)
25 | 
26 | 
@defop('split-into', 0, 0.5)
def op_split_into(aipl, v:str, *args, sep=None) -> dict:
    'Split text by sep and pair the pieces, in order, with the given column names.'
    pieces = v.split(sep)
    # zip stops at the shorter of the two, so surplus names or pieces are dropped
    return {colname: piece for colname, piece in zip(args, pieces)}
31 | 
def test_split_join(aipl):
    'Splitting, keeping three words, and joining yields a single row of those words.'
    t = aipl.run_test('!split !take 3 !join', 'now is the time')
    nrows = len(t.rows)
    assert nrows == 1
    assert t[0].value == 'now is the'
36 | 
37 | 


--------------------------------------------------------------------------------
/LICENSE.mit:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2023  Saul Pwanson and the Devottys
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/aipl/ops/ravel.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from copy import copy
 3 | 
 4 | 
 5 | from aipl import defop
 6 | from aipl.table import Table, Column
 7 | 
 8 | 
 9 | @defop('ravel', 100, 1.5)
10 | def op_ravel(aipl, v:Table, rank=0) -> Table:
11 |     'All of the leaf scalars in the value column become a single 1-D array.'
12 |     def _ravel(t:Table, newkey:str, parent=None) -> List['Scalar']:
13 |         for row in t:
14 |             if isinstance(row.value, Table) and row.value.rank > rank:
15 |                 yield from _ravel(row.value, newkey, parent=row)
16 |             else:
17 |                 if '__parent' not in row._row and parent is not None:
18 |                     row._row['__parent'] = parent
19 | 
20 |                 yield row
21 | 
22 |     newkey = aipl.unique_key
23 |     ret = Table(parent=v)
24 |     for row in _ravel(v, newkey):
25 |         ret.rows.append(row._row)
26 | 
27 |         for c in row._table.columns:
28 |             ret.add_column(copy(c))
29 |     return ret
30 | 
31 | 
def test_ravel(aipl):
    'Raveling the first two words of each input and joining gives one combined string.'
    script = '!split !take 2 !ravel !join'
    t = aipl.run_test(script, 'a b c d', 'e f g')
    first = t[0]
    assert first.value == 'a b e f'
35 | 


--------------------------------------------------------------------------------
/aipl/ops/columns.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !columns takes a space-separated list of columns
 3 | in the current table, and returns a copy of the table
 4 | with only those columns.
 5 | Akin to SQLite SELECT.
 6 | '''
 7 | 
 8 | from copy import copy
 9 | 
10 | from aipl import defop, Table, Column
11 | 
12 | 
@defop('columns', 1.5, 1.5)
def op_columns(aipl, t:'Table', *colnames, **renamedcols) -> Table:
    'Create new table containing only these columns; `new=old` args rename a column.'
    # (source name, output name) pairs: positional names keep their name,
    # keyword args map output name -> source name.
    pairs = [(name, name) for name in colnames]
    pairs += [(src, dst) for dst, src in renamedcols.items()]

    ret = copy(t)
    ret.rows = []
    for row in t:
        newrow = {'__parent': row}
        selected = {dst: row[src] for src, dst in pairs}
        newrow.update(selected)
        ret.rows.append(newrow)

    for _, dst in pairs:
        ret.add_column(Column(dst))

    return ret
30 | 
def test_columns(aipl):
    'After splitting and raveling, the named outer column is still selectable for every leaf row.'
    script = '!name letters !split !ravel !columns letters'
    r = aipl.run_test(script, 'a b c', 'd e f')
    assert r[0].value == 'a b c'
    assert r[3].value == 'd e f'
    assert len(r) == 6
36 | 


--------------------------------------------------------------------------------
/examples/benchmarks/bigbench-binary-classification.aipl:
--------------------------------------------------------------------------------
 1 | # in: list of models; out: % accuracy in classifying the given task
 2 | 
 3 | !csv-parse model-task.csv
 4 | 
 5 | !format
 6 | https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/{task}/task.json
 7 | !fetch-url
 8 | # name=name description=description 
 9 | !json-parse examples=examples
10 | 
11 | !format>statement
12 | {input}
13 | !take 20
14 | 
15 | # try these tasks without any prompt context and see what happens!
16 | !format>zero-shot
17 | {statement}
18 | ---
19 | Classify with 1 if yes, 0 if no.
20 | Classification: 
21 | !llm>classification model={model} max_tokens=1
22 | 
23 | # TODO: be able to look at responses per-model; currently can't tell what model had what classification
24 | # !format
25 | # {model} {classification} ({target_scores_Yes}): {statement}
26 | !format
27 | {classification} ({target_scores_Yes}): {statement}
28 | !print
29 | 
30 | !metrics-accuracy>accuracy classification target_scores_Yes
31 | !format
32 | {model:15} {task:25} {accuracy:.2f}
33 | !print
34 | # !columns zero-shot classification target_scores_Yes
35 | # !json 2
36 | # !save {model}_{task}.json
37 | 
38 | !print


--------------------------------------------------------------------------------
/aipl/ops/cross.py:
--------------------------------------------------------------------------------
 1 | from aipl import defop, Table, SubColumn, LazyRow
 2 | 
# AIPL script exercising !cross: builds table t1 from "a b c", then crosses
# the split of "d e f" with it and checks the joined, formatted pairs.
__test__ = '''
!test-input
a b c
!split>col1
!table t1
!test-input
d e f
!split>col2
!cross <<t1
!format
  {col1}/{col2}
!ravel
!join
!test-equal
a/d b/d c/d a/e b/e c/e a/f b/f c/f
'''

def test_cross(aipl):
    'Run the __test__ script above through the test runner.'
    aipl.run_test(__test__)
22 | 
23 | 
def iterate_tables(t:Table, rankin=1):
    'Yield t itself if its rank is at most rankin; otherwise recurse into the value of each of its rows.'
    if t.rank > rankin:
        for row in t:
            yield from iterate_tables(row.value, rankin=rankin)
        return

    yield t
30 | 
31 | 
@defop('cross', 0.5, 1.5, rankin2=100)
def op_cross(aipl, row:LazyRow, t:Table) -> Table:
    'Construct cross-product of left and right inputs (pass right input via `<<tablename`).'
    ret = Table()
    last_right = None   # stays None when the right input yields no tables at all
    for tright in iterate_tables(t):
        last_right = tright
        for rightrow in tright:
            ret.rows.append(dict(__parent=row, left=row._row, right=rightrow._row))

    # left columns are available automatically or from __parent.
    # Right columns are taken from the last right-hand table visited; with no
    # right-hand tables there is nothing to add (previously an unbound-name error).
    if last_right is not None:
        for c in last_right.columns:
            ret.add_column(SubColumn('right', c))

    return ret
45 | 


--------------------------------------------------------------------------------
/aipl/ops/test.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !test- used for testing .aipl scripts.
 3 | '''
 4 | 
 5 | import json
 6 | 
 7 | from aipl import defop, AIPLException, Table
 8 | 
 9 | @defop('test-input', 100, 1.5, rankin2=0)  # immed
10 | def op_test_input(aipl, t:Table, prompt=''):
11 |     'In test mode, replace input with prompt.'
12 |     if aipl.options.test:
13 |         return aipl.new_input(prompt)
14 |     return t
15 | 
16 | 
@defop('test-equal', 0, None, rankin2=0)
def op_test_equal(aipl, v:str, prompt=''):
    'In test mode, raise AIPLException if value differs from prompt.'
    mismatch = v != prompt
    if aipl.options.test and mismatch:
        # the message reports the actual value
        raise AIPLException('assert failed! value not equal:\n' + v)
23 | 
24 | 
@defop('test-json', 100, None, rankin2=0)
def op_test_json(aipl, t:Table, prompt:str=''):
    'Raise AIPLException unless the table, converted via _asdict(), equals the JSON blob in prompt.'
    actual = t._asdict()
    expected = json.loads(prompt)
    if actual == expected:
        return

    # default=str renders any value json cannot encode natively via str()
    rendered = json.dumps(actual, default=str)
    raise AIPLException('assert failed! value not equal\n  ' + rendered)
36 | 


--------------------------------------------------------------------------------
/examples/benchmarks/bigbench-binary-classification-local.aipl:
--------------------------------------------------------------------------------
 1 | # in: list of models; out: % accuracy in classifying the given task
 2 | 
 3 | !csv-parse model-task.csv
 4 | 
 5 | !format
 6 | https://raw.githubusercontent.com/google/BIG-bench/main/bigbench/benchmark_tasks/{task}/task.json
 7 | !fetch-url
 8 | # name=name description=description 
 9 | !json-parse examples=examples
10 | 
11 | !format>statement
12 | {input}
13 | !take 20
14 | 
15 | # try these tasks without any prompt context and see what happens!
16 | !format>zero-shot
17 | {statement}
18 | ---
19 | Classify with 1 if yes, 0 if no.
20 | Classification: 
21 | !llm-local>classification model={model} max_tokens=2
22 | 
23 | # TODO: be able to look at responses per-model; currently can't tell what model had what classification
24 | # !format
25 | # {model} {classification} ({target_scores_Yes}): {statement}
26 | !format
27 | {classification} ({target_scores_Yes}): {statement}
28 | !print
29 | 
30 | !metrics-accuracy>accuracy classification target_scores_Yes
31 | !format
32 | {model:15} {task:25} {accuracy:.2f}
33 | !print
34 | # !columns zero-shot classification target_scores_Yes
35 | # !json 2
36 | # !save {model}_{task}.json
37 | 
38 | !print


--------------------------------------------------------------------------------
/aipl/ops/regex.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from aipl import defop
 4 | 
 5 | 
 6 | def preprompt_capture(prompt:str='') -> re.Pattern:
 7 |     import re
 8 |     return re.compile(prompt)
 9 | 
@defop('regex-capture', 0, 0.5, preprompt=preprompt_capture)
def regex_capture(aipl, v:str, prompt:re.Pattern=None) -> dict:
    'Match the prompt regex at the start of the value; return its named groups, or {} if no match.'
    m = prompt.match(v)
    return m.groupdict() if m else {}
17 | 
18 | 
def preprompt_translate(prompt:str=''):
    '''Parse translation rules from prompt into a list of (compiled regex, output) pairs.

    Each non-blank line holds a regex and its output, separated by
    whitespace; everything after the first run of whitespace is the output.
    Blank or whitespace-only lines are skipped rather than failing to unpack.
    Raises ValueError for a line that has a regex but no output.
    '''
    import re
    rules = []
    for line in prompt.splitlines():
        if not line.strip():
            continue  # blank line: no rule here
        regex, output = line.split(maxsplit=1)
        rules.append((re.compile(regex), output))

    return rules
27 | 
@defop('regex-translate', 0, 0, preprompt=preprompt_translate)
def regex_translate(aipl, v:str, prompt:list):
    r'''Return the output of the first rule whose regex matches the start of the input, else the input unchanged.
    Rules are given in the prompt, one per line, with regex and output separated by whitespace:
        \bDr\.?\b Doctor
        \bJr\.?\b Junior
    '''
    for pattern, replacement in prompt:
        if pattern.match(v):
            return replacement
    return v
39 | 


--------------------------------------------------------------------------------
/about/roadmap.md:
--------------------------------------------------------------------------------
 1 | # Roadmap
 2 | 
 3 | ## Overall goals
 4 | 
 5 | - To compile and curate the hundreds of operators into a single place,
 6 |    - to design them to interoperate with each other in straightforward-by-default ways.
 7 |    - to port concepts from langchain and elsewhere as needed, clarifying and testing and upgrading them.
 8 | 
 9 | - To compile and curate recipes for dozens of well-commented reference chains,
10 | 
11 | - To allow a smart but "not a programmer" end user to take a reference recipe and tweak it for their own custom use;
12 |    - to allow them to get moving on a prototype for their idea immediately;
13 |    - to allow them to see into the process step-by-step;
14 | 
15 | - To provide them a platform for experimentation and small-scale production.
16 | 
17 |    - to provide an incentive and process for them to submit their work, for others to learn, and also to potentially improve the reference;
18 |    - to compile test suites, so we can run A/B tests and gather data on specific prompt improvements, to continually optimize the reference chains.
19 | 
20 | operators + scripts + models + standardized tests = aipl
21 | 
22 | So that an AI proof-of-concept can be thrown together in an evening.
23 | 


--------------------------------------------------------------------------------
/aipl/ops/read.py:
--------------------------------------------------------------------------------
 1 | from urllib.parse import urlparse, urlunparse
 2 | 
 3 | from aipl import defop, dbcache, stderr, alias
 4 | 
 5 | 
 6 | @dbcache
 7 | def _fetch_url_bytes(aipl, url:str) -> bytes:
 8 |     import urllib.request
 9 |     stderr(f'fetching {url}...')
10 |     with urllib.request.urlopen(url) as resp:
11 |         return resp.read()
12 | 
13 | 
@dbcache
def _fetch_url(aipl, url:str) -> str:
    'Fetch url via trafilatura and return whatever it returns for the page.'
    from trafilatura import fetch_url
    stderr(f'fetching {url}...')
    # trafilatura guesses at decoding and other helpful things
    contents = fetch_url(url)
    return contents
20 | 
21 | 
@defop('read', 0, 0)
def op_read(aipl, url:str) -> str:
    'Return contents of URL or local filename as text.'
    if '://' in url:
        # strip any #fragment before fetching
        url = urlunparse(urlparse(url)._replace(fragment=''))
        return _fetch_url(aipl, url)

    # context manager so the file handle is closed promptly
    with open(url) as fp:
        return fp.read()
30 | 
31 | 
@defop('read-bytes', 0, 0)
def op_read_bytes(aipl, url:str) -> bytes:
    'Return contents of URL or local filename as bytes.'
    if '://' in url:
        # strip any #fragment before fetching
        url = urlunparse(urlparse(url)._replace(fragment=''))
        # _fetch_url_bytes takes (aipl, url); omitting aipl raised TypeError
        return _fetch_url_bytes(aipl, url)

    # context manager so the file handle is closed promptly
    with open(url, mode='rb') as fp:
        return fp.read()
40 | 
41 | alias('fetch-url', 'read')
42 | 


--------------------------------------------------------------------------------
/aipl/ops/groupby.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !groupby <colname...>
 3 | 
 4 | Group rows by given named columns, with output value for each key being table of corresponding rows.
 5 | '''
 6 | 
 7 | from copy import copy
 8 | from collections import defaultdict
 9 | 
10 | from aipl import defop
11 | from aipl.table import Table, Column
12 | 
13 | 
@defop('groupby', 1.5, 1.5)
def op_groupby(aipl, t:Table, *args) -> Table:
    'Group rows by the values of the columns named in args; each output row holds the key values and a table of its rows.'
    buckets = defaultdict(list)  # tuple of key values -> list of rowdict
    for row in t:
        groupkey = tuple(row[colname] for colname in args)
        buckets[groupkey].append(row._row)

    ret = Table()

    newkey = aipl.unique_key   # name of the column that holds each group's sub-table
    for groupkey, members in buckets.items():
        # each group is a copy of the input table restricted to its own rows
        subtable = copy(t)
        subtable.rows = members

        outrow = dict(zip(args, groupkey))
        outrow[newkey] = subtable
        ret.rows.append(outrow)

    for colname in args:
        ret.add_column(Column(colname, colname))
    ret.add_column(Column(newkey))

    return ret
37 | 
38 | 
def test_groupby(aipl):
    'Six rows over three distinct names group into three rows.'
    people = ['Bob 4', 'Alice 3', 'Carol 8', 'Bob 2', 'Alice 5', 'Bob 1']
    r = aipl.run_test('!split-into name num  !groupby name', *people)
    ngroups = len(r)
    assert ngroups == 3
42 | 


--------------------------------------------------------------------------------
/docs/writing-operators.md:
--------------------------------------------------------------------------------
 1 | # Writing a New Operator in Python
 2 | 
 3 | In AIPL you can use !!python to add a new operator.  For instance, here's the definition of `!lower`, a scalar string to string operator:
 4 | 
 5 |     !!python
 6 |     @defop('lower', rankin=0, rankout=0)
 7 |     def _(aipl, v:str) -> str:
 8 |         return v.lower()
 9 | 
10 | ## Operators internal to the AIPL codebase
11 | 
12 | All .py files in aipl.ops are imported automatically.
13 | You can use the exact same code from the prompt above.
14 | 
15 | Each and every operator internal to the aipl codebase should have:
16 | 
17 |   - Full docs for operator in the file's docstring, including any subtleties or warts
18 |   - Concise docs in function's docstring.
19 |   - At least one basic test and demonstration of functionality
20 | 
21 | Any imports of external libraries should be done within the operator itself, not at toplevel.
22 | 
23 | ## Full Example: `aipl/ops/lower.py`
24 | 
25 |     '''
26 |     !lower converts the input string to lowercase.
27 |     Unicode cased characters are supported per [Python str.lower]().
28 |     '''
29 | 
30 |     from aipl import defop
31 | 
32 | 
33 |     @defop('lower', rankin='scalar', rankout='scalar')
34 |     def _(aipl, v:str) -> str:
35 |         'Convert the input string to lowercase.'
36 |         return v.lower()
37 | 
38 | 
39 |     def test_lower(aipl):
40 |         r = aipl.run('!lower', 'HEY you')
41 |         assert r[0] == 'hey you'
42 | 


--------------------------------------------------------------------------------
/aipl/repl.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import readline
 3 | import traceback
 4 | 
 5 | from aipl import parse, Table, AIPLException
 6 | 
 7 | 
 8 | def repl(aipl, inputs:list[Table]):
 9 |     'Standard Read-Eval-Print-Loop (REPL)'
10 |     import rich
11 |     def completer(text, state):
12 |         ops = list(aipl.operators.keys()) + list(aipl.aliases.keys())
13 |         text = text[1:]
14 |         results = [x for x in ops if x.startswith(text)]
15 |         if results:
16 |             return "!" + results[state]
17 | 
18 |     readline.parse_and_bind("tab: complete")
19 |     readline.set_completer_delims(' \n=')
20 |     readline.set_completer(completer)
21 | 
22 |     while True:
23 |         sys.stdout.flush()
24 |         try:
25 |             cmdtext = input('> ')
26 |         except KeyboardInterrupt as e:
27 |             break  # exit on ^C
28 |         except EOFError:
29 |             print("\n")
30 |             continue
31 | 
32 |         if not cmdtext.strip():  # do nothing empty line
33 |             continue
34 | 
35 |         try:
36 |             cmds = parse(cmdtext)
37 |             op = aipl.get_op(cmds[0].opname)
38 |             if op.needs_prompt:
39 |                 while True:
40 |                     line = sys.stdin.readline()
41 |                     if not line.strip():
42 |                         break
43 |                     cmdtext += '\n' + line
44 | 
45 |             inputs = aipl.run(cmdtext, inputs)
46 |             rich.print(inputs[-1])
47 |         except AIPLException as e:
48 |             print(e.args[0])
49 |         except Exception as e:
50 |             traceback.print_exc()
51 | 


--------------------------------------------------------------------------------
/examples/cluster.aipl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bin/aipl
 2 | 
 3 | # Cluster embeddings in database into 10 categories; summarize a random sample
 4 | # of each; query GPT for a theme and subreddit; store all in db.
 5 | 
 6 | # the input is some urls
 7 | !name sourceurl
 8 | !read
 9 | !extract-links baseurl={sourceurl}
10 | !name url
11 | 
12 | # get only urls below the given url and remove the #fragment
13 | !match ^{sourceurl}
14 | !filter
15 | !url-defrag>mainurl
16 | 
17 | !read
18 | !extract-text>text
19 | 
20 | !split maxsize=4000
21 | !take 1
22 | !ravel
23 | 
24 | # get an embedding for each
25 | # XXX: how to exclude for URLs we've already done?
26 | !llm-embedding model=text-embedding-ada-002
27 | 
28 | !cluster n=10
29 | !name label
30 | 
31 | !columns url=mainurl text=text label=label
32 | 
33 | !dbdrop categorized
34 | !dbinsert categorized
35 | 
36 | # make a 2-column table (first being the category label, second being the list of rows)
37 | !groupby label
38 | 
39 | # find topic and subreddit
40 | !sample n=3
41 | 
42 | # split the text extraction from above into LLM-sized chunks
43 | !split <text
44 | 
45 | # take the first chunk
46 | !take 1
47 | 
48 | # prompt engineer that shit
49 | !format
50 | 
51 | URL: {url}
52 | """
53 | {_}
54 | """
55 | 
56 | !join
57 | !format
58 | What specific topic would these webpages fall under?
59 | Suggest a theme of only a few words, suitable as a title for the list containing these pages.
60 | Also suggest the subreddit that would most welcome the content.
61 | 
62 | Summaries:
63 | 
64 | {_}
65 | 
66 | Theme and subreddit:
67 | 
68 | !llm max-tokens=64 model=gpt-3.5-turbo
69 | !print
70 | 


--------------------------------------------------------------------------------
/aipl/ops/sort.py:
--------------------------------------------------------------------------------
 1 | from copy import copy
 2 | 
 3 | from aipl import defop, Table, alias, Column
 4 | 
 5 | 
 6 | @defop('sort', 1.5, 1.5)
 7 | def op_sort(aipl, t:Table, *args):
 8 |     'Sort the table by the given columns.'
 9 |     ret = copy(t)
10 |     cols = [t.get_column(cname) for cname in args] or [t.current_col]
11 |     ret.rows = sorted(t.rows, key=lambda r: tuple(c.get_value(r) for c in cols))
12 |     return ret
13 | 
14 | 
@defop('grade-up', 1.5, 1)
def op_grade_up(aipl, t:Table, *args):
    'Return the row indices ordered so that their values are ascending.'
    values = t.values
    indices = list(range(len(values)))
    indices.sort(key=lambda i: values[i])
    return indices
20 | 
@defop('incr', 1.5, 1.5)
def op_incr(aipl, t:Table, step:int= 1, base:int=1, *args):
    'Add column `incr` that starts at `base` and increases by `step` for each row in `t`.'
    ret = copy(t)
    ret.rows = []
    for position, row in enumerate(t.rows):
        # the row dict is updated in place and shared with the input table
        row['incr'] = base + position*step
        ret.rows.append(row)

    ret.add_column(Column('incr'))

    return ret
34 | 
35 | 
36 | def test_sort(aipl):
37 |     r = aipl.run_test('!sort', 3,1,4,2,8,5)
38 |     assert r.values == [1,2,3,4,5,8]
39 | 
40 | def test_grade_up(aipl):
41 |     r = aipl.run_test('!grade-up', 3,1,4,2,8,5)
42 |     assert r.values == [1, 3, 0, 2, 5, 4]
43 | 
44 | def test_incr(aipl):
45 |     r = aipl.run_test('!incr', 3, 1, 4, 2, 8, 5)
46 |     assert r.values == [1, 2, 3, 4, 5, 6]
47 | 
48 | alias('order-by', 'sort')
49 | 


--------------------------------------------------------------------------------
/aipl/ops/python.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import traceback
 3 | import operator
 4 | import time
 5 | 
 6 | from aipl import defop, InnerPythonException
 7 | 
 8 | 
 9 | def inner_exec(obj, *args, **kwargs):
10 |     try:
11 |         return exec(obj, *args, **kwargs)
12 |     except Exception as e:
13 |         exc_type, exc_value, exc_traceback = sys.exc_info()
14 |         tb = traceback.extract_tb(exc_traceback)
15 |         raise InnerPythonException(exc_value, tb[1:], obj)
16 | 
17 | 
18 | def inner_eval(obj, *args, **kwargs):
19 |     try:
20 |         return eval(obj, *args, **kwargs)
21 |     except Exception as e:
22 |         exc_type, exc_value, exc_traceback = sys.exc_info()
23 |         tb = traceback.extract_tb(exc_traceback)
24 |         raise InnerPythonException(exc_value, tb[1:], obj)
25 | 
26 | 
27 | @defop('python',0,None)
28 | def op_python(aipl, prompt:str=''):
29 |     'exec() `prompt` as Python toplevel statements, with aipl.globals as the globals dict.'
30 |     inner_exec(prompt, aipl.globals)
31 | 
32 | 
33 | @defop('python-expr', 0.5, 0, rankin2=0)
34 | def op_python_expr(aipl, row, expr:str):
35 |     'eval() the Python expression `expr` with aipl.globals as globals and `row` as locals; return its value.'
36 |     return inner_eval(expr, aipl.globals, row)
37 | 
38 | 
39 | @defop('python-input', 0, 1.5)
40 | def op_python_input(aipl, prompt:str=''):
41 |     'eval() the Python expression `prompt` (globals: aipl.globals) and use the result as toplevel input table.'
42 |     return inner_eval(prompt, aipl.globals)
43 | 
44 | defop(int)  # register selected Python builtins and operator functions as ops
45 | defop(float)
46 | defop(repr)
47 | defop(range, rankout='vector')
48 | defop(sum, rankin='vector')
49 | defop(operator.add)
50 | defop(operator.sub)
51 | defop(operator.mul)
52 | defop(operator.truediv)
53 | defop(len, rankin='vector')  # len() of a whole vector
54 | defop(len, opname='strlen')  # the same builtin under a second op name; no rankin given -- presumably applied per value (confirm in defop)
55 | 


--------------------------------------------------------------------------------
/aipl/utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Mapping, List
 2 | from collections import ChainMap
 3 | import sys
 4 | 
 5 | 
 6 | def reprify(s) -> str:
 7 |     if isinstance(s, str):
 8 |         return s
 9 |     return repr(s)
10 | 
11 | def stderr(*args, **kwargs):
12 | #    args = [strify(x) for x in args]
13 |     args = [reprify(x) for x in args]
14 |     print(*args, file=sys.stderr, flush=True, **kwargs)
15 | 
16 | 
17 | def fmtarg(v:str, r:Mapping=None) -> str:
18 |     if isinstance(v, str):
19 |         v = v.encode('utf-8').decode('unicode-escape')
20 |         if r:
21 |             return v.format_map(r)
22 |     return v
23 | 
24 | 
25 | def fmtargs(args, contexts:List[Mapping]):
26 |     d = ChainMap(*reversed(contexts))
27 |     return [fmtarg(arg, d) for arg in args if not isinstance(arg, str) or not arg.startswith('<')]
28 | 
29 | 
30 | def fmtkwargs(kwargs, contexts:List[Mapping]):
31 |     d = ChainMap(*contexts)
32 |     return {k:fmtarg(v, d) for k,v in kwargs.items()}
33 | 
34 | 
35 | class AttrDict(dict):
36 |     def __getattr__(self, k):
37 |         if k not in self:
38 |             return None
39 |         return self[k]
40 | 
41 |     def __setattr__(self, k, v):
42 |         self[k] = v
43 | 
44 | 
45 | def strify(x, maxlen=0):
46 |     if isinstance(x, (list, tuple)):
47 |         if not x:
48 |             return '[]'
49 |         return f'[({len(x)}) {strify(x[0], maxlen=15)}]'
50 |     if isinstance(x, dict):
51 |         return '{' + ' '.join(f'{k}={strify(v, maxlen=15)}' for k, v in x.items()) + '}'
52 |     x = str(x).replace("\n", '\\n')
53 |     if maxlen and len(x) > maxlen:
54 |         x = x[:maxlen] + f'...({len(x)} bytes)'
55 |     return x
56 | 


--------------------------------------------------------------------------------
/aipl/ops/extract.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from urllib.parse import urljoin
 3 | 
 4 | from aipl import defop
 5 | 
 6 | 
 7 | @defop('extract-text-all', 0, 0)
 8 | def op_extract_text_all(aipl, html:str, **kwargs) -> str:
 9 |     'Extract all text from HTML'
10 |     from bs4 import BeautifulSoup
11 |     soup = BeautifulSoup(html, 'html.parser')
12 |     return soup.get_text()
13 | 
14 | 
15 | @defop('extract-text', 0, 0)
16 | def op_extract_text(aipl, html:str, **kwargs) -> str:
17 |     'Extract meaningful text from HTML'
18 |     parms = dict(include_comments=False,
19 |                  include_tables=False,
20 |                  no_fallback=True)
21 |     parms.update(kwargs)
22 | 
23 |     import trafilatura
24 |     content = trafilatura.extract(html, **parms)
25 |     if content is None:
26 |         return ''
27 |     else:
28 |         return content
29 | 
30 | 
31 | @defop('extract-links', 0, 1.5, outcols='linktext title href')
32 | def op_extract_links(aipl, html:str, baseurl='', **kwargs) -> List[dict]:
33 |     'Extract (linktext, title, href) from <a> tags in HTML'
34 |     if not html:
35 |         return
36 | 
37 |     from bs4 import BeautifulSoup
38 |     soup = BeautifulSoup(html, 'html.parser')
39 |     for link in soup.find_all('a', href=True):
40 |         href = link['href']
41 |         if baseurl:
42 |             href = urljoin(baseurl, href)
43 |         yield dict(linktext=link.text, title=link.get('title', ''), href=href)
44 | 
45 | 
46 | @defop('extract-selector', 0, 1)
47 | def _(aipl, html:str, selector:str) -> List[dict]:
48 |     from bs4 import BeautifulSoup
49 |     soup = BeautifulSoup(html, 'html.parser')
50 |     for el in soup.select(selector):
51 |         yield str(el)
52 | 


--------------------------------------------------------------------------------
/examples/summarize.aipl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bin/aipl
 2 | 
 3 | # fetch url, split webpage into chunks, summarize each chunk, then summarize the summaries.
 4 | 
 5 | # the inputs are urls
 6 | !read
 7 | 
 8 | # extract text from html
 9 | !extract-text
10 | 
11 | # split into chunks of lines that can fit in the context window
12 | !split maxsize=8000 sep=\n
13 | 
14 | # have GPT summarize each chunk
15 | !format
16 | 
17 | Please read the following section of a webpage (500-1000 words) and provide a
18 | concise and precise summary in a few sentences, optimized for keywords and main
19 | content topics. Write only the summary, and do not include phrases like "the
20 | article" or "this webpage" or "this section" or "the author". Ensure the tone
21 | is precise and concise, and provide an overview of the entire section:
22 | 
23 | """
24 | {_}
25 | """
26 | 
27 | !llm model=gpt-3.5-turbo
28 | 
29 | # join the section summaries together
30 | !join sep=\n-
31 | 
32 | # have GPT summarize the combined summaries
33 | 
34 | !format
35 | 
36 | Based on the summaries of each section provided, create a one-paragraph summary
37 | of approximately 100 words. Begin with a topic sentence that introduces the
38 | overall content topic, followed by several sentences describing the most
39 | relevant subsections. Provide an overview of all section summaries and include
40 | a conclusion or recommendations only if they are present in the original
41 | webpage. Maintain a precise and concise tone, and make the overview coherent
42 | and readable, while preserving important keywords and main content topics.
43 | Remove all unnecessary text like "The document" and "the author".
44 | 
45 | """
46 | {_}
47 | """
48 | 
49 | !llm model=gpt-3.5-turbo
50 | 
51 | !print
52 | 


--------------------------------------------------------------------------------
/tools/aipl.vim:
--------------------------------------------------------------------------------
 1 | if exists("b:current_syntax")
 2 |   finish
 3 | endif
 4 | 
 5 | syntax match aiplComment "^#.*$"
 6 | 
 7 | syntax region aiplString start=/^[^!]/ end=/^\ze!/ contained contains=aiplTemplateParameter,aiplComment
 8 | syntax match aiplTemplateParameter "{[^}]*}" contained
 9 | 
10 | syn match aiplDef "^!!def\s\+" contained nextgroup=aiplOperatorName
11 | syn match aiplOperatorName "[^ ]\+\n" contained nextgroup=aiplNestedOperator
12 | 
13 | syntax match aiplCommand "[^ >!][^ >]*" nextgroup=aiplRedirect contained
14 | syntax match aiplOperator /^!\+/ contained nextgroup=aiplCommand contained
15 | syntax match aiplRedirect ">" nextgroup=aiplRedirectTarget contained
16 | syntax match aiplRedirectTarget "[^ >]\+" contained nextgroup=aiplRedirect
17 | 
18 | syntax region aiplCommandRegion start=/^!/ end=/^\ze!/ contains=aiplOperator,aiplComment,aiplString skipempty
19 | 
20 | syntax region aiplDefinition start=/^!!def\ze\s/ end="^\ze!" contains=aiplNestedCommandRegion,aiplDef
21 | syntax region aiplNestedCommandRegion start=/^ !/ end=/^\ze \?!/ contained contains=aiplNestedOperator,aiplNestedString,aiplComment skipempty
22 | syntax match aiplNestedOperator /^ !\+/ contained nextgroup=aiplCommand
23 | syntax region aiplNestedString start=/^ [^!]/ end=/^\ze !/ contained contains=aiplTemplateParameter,aiplComment
24 | 
25 | highlight link aiplComment Comment
26 | highlight link aiplOperator Operator
27 | highlight link aiplNestedOperator Operator
28 | highlight link aiplRedirect Operator
29 | highlight link aiplDef Keyword
30 | highlight link aiplKeyword Keyword
31 | highlight link aiplNestedString String
32 | highlight link aiplString String
33 | highlight link aiplTemplateParameter Identifier
34 | 
35 | let b:current_syntax = "aipl"
36 | 


--------------------------------------------------------------------------------
/aipl/caching.py:
--------------------------------------------------------------------------------
 1 | from functools import wraps
 2 | 
 3 | from aipl import AIPL, stderr
 4 | 
 5 | 
 6 | def dbcache(func):
 7 |     'Decorator to persistently cache result from func(aipl, *args, *kwargs).'
 8 |     @wraps(func)
 9 |     def cachingfunc(aipl:AIPL, *args, **kwargs):
10 |         if not aipl.cache_db:
11 |             return func(aipl, *args, **kwargs)
12 | 
13 |         key = f'{args} {kwargs}'
14 |         tbl = 'cached_'+func.__name__
15 |         ret = aipl.cache_db.select(tbl, key=key)
16 |         if ret:
17 |             row = ret[-1]
18 |             if 'output' in row:
19 |                 return row['output']
20 | 
21 |             del row['key']
22 |             stderr('[using cached value]')
23 |             return row
24 | 
25 |         result = func(aipl, *args, **kwargs)
26 | 
27 |         if isinstance(result, dict):
28 |             aipl.cache_db.insert(tbl, key=key, **result)
29 |         else:
30 |             aipl.cache_db.insert(tbl, key=key, output=result)
31 | 
32 |         return result
33 | 
34 |     return cachingfunc
35 | 
36 | 
37 | def expensive(mockfunc=None):
38 |     'Decorator to persistently cache result from func(aipl, *args, **kwargs).  Use as @expensive(mock_func) where mock_func has identical signature to func and returns a compatible result during --dry-run.'
39 |     def _decorator(func):
40 |         @wraps(func)
41 |         def _wrapper(aipl:AIPL, *args, **kwargs):
42 |             if aipl.options.dry_run:
43 |                 if mockfunc:
44 |                     return mockfunc(aipl, *args, **kwargs)
45 |                 else:
46 |                     return f'<{func.__name__}({args} {kwargs})>'
47 | 
48 |             return dbcache(func)(aipl, *args, **kwargs)
49 | 
50 |         return _wrapper
51 |     return _decorator
52 | 


--------------------------------------------------------------------------------
/tools/vscode/syntaxes/aipl.tmLanguage.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "$schema": "https://raw.githubusercontent.com/martinring/tmlanguage/master/tmlanguage.json",
 3 |     "name": "aipl",
 4 |     "patterns": [
 5 |         {
 6 |             "begin": "#",
 7 |             "beginCaptures": {
 8 |                 "0": {
 9 |                     "name": "punctuation.definition.comment.aipl"
10 |                 }
11 |             },
12 |             "end": "\\n",
13 |             "name": "comment.line.number-sign.aipl",
14 |             "patterns": [
15 |                 {
16 |                     "match": "(\\bTODO\\b|\\bFIXME\\b|\\bNOTE\\b|@todo)",
17 |                     "name": "keyword.other.documentation.task.aipl"
18 |                 }
19 |             ]
20 |         },
21 |         
22 |         {
23 |             "begin": "!",
24 |             "beginCaptures": {
25 |                 "0": {
26 |                     "name": "punctuation.definition.command.aipl"
27 |                 }
28 |             },
29 |             "end": "\\s|\\n",
30 |             "name": "keyword.control.command.aipl",
31 |             "patterns": [
32 |                 {
33 |                     "begin": ">",
34 |                     "end": "\\s|$",
35 |                     "beginCaptures": {
36 |                         "0": {
37 |                             "name": "punctuation.separator.command.aipl"
38 |                         }
39 |                     },
40 |                     "endCaptures": {
41 |                         "0": {
42 |                             "name": "variable.parameter.command.aipl"
43 |                         }
44 |                     }
45 |                 }
46 |             ]
47 |         }
48 |         
49 |     ],
50 |     "repository": {},
51 |     "scopeName": "source.aipl"
52 | }
53 | 


--------------------------------------------------------------------------------
/examples/nyt-cooking.aipl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bin/aipl
 2 | 
 3 | # Simple script that takes URLs of recipes from NYT Cooking and converts them to a markdown recipe format (specifically for Obsidian)
 4 | # Output is one markdown file per recipe
 5 | 
 6 | # extract recipe name from URL, for use in file name
 7 | @defop('getname', 0.5, 0)
 8 | def getname(aipl, d):
 9 |     s = d['path'].split('-')
10 |     return ' '.join(s[1:])
11 | 
12 | # sample URLs
13 | !split>url
14 | https://cooking.nytimes.com/recipes/1019883-chicken-piccata
15 | https://cooking.nytimes.com/recipes/1013317-zucchini-and-carrot-fritters-with-yogurt-mint-dip
16 | https://cooking.nytimes.com/recipes/1022534-green-chile-chicken-tacos
17 | 
18 | !url-split
19 | !getname>name
20 | 
21 | !fetch-url <url
22 | !extract-text
23 | 
24 | # Obsidian recipe format below comes from this tutorial: https://forum.obsidian.md/t/obsidian-as-recipe-manager-and-shopping-list-tutorial/40799
25 | # NB: in order to preserve markdown formatting, a leading space is needed before the # so that aipl doesn't interpret it as a comment string
26 | !format
27 | Here is an Obsidian template I use for recipes:
28 | 
29 |  ### Recipe Name 
30 | 
31 | >Notes: 
32 | 
33 | Time:
34 | Serves:
35 | 
36 |  ### Ingredients
37 |  #ingredients 
38 | - [x] first ingredient
39 | - [x] second ingredient
40 | - [x] third ingredient
41 | 
42 | ---
43 |  #### Intro:
44 | 
45 | 
46 | ---
47 |  #### Directions
48 | 
49 | 
50 |  #### FINISH:
51 | 
52 | 
53 | And here is a poorly typed recipe that I would like you to re-format in the style of the above template please. 
54 | Please put any tips in the Notes section.
55 | Just return the reformatted recipe, no extra words.
56 | 
57 | {_}
58 | 
59 | !llm model=gpt-3.5-turbo
60 | 
61 | # tack on some metadata
62 | !format
63 | ---
64 | alias:
65 | source: {url}
66 | tags: recipe
67 | ---
68 | {_}
69 | 
70 | # and we're done!
71 | !save {name}.md


--------------------------------------------------------------------------------
/aipl/ops/xml.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from aipl import defop
 3 | 
 4 | def _xml(s):
 5 |     if not isinstance(s, str):
 6 |         return s
 7 | 
 8 | #    from bs4 import BeautifulSoup
 9 | #    return BeautifulSoup(xml, 'xml')
10 | 
11 |     from lxml import etree
12 |     root = etree.fromstring(s.encode())
13 |     for elem in root.getiterator():
14 |         # Skip comments and processing instructions,
15 |         # because they do not have names
16 |         if not (
17 |             isinstance(elem, etree._Comment)
18 |             or isinstance(elem, etree._ProcessingInstruction)
19 |         ):
20 |             # Remove a namespace URI in the element's name
21 |             elem.tag = etree.QName(elem).localname
22 | 
23 |     # Remove unused namespace declarations
24 |     etree.cleanup_namespaces(root)
25 | 
26 |     return root
27 | 
28 | 
29 | class XMLStringableElement:
30 |     def __init__(self, e):
31 |         self._element = e
32 |     def __getattr__(self, k):
33 |         return getattr(self._element, k)
34 |     def __str__(self):
35 |         return getattr(self._element, 'text', '') or ''
36 | 
37 | def StringifiableObject(s):
38 |     'create pass-through wrapper to stringify with s.text if available'
39 |     if not hasattr(s, 'text'):
40 |         return s
41 |     return XMLStringableElement(s)
42 | 
43 | 
44 | @defop('xml-xpath', 0, 1)
45 | def op_xml_xpath(aipl, v:str, *args) -> List['XmlElement']:
46 |     "Return a vector of XMLElements from parsing entries in value."
47 |     xml = _xml(v)
48 |     for arg in args:
49 |         for entry in xml.xpath(arg):
50 |             yield StringifiableObject(entry)
51 | 
52 | 
53 | @defop('xml-xpaths', 0, 0.5)
54 | def op_xml_xpaths(aipl, v:str, **kwargs) -> List['XmlElement']:
55 |     "Return a vector of XMLElements from parsing entries in value; kwargs become column_name=xpath."
56 |     xml = _xml(v)
57 |     ret = {}
58 |     for varname, xpath in kwargs.items():
59 |         ret[varname] = StringifiableObject(xml.xpath(xpath)[0])
60 |     return ret
61 | 


--------------------------------------------------------------------------------
/aipl/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from dataclasses import dataclass
 3 | 
 4 | @dataclass
 5 | class Error:
 6 |     'A cascading error that does not break the pipeline'
 7 |     linenum:int = 0  # line number shown in the message
 8 |     opname:str = ''  # op name shown in the message (printed with a leading "!")
 9 |     exception:Exception = None  # the underlying exception, printed after the location
10 | 
11 |     def __str__(self):
12 |         return f'AIPL Error (line {self.linenum} !{self.opname}): {self.exception}'
13 | 
14 |     def __getitem__(self, k):
15 |         return self  # indexing an Error with any key gives back the same Error
16 | 
17 | 
18 | class AIPLCompileError(Exception):
19 |     'A nice error message during compilation to print to stderr and exit without a stacktrace.'
20 | 
21 | 
22 | class AIPLException(Exception):
23 |     'A nice error message to print to stderr and exit without a stacktrace.'
24 | 
25 | 
26 | class InnerPythonException(AIPLException):
27 |     'A nice error message when inner Python exec/eval raises.'
28 |     def __str__(self):
29 |         exc, tb, codestr = self.args
30 |         r = []
31 |         if hasattr(self, 'command'):  # added by other error handling
32 |             linenum = self.command.linenum
33 |             r.append(f'In "!{self.command.opname}" (line {self.command.linenum}):')
34 |         else:
35 |             linenum = 0
36 | 
37 |         for frame in tb:
38 |             r.append(f'Line ~{frame.lineno+linenum}, in {frame.name}')
39 |             r.append('    ' + codestr.splitlines()[frame.lineno-1])
40 | 
41 |         r.append(f'{type(exc).__name__}: {exc}')
42 | 
43 |         return '\n'.join(r)
44 | 
45 | 
46 | class UserAbort(BaseException):
47 |     'UserAbort not caught by internal error handling; will always exit.  Derives from BaseException, so `except Exception` handlers do not catch it.'
48 | 
49 | 
50 | from .utils import stderr
51 | from .db import Database
52 | from .table import Table, Column, SubColumn, LazyRow
53 | from .interpreter import AIPL, defop, Command, alias
54 | from .caching import expensive, dbcache
55 | from .parser import parse
56 | from .repl import repl
57 | from .main import main
58 | 
59 | 
60 | def import_submodules(pkgname):
61 |     'Import all files below the given *pkgname*'
62 |     import pkgutil
63 |     import importlib
64 | 
65 |     m = importlib.import_module(pkgname)
66 |     for module in pkgutil.walk_packages(m.__path__):
67 |         importlib.import_module(pkgname + '.' + module.name)
68 | 
69 | 
70 | import_submodules('aipl.ops')
71 | 


--------------------------------------------------------------------------------
/tests/test-xml.aipl:
--------------------------------------------------------------------------------
 1 | # testing xml operators
 2 | 
 3 | !test-input
 4 | 
 5 |     <?xml version="1.0" encoding="UTF-8"?>
 6 |     <feed xmlns="http://www.w3.org/2005/Atom">
 7 |       <link href="http://arxiv.org/api/query?search_query%3Dcat%3Acs.AI%26id_list%3D%26start%3D1000%26max_results%3D10" rel="self" type="application/atom+xml"/>
 8 |       <title type="html">ArXiv Query: search_query=cat:cs.AI&amp;id_list=&amp;start=1000&amp;max_results=10</title>
 9 |       <id>http://arxiv.org/api/DT5MV1FVXpfuZZ1aLNCXrU8CaYU</id>
10 |       <updated>2023-05-17T00:00:00-04:00</updated>
11 |       <entry>
12 |         <id>http://arxiv.org/abs/1109.2347v1</id>
13 |         <updated>2011-09-11T20:09:48Z</updated>
14 |         <published>2011-09-11T20:09:48Z</published>
15 |         <title>Breaking Instance-Independent Symmetries In Exact Graph Coloring</title>
16 |         <summary>Code optimization and high level synthesis can be posed as constraint satisfaction and optimization problems, such as graph coloring used in register allocation.</summary>
17 |         <author>
18 |           <name>I. L. Markov</name>
19 |         </author>
20 |         <arxiv:doi xmlns:arxiv="http://arxiv.org/schemas/atom">10.1613/jair.1637</arxiv:doi>
21 |         <link title="doi" href="http://dx.doi.org/10.1613/jair.1637" rel="related"/>
22 |         <arxiv:journal_ref xmlns:arxiv="http://arxiv.org/schemas/atom">Journal Of Artificial Intelligence Research, Volume 26, pages 289-322, 2006</arxiv:journal_ref>
23 |         <link href="http://arxiv.org/abs/1109.2347v1" rel="alternate" type="text/html"/>
24 |         <link title="pdf" href="http://arxiv.org/pdf/1109.2347v1" rel="related" type="application/pdf"/>
25 |         <arxiv:primary_category xmlns:arxiv="http://arxiv.org/schemas/atom" term="cs.AI" scheme="http://arxiv.org/schemas/atom"/>
26 |         <category term="cs.AI" scheme="http://arxiv.org/schemas/atom"/>
27 |       </entry>
28 |     </feed>
29 | 
30 | !xml-xpath //entry/link[@type="application/pdf"]
31 | !xml-xpaths published=../published updated=../updated title=../title href=@href
32 | 
33 | !ravel
34 | !test-json
35 | 
36 | [{
37 |     "published": "2011-09-11T20:09:48Z",
38 |     "updated": "2011-09-11T20:09:48Z",
39 |     "title": "Breaking Instance-Independent Symmetries In Exact Graph Coloring",
40 |     "href": "http://arxiv.org/pdf/1109.2347v1"
41 | }]
42 | 


--------------------------------------------------------------------------------
/examples/hanukkah-of-data-5783.aipl:
--------------------------------------------------------------------------------
 1 | @defop('sql', 0, 1.5)
 2 | def sql(aipl, q:str, db:str):
 3 |     import sqlite3
 4 |     con = sqlite3.connect(db)
 5 |     con.row_factory = sqlite3.Row
 6 |     return [
 7 |         {k:r[k] for k in r.keys()}
 8 |             for r in con.execute(q).fetchall()
 9 |     ]
10 | 
11 | 
12 | # !download < https://hanukkah.bluebird.sh/5783/noahs-sqlite.zip
13 | 
14 | !split>model>>models
15 | gpt-3.5-turbo
16 | gpt-4
17 | 
18 | !read
19 | https://hanukkah.bluebird.sh/5783/0
20 | !extract-text
21 | !format
22 | This is the intro to a set of database puzzles:
23 | """
24 | {_}
25 | """
26 | Give only the answer to the puzzle question, without any surrounding text.
27 | 
28 | !cross <<models
29 | 
30 | !llm model={model}
31 | !format
32 | unzip -f -P {_} noahs-sqlite.zip
33 | !sh
34 | 
35 | 
36 | # read all the puzzles
37 | !split>puznum sep=" "
38 | 1 2 3 4 5 6 7 8
39 | !format
40 | https://hanukkah.bluebird.sh/5783/{_}
41 | !read
42 | !extract-text
43 | 
44 | !format >text >>text
45 | I have a sqlite database. Here are the first few rows from each of the tables:
46 | 
47 | - customers
48 | customerid,name,address,citystatezip,birthdate,phone
49 | 1001,Jack Quinn,201 E Park St,"Los Angeles, CA 91343",1960-05-14,805-287-8515
50 | 1002,David Powell,224C Tysens Ln,"Staten Island, NY 10306",1978-04-04,516-768-1652
51 | 1003,Carrie Green,1608 W 53rd Way,"Tampa, FL 33614",1969-01-21,727-209-0470
52 | 
53 | - orders
54 | orderid,customerid,ordered,shipped,items,total
55 | 1001,4308,2017-01-31 00:32:19,2017-01-31 07:15:00,,25.52
56 | 1002,11683,2017-01-31 00:58:31,2017-01-31 18:00:00,,35.33
57 | 1003,5676,2017-01-31 01:34:40,2017-01-31 09:00:00,,30.79
58 | 
59 | - products
60 | sku,desc,wholesale_cost
61 | DLI0002,Smoked Whitefish Sandwich,9.33
62 | PET0005,"Vegan Cat Food, Turkey & Chicken",4.35
63 | HOM0018,Power Radio (red),21.81
64 | 
65 | - orders_items
66 | orderid,sku,qty,unit_price
67 | 1001,COL0820,1,25.52
68 | 1002,TOY8907,1,12.92
69 | 1002,KIT5813,1,7.99
70 | 
71 | The schema of the sqlite database exactly matches the schema above.
72 | 
73 | Here is a database puzzle to be solved using the above schema.
74 | 
75 | """
76 | {_}
77 | """
78 | 
79 | Give only a SQLite SELECT query to answer the question.
80 | 
81 | !cross <<models <<text
82 | !llm>query model={model}
83 | 
84 | #!sql db=noahs.sqlite
85 | !format
86 | ---
87 | {puznum}
88 | {query}
89 | 
90 | {_}
91 | ---
92 | !save hod-{puznum}.sql
93 | 


--------------------------------------------------------------------------------
/aipl/test_core.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from collections import defaultdict
 3 | import string
 4 | 
 5 | import pytest
 6 | 
 7 | from .interpreter import defop
 8 | from .table import Table, LazyRow
 9 | 
10 | 
11 | @defop('parse-keyval', 0, 0.5)
12 | def op_parse_keyval(aipl, s:str) -> dict:
13 |     k, v = s.split('=', maxsplit=1)
14 |     return {k:v}
15 | 
16 | @defop('combine-dict', 1.5, 0.5)
17 | def op_combine_dict(aipl, t:Table) -> dict:
18 |     ret = {}
19 |     for row in t:
20 |         ret.update(row._asdict())
21 |     return ret
22 | 
23 | @defop('cases', 0, 0.5)
24 | def op_cases(aipl, v:str) -> dict:
25 |     return dict(upper=v.upper(), lower=v.lower())
26 | 
27 | @defop('lowercase', 0, 0)
28 | def op_lowercase(aipl, v:str) -> str:
29 |     return v.lower()  # test helper op: one scalar in, one scalar out
30 | 
31 | @defop('uppercase', 0, 0)
32 | def op_uppercase(aipl, v:str) -> str:
33 |     return v.upper()  # test helper op: one scalar in, one scalar out
34 | 
35 | @defop('lettertypes', 0, 1.5, outcols='letters digits')
36 | def op_letters(aipl, v:str) -> List[dict]:
37 |     'Yield dict(letters=, digits=) for each word in input.'
38 |     for word in v.split():
39 |         letters = defaultdict(int)
40 |         for c in word:
41 |             if c in string.ascii_letters:
42 |                 letters['letters'] += 1
43 |             elif c in string.digits:
44 |                 letters['digits'] += 1
45 |         yield letters
46 | 
47 | def test_lowercase(aipl):
48 |     # scalar to scalar
49 |     # 2 rows; single column
50 |     t = aipl.run_test('!split !lowercase !join', 'A b C', 'DeF')
51 |     assert len(t.rows) == 2
52 |     assert t[0].value == 'a b c'
53 |     assert t[1].value == 'def'
54 | 
55 | def test_cases(aipl):
56 |     t = aipl.run_test('!split !cases !join', 'A b C', 'DeF')
57 |     assert len(t.rows) == 2
58 |     assert t[0].value == 'a b c'
59 |     assert t[1].value == 'def'
60 | 
61 | 
62 | def test_op_dicts(aipl):
63 |     'test ops of rankin/rankout == 0.5'
64 |     t = aipl.run_test('!split sep=, !parse-keyval !combine-dict', 'a=1,b=2,c=3')
65 |     assert t._asdict()[0] == dict(a='1', b='2', c='3')
66 | 
67 | 
68 | def test_col_reference(aipl):
69 |     t = aipl.run_test('!split sep=, !parse-keyval !combine-dict !format\n{first} {last}', 'last=smith,first=mike')
70 |     assert t[0].value == 'mike smith'
71 | 
72 | 
73 | def test_out_table_dict(aipl):
74 |     'Tests when a rankout of 1.5 is returned a dict.'
75 |     r = aipl.run_test('!lettertypes', '1abc cd23 de53')
76 |     t = r[0].value
77 |     assert set(t.colnames) == set(['digits', 'letters'])
78 |     assert t[0]['digits'] == 1 and t[0]['letters'] == 3
79 | 


--------------------------------------------------------------------------------
/about/vision.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # A Toolmaker's Vision
 3 | 
 4 | With a simple framework, a common connection interface, and the right set of components, the work becomes relatively easy:
 5 | 
 6 |     - the work being done is only the *essence* of the work to be done
 7 |        - no unnecessary complexity
 8 |        - no impedance mismatch between components
 9 | 
10 |     - the work is more than easy--it is delightful
11 |        - some of this is just raw "oh thank god yes this is what software should be like"
12 |        - some is a surprising depth, an invitation to explore that will often be rewarded
13 |        - some is a nostalgic [feeling of the computer](bluebird.sh/feeling)
14 | 
15 |     - the work is so easy and delightful that it becomes playful.
16 | 
17 | Not only can more work get done faster, but a *whole new level* of possibility opens up.
18 | 
19 | Like the piano, or the typewriter, or the spreadsheet.
20 | 
21 | ## The ladder of computing
22 | 
23 | The progression of computational tools goes from calculators, to spreadsheets, to notebooks, to scripts, to programs, to systems.
24 | Each level gives you more power and flexibility, and also needs more mana and a higher skill level to use.
25 | The lower you go, the more it's geared towards an individual user; the higher you go, the more towards users operating as part of a larger organization.
26 | 
27 | In the realm of AI, ChatGPT is a calculator: you can run only 1 calculation at a time.
28 | If you have a one-off question for GPT, you can just open the website and type it in, and they handle some niceties for you.
29 | 
30 | But if you keep coming back and pasting in a prompt, or you want to run the same prompt with madlibs or a mail merge or across a range of temperatures, or you have to fetch a page from the web, or you have to split the text up so it fits in the context window...you're going to want to use the API (or maybe another LLM).
31 | 
32 | But you have to write code to use the API.  If your use case is very simple or prescribed, someone may have written the code such that you can use it as an existing program or service.  But for anything requiring even a bit of customization outside of that, you would have to at least use a notebook (notebooks aren't pure text and can be unwieldy), or graduate to a script.
33 | 
34 | Python has grown into a huge language, and is no longer at the 'script' level for data processing tasks (though it is easier than doing it in Rust!).  Even if the libraries to do what you want already exist, you still need a fair amount of programming experience and skill to make it happen.
35 | 
36 | AIPL is intended to be at the script level for data processing and AI.
37 | 


--------------------------------------------------------------------------------
/aipl/ops/json.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Provides !json which converts Table into json blob,
 3 | and !json_parse which converts json blob into row.
 4 | '''
 5 | 
 6 | import json
 7 | 
 8 | from aipl import defop, Table, Column
 9 | 
10 | 
11 | class _jsonEncoder(json.JSONEncoder):
12 |     def default(self, obj):
13 |         return str(obj)
14 | 
15 | 
@defop('json', 100, 0)
def op_json(aipl, t:Table, indent:int = None) -> str:
    'Convert Table into a json blob.'
    # values json cannot encode natively are stringified by _jsonEncoder;
    # `indent` is handed straight to the encoder (None -> compact single line)
    return json.dumps(t._asdict(), cls=_jsonEncoder, indent=indent)
21 | 
22 | 
23 | def _json_find(v:dict|list|int|float|str, args):
24 |     if not args:
25 |         yield v
26 |     elif isinstance(v, (str, int, float)):
27 |         yield v
28 |     elif isinstance(v, (list, tuple)):
29 |         for item in v:
30 |             yield from _json_find(item, args)
31 |     elif isinstance(v, dict):
32 |         for k, item in v.items():
33 |             if args and k != args[0]:
34 |                 continue
35 |             yield from _json_find(item, args[1:])
36 |     else:
37 |         raise 'error'
38 | 
39 | 
class FlatteningDict(dict):
    'dict that flattens nested dict values into "outer_inner" keys as they are assigned.'
    def __init__(self, d:dict):
        # go through __setitem__ (not dict.__init__) so every value gets flattened
        for key, value in d.items():
            self[key] = value

    def __setitem__(self, k, v):
        if not isinstance(v, dict):
            super().__setitem__(k, v)
            return

        # assigning through self[...] re-enters this method, so deeper
        # levels of nesting are flattened as well
        for subkey, subvalue in v.items():
            self[k + '_' + subkey] = subvalue
51 | 
def test_flattening_dict():
    'Nested dicts (two levels deep here) collapse into underscore-joined keys.'
    nested = {'a': {'b': 1, 'c': 2}, 'd': 4, 'e': {'f': {'g': 5}}}
    expected = {'a_b': 1, 'a_c': 2, 'd': 4, 'e_f_g': 5}
    assert FlatteningDict(nested) == expected
55 | 
def pyobj_to_table(r) -> Table|dict|int|float|str:
    '''Convert a parsed-json Python object into AIPL values.

    - None is returned unchanged.
    - A list/tuple becomes a Table with one row per element; every element
      must itself convert to a dict.  One column is added per key seen in
      any row, in first-seen order.
    - A dict becomes a FlatteningDict whose values are converted recursively.
    - str/int/float scalars are returned unchanged.
    '''
    if r is None:
        return None
    elif isinstance(r, (list, tuple)):
        # dict used as an ordered set: iterating a real set would give the
        # columns an arbitrary order that can differ from run to run
        keys = {}
        ret = Table()
        for inobj in r:
            outobj = pyobj_to_table(inobj)
            assert isinstance(outobj, dict)
            ret.rows.append(outobj)
            keys.update(dict.fromkeys(outobj))

        for k in keys:
            ret.add_column(Column(k, k))
        return ret
    elif isinstance(r, dict):
        return FlatteningDict({k:pyobj_to_table(v) for k, v in r.items()})
    else:
        assert isinstance(r, (str, int, float)), type(r)
        return r
77 | 
78 | 
@defop('json-parse', 0, 1.5)
def op_json_parse(aipl, v:str, **kwargs) -> Table:
    'Convert a json blob into a Table.'
    parsed = json.loads(v)

    if kwargs:
        # each kwarg value is a dotted key path; only the first match of the
        # first path that matches anything is converted and returned
        for findstr in kwargs.values():
            for found in _json_find(parsed, findstr.split('.')):
                return pyobj_to_table(found)
        return None  # no path matched

    # no paths given: convert the whole document, wrapping a lone object
    # so that it becomes a one-row table
    if isinstance(parsed, dict):
        parsed = [parsed]
    return pyobj_to_table(parsed)
92 | 


--------------------------------------------------------------------------------
/aipl/ops/debug.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | import sys
 3 | 
 4 | from aipl import defop, LazyRow, UserAbort, Table, AIPL, Command
 5 | 
 6 | 
 7 | @defop('option', None, None)
 8 | def op_option(aipl, **kwargs):
 9 |     'Set option=value.'
10 |     for k, v in kwargs.items():
11 |         aipl.options[k] = v
12 | 
13 | 
@defop('debug', None, None)
def op_debug(aipl, *args):
    'set debug flag and call breakpoint() before each command'
    # only the flag is set here; positional args are accepted and ignored.
    # NOTE(review): the breakpoint() behavior is presumably implemented by
    # whatever reads options.debug -- not visible in this module.
    aipl.options.debug = True
18 | 
def _vd_singlestep(aipl, cmd:Command, *inputs:List[LazyRow]):
    '''Open the current inputs in VisiData so the user can inspect them.

    Installed on AIPL as `step_vd` at the bottom of this module.  Each input
    is converted with `_asdict()` and shown in one sheet; the command is
    rendered as "[line N] !opname args kwargs" and stored on the sheet.
    Pressing Q in the sheet raises UserAbort.
    '''
    import visidata  # imported lazily, only when this step function runs
    @visidata.VisiData.api
    def uberquit(vd):
        raise UserAbort('user abort')

    inputs = list(r._asdict() for r in inputs)
    sheet = visidata.PyobjSheet('current_input', source=inputs)
    # plain string, not an f-string: presumably expanded later by VisiData -- TODO confirm
    sheet.help = '{sheet.recentcmd}'
    argstr = ' '.join(str(x) for x in cmd.args)
    kwargstr = ' '.join(f'{k}={v}' for k, v in cmd.kwargs.items())
    sheet.recentcmd = f'[line {cmd.linenum}] !' + ' '.join([cmd.opname, argstr, kwargstr])
    sheet.addCommand('Q', 'quit-really', 'uberquit()')
    visidata.vd.run(sheet)
33 | 
34 | 
def stderr_rich(*args, **kwargs):
    '''Print `args` to stderr using rich.

    Keyword arguments are forwarded to `rich.print` (e.g. `sep`, `end`).
    Callers in this module pass their own **kwargs through, which used to
    raise TypeError because this function accepted positional args only.
    '''
    import rich  # imported lazily so rich is only needed when actually printing
    rich.print(*args, file=sys.stderr, **kwargs)
38 | 
39 | 
@defop('pp', 100, None)
def op_rich(aipl, t:Table, *args, **kwargs):
    'Pretty-print the whole table to stderr.'
    # the table is printed first, followed by any extra positional arguments
    stderr_rich(t, *args, **kwargs)
44 | 
def install_rich(aipl, *args):
    'Install a pre_command hook on the AIPL class that prints the input table and the command to stderr.'
    import rich

    # the default Table() is created once, here, and is what gets printed
    # whenever the hook is called without a table
    def _print_step(aipl, cmd, t=Table(), *args):
        return stderr_rich(t, cmd)

    AIPL.pre_command = _print_step
48 | 
49 | 
def _rich_table(t:Table, console, console_options):
    '''Render `t` as a rich table for console display (used as Table.__rich_console__).

    Shows at most 3 rows; long strings are cut at 280 characters.  Returns a
    one-element list holding the rich table.
    '''
    import rich
    import rich.table

    maxrows = 3
    maxstrlen = 280

    out = rich.table.Table(show_header=True,
                           row_styles=['', 'bold'],
                           header_style="bold magenta")

    # hidden columns are left out, except for the table's current column
    shown = [col.name for col in t.columns
             if not col.hidden or col is t.current_col]
    for name in shown:
        out.add_column(name)

    for rownum, row in enumerate(t):
        if rownum >= maxrows:
            # one summary row stands in for everything past the limit
            out.add_row('[... %s more rows ...]' % (len(t) - maxrows))
            break

        cells = []
        for name in shown:
            value = row[name]
            if not isinstance(value, (Table, str)):
                value = str(value)  # nested Tables are handed to rich as-is
            if isinstance(value, str) and len(value) > maxstrlen:
                value = value[:maxstrlen] + ' [...]'
            cells.append(value)
        out.add_row(*cells)

    return [out]
83 | 
84 | 
def _rich_command(cmd:Command, console, console_options):
    'Render a Command for rich as its str() form (used as Command.__rich_console__).'
    return [str(cmd)]
87 | 
88 | 
# Importing this module attaches the renderers and step functions defined above:
# rich rendering for Table and Command objects...
Table.__rich_console__ = _rich_table
Command.__rich_console__ = _rich_command
# ...and the step_rich / step_vd attributes on the AIPL class.
AIPL.step_rich = install_rich
AIPL.step_vd = _vd_singlestep
93 | 


--------------------------------------------------------------------------------
/aipl/ops/metrics.py:
--------------------------------------------------------------------------------
 1 | from aipl.table import Table
 2 | from aipl import defop, LazyRow
 3 | import numpy as np
 4 | 
 5 | def _is_int(val):
 6 |     try:
 7 |         int(val)
 8 |         return True
 9 |     except ValueError:
10 |         return False
11 | 
def _to_np_int_array(t:Table, colname:str) -> np.array:
    'Collect column `colname` of `t` into a numpy array, with np.nan wherever the value is not an integer.'
    values = []
    for row in t:
        raw = row[colname]
        values.append(int(raw) if _is_int(raw) else np.nan)
    return np.array(values)
15 | 
16 | def _true_positives(predictions:np.array, true_values:np.array) -> float:
17 |     return ((predictions == 1) & (true_values == 1)).sum()
18 | 
19 | def _true_negatives(predictions:np.array, true_values:np.array) -> float:
20 |     return ((predictions == 0) & (true_values == 0)).sum()
21 | 
22 | def _false_positives(predictions:np.array, true_values:np.array) -> float:
23 |     return ((predictions == 1) & (true_values == 0)).sum()
24 | 
25 | def _false_negatives(predictions:np.array, true_values:np.array) -> float:
26 |     return ((predictions == 0) & (true_values == 1)).sum()
27 | 
def _recall(predictions:np.array, true_values:np.array) -> float:
    '''Return recall: TP / (TP + FN), the fraction of actual positives that were predicted positive.

    The previous body returned the fraction of matching positions, which is
    accuracy rather than recall.  As with _precision, a zero denominator is
    left to numpy division (no special-casing here).
    '''
    TP = _true_positives(predictions, true_values)
    FN = _false_negatives(predictions, true_values)
    return TP / (TP+FN)
31 | 
def _precision(predictions:np.array, true_values:np.array) -> float:
    'Return precision: the share of positive predictions that are true positives.'
    hits = _true_positives(predictions, true_values)
    false_alarms = _false_positives(predictions, true_values)
    predicted_positive = hits + false_alarms
    return hits / predicted_positive
36 | 
def _balanced_accuracy(predictions:np.array, true_values:np.array, add_one_smoothing:bool) -> float:
    '''Return the mean of the true-positive rate and the true-negative rate.

    With `add_one_smoothing`, 1 is added to both the numerator and the
    denominator of each rate.
    '''
    tp = _true_positives(predictions, true_values)
    tn = _true_negatives(predictions, true_values)
    fp = _false_positives(predictions, true_values)
    fn = _false_negatives(predictions, true_values)

    bump = 1 if add_one_smoothing else 0
    tpr = (tp + bump) / (tp + fn + bump)
    tnr = (tn + bump) / (tn + fp + bump)
    return (tpr + tnr) / 2
49 | 
@defop('metrics-accuracy', 1.5, 0)
def op_accuracy(aipl, t:Table, predictions_colname:str, true_values_colname:str, add_one_smoothing:bool=None) -> float:
    'Balanced accuracy of the 0/1 values in `predictions_colname` against `true_values_colname`.'
    actual = _to_np_int_array(t, true_values_colname)
    predicted = _to_np_int_array(t, predictions_colname)
    # smoothing is enabled only by the literal string 'True'
    smooth = add_one_smoothing == 'True'
    return _balanced_accuracy(predicted, actual, smooth)
55 | 
@defop('metrics-precision', 1.5, 0)
def op_precision(aipl, t:Table, predictions_colname:str, true_values_colname:str, add_one_smoothing:bool=None) -> float:
    'Precision of the 0/1 values in `predictions_colname` against `true_values_colname`.'
    # add_one_smoothing is accepted but not used by this metric
    actual = _to_np_int_array(t, true_values_colname)
    predicted = _to_np_int_array(t, predictions_colname)
    return _precision(predicted, actual)
61 | 
@defop('metrics-recall', 1.5, 0)
def op_recall(aipl, t:Table, predictions_colname:str, true_values_colname:str, add_one_smoothing:bool=None) -> float:
    'Recall of the 0/1 values in `predictions_colname` against `true_values_colname`.'
    # Previously this function was also named op_precision, rebinding the
    # module-level name of the precision operator defined just above.
    # add_one_smoothing is accepted but not used by this metric.
    true_values = _to_np_int_array(t, true_values_colname)
    predictions = _to_np_int_array(t, predictions_colname)
    return _recall(predictions, true_values)


--------------------------------------------------------------------------------
/aipl/db.py:
--------------------------------------------------------------------------------
  1 | from functools import cached_property
  2 | import sys
  3 | import json
  4 | import sqlite3
  5 | 
  6 | from .utils import AttrDict
  7 | 
  8 | 
  9 | def dict_factory(cursor, row):
 10 |     return AttrDict((k, v) for (k, *_), v in zip(cursor.description, row))
 11 | 
 12 | 
 13 | def sqlite_to_pyobj(v, t:str):
 14 |     if t == 'JSON':
 15 |         return json.loads(v)
 16 |     return v
 17 | 
 18 | 
 19 | def pyobj_to_sqlite(v):
 20 |     if isinstance(v, (dict, list, tuple)):
 21 |         return json.dumps(v)
 22 |     return v
 23 | 
 24 | 
 25 | def sqlite_type(v):
 26 |     if isinstance(v, int): return 'INTEGER'
 27 |     if isinstance(v, float): return 'REAL'
 28 |     if isinstance(v, (dict, list, tuple)): return 'JSON'
 29 |     return 'TEXT'
 30 | 
 31 | 
 32 | class Database:
 33 |     def __init__(self, dbfn):
 34 |         self.dbfn = dbfn
 35 |         self.tables = {}  # tablename -> { colname -> { .type:str, ... } }
 36 | 
 37 |     @cached_property
 38 |     def con(self):
 39 |         con = sqlite3.connect(self.dbfn)
 40 |         con.row_factory = dict_factory
 41 |         return con
 42 | 
 43 |     def __enter__(self):
 44 |         return self
 45 | 
 46 |     def __exit__(self, type, value, tb):
 47 |         if not tb:
 48 |             self.con.commit()
 49 |         return False
 50 | 
 51 |     def get_table_info(self, tblname:str):
 52 |         if tblname not in self.tables:
 53 |             tinfo = self.query(f'PRAGMA table_info("{tblname}")')
 54 |             if not tinfo:
 55 |                 return {}
 56 | 
 57 |             self.tables[tblname] = {c['name']:c for c in tinfo}
 58 | 
 59 |         return self.tables[tblname]
 60 | 
 61 |     def insert(self, tblname, **kwargs):
 62 |         if tblname not in self.tables:
 63 |             fieldstr = ', '.join(f'"{k}" {sqlite_type(v)}' for k,v in kwargs.items())
 64 |             self.con.execute(f'CREATE TABLE IF NOT EXISTS "{tblname}" ({fieldstr})')
 65 | 
 66 |         fieldnames = ','.join(f'"{x}"' for x in kwargs.keys())
 67 |         valholders = ','.join(['?']*len(kwargs))
 68 |         self.con.execute(f'INSERT INTO "{tblname}" ({fieldnames}) VALUES ({valholders})', tuple(pyobj_to_sqlite(v) for v in kwargs.values()))
 69 |         self.con.commit()
 70 |         return kwargs
 71 | 
 72 |     def table(self, tblname):
 73 |         return self.query(f'SELECT * FROM "{tblname}"')
 74 | 
 75 |     def select(self, tblname, **kwargs):
 76 |         tinfo = self.get_table_info(tblname)
 77 |         if not tinfo:
 78 |             return []
 79 | 
 80 |         wheres = [f'"{k}"=?' for k in kwargs.keys()]
 81 |         wherestr = ' AND '.join(wheres)
 82 |         results = self.query(f'SELECT * FROM "{tblname}" WHERE {wherestr}',
 83 |                               *tuple(kwargs.values()))
 84 | 
 85 |         return [AttrDict((k, sqlite_to_pyobj(v, tinfo[k]['type']))
 86 |                     for k, v in row.items()
 87 |                 ) for row in results]
 88 | 
 89 |     def query(self, qstr, *args):
 90 |         try:
 91 |             cur = self.con.cursor()
 92 |             res = cur.execute(qstr, args)
 93 |             return res.fetchall()
 94 |         except sqlite3.OperationalError as e:
 95 |             print(e, file=sys.stderr)
 96 |             return []
 97 | 
 98 |     def sql(self, qstr):
 99 |         return self.con.execute(qstr)
100 | 


--------------------------------------------------------------------------------
/aipl/ops/llm.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | !llm and !llm-embedding use the OpenAI API to make queries to GPT.
 3 | 
 4 | Requires OPENAI_API_KEY and OPENAI_API_ORG envvars to be set.
 5 | '''
 6 | 
 7 | from typing import List, Dict
 8 | import os
 9 | import subprocess
10 | from pathlib import Path
11 | 
12 | from aipl import defop, expensive, stderr, AIPLException, clients
13 | 
14 | 
15 | def _parse_msg(s:str):
16 |     if s.startswith('@@@s'):
17 |         return dict(role='system', content=s)
18 |     elif s.startswith('@@@a'):
19 |         return dict(role='assistant', content=s)
20 |     else:  # if s.startswith('@@@u'):
21 |         return dict(role='user', content=s)
22 | 
def op_llm_mock(aipl, v:str, **kwargs) -> str:
    'Stand-in for !llm: add the token-based cost of `v` to aipl.cost_usd and return a placeholder answer.'
    model = kwargs.get('model')
    ntokens = clients.count_tokens(v, model=model)
    # the pricing table entry is multiplied by tokens/1000
    aipl.cost_usd += clients.openai_pricing[model]*ntokens/1000
    return f'<llm {model} answer>'
29 | 
@defop('llm', 0, 0)
@expensive(op_llm_mock)
def route_llm_query(aipl, v:str, **kwargs) -> str:
    'Send chat messages to `model` (default: gpt-3.5-turbo).  Lines beginning with @@@s or @@@a are sent as system or assistant messages respectively (default user).  Passes all named args directly to API.'
    requested = kwargs.get('client')
    if requested is None:
        # no explicit client: self-hosted if an endpoint is configured, else OpenAI
        requested = 'selfhosted' if 'LLM_CLIENT_ENDPOINT' in os.environ else 'openai'

    if requested == 'selfhosted':
        client = clients.SelfHostedChatClient()
    elif requested == 'openai':
        client = clients.OpenAIClient()
    elif requested == 'gooseai':
        client = clients.GooseClient()
    else:
        raise AIPLException(f"client '{requested}' not recognized")

    # all kwargs (including `client`, if given) are passed through unchanged
    return client.completion(aipl, v, **kwargs)
51 | 
@defop('llm-embedding', 0, 0.5)
@expensive()
def route_llm_embedding_query(aipl, v:str, **kwargs) -> str:
    'Get a [text embedding](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) for a string from `model`: a measure of text-relatedness, to be used with e.g. !cluster.'
    model = kwargs.get('model')

    # GooseAI models are checked first, then the OpenAI pricing table
    if model in clients.gooseai_models:
        raise AIPLException("GooseAI embeddings not yet supported")
    if model in clients.openai_pricing:
        return embedding_openai(aipl, v, **kwargs)
    raise AIPLException(f"{model} not found!")
63 | 
def embedding_openai(aipl, v:str, **kwargs) -> dict:
    'Get an openai [text embedding](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) for a string: a measure of text-relatedness, to be used with e.g. !cluster.'
    import openai

    if not v:
        raise Exception('no content for embedding')

    # both variables must be present before the API is called
    have_credentials = 'OPENAI_API_KEY' in os.environ and 'OPENAI_API_ORG' in os.environ
    if not have_credentials:
        raise AIPLException('''OPENAI_API_KEY and OPENAI_API_ORG envvars must be set for !llm''')

    response = openai.Embedding.create(input=v, **kwargs)

    ntokens = response['usage']['total_tokens']
    stderr(f'Used {ntokens} tokens')

    # only the first embedding in the response is returned
    vector = response['data'][0]['embedding']
    return dict(model=kwargs.get('model'),
                used_tokens=ntokens,
                embedding=vector)


--------------------------------------------------------------------------------
/examples/wiki-to-map.aipl:
--------------------------------------------------------------------------------
 1 | !test-input
 2 | https://en.wikipedia.org/wiki/Antikythera_mechanism
 3 | 
 4 | # load text of wikipedia article
 5 | !read
 6 | !extract-text>text
 7 | 
 8 | # possibly, make a summary of the whole thing
 9 | !format
10 | The following is a wikipedia article, which we're interested in extracting locations from for a map.
11 | ---
12 | {text}
13 | ---
14 | Please write a 2-sentence summary of the article, focused on the overall location(s) mentioned in the text, which will be used in the next step to provide an overall context for extracting a location from each paragraph.
15 | !llm>summary model=gpt-3.5-turbo-16k
16 | 
17 | # go through each paragraph and extract text locations
18 | !format
19 | {text}
20 | !split>paragraph sep=\n
21 | !match ^[^-].{{140,}}$
22 | !filter 
23 | !format
24 | We're extracting locations from a paragraph of a wikipedia article. Here's our summary of the whole article: "{summary}".
25 | ---
26 | Here's the paragraph we want the locations from:
27 | {paragraph}
28 | ---
29 | Locations should be in quotes, comma-separated, and unique for the geocoding step. Add in the country or other major geographic container at the end of the location (eg, "New York City, NY" becomes "New York City, NY, USA", "Galapagos Archipelago" becomes "Galapagos Archipelago, Ecuador").
30 | 
31 | Please ONLY respond with a list of locations, nothing else--there is no need to add caveats or explanations, the answers will inevitably not be "perfect". If there are no locations mentioned in this specific paragraph, please only respond with a comma ",". Do not use locations from the summary unless they appear in the paragraph.
32 | !llm
33 | !split>location sep=,
34 | 
35 | !format
36 | The following is a location. Please respond with your best guess at to its latitude and longitude. DO NOT respond with anything except <LATITUDE>, <LONGITUDE>. You may infer the country of origin.
37 | 
38 | For example:
39 | LOCATION: New York City, NY
40 | GEOCODED: 40.712778, -74.006111
41 | 
42 | LOCATION: Wreck of the Titanic
43 | GEOCODED: 41.725556, -49.946944
44 | 
45 | LOCATION: Acropolis of Athens
46 | GEOCODED: 37.9715, 23.7262
47 | 
48 | LOCATION: {location}
49 | GEOCODED: 
50 | !llm>geocoded
51 | 
52 | !json indent=2
53 | 
54 | !print
55 | !save locations.json
56 | 
57 | # make into a map
58 | !python 
import json
import sys
import folium
import itertools

colors = itertools.cycle(['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue', 'darkpurple', 'pink', 'lightblue', 'lightgreen', 'gray', 'black', 'lightgray'])

# Load locations data from JSON file
with open('locations.json', 'r') as f:
    data = json.load(f)

# Initialize map centered around (0, 0)
m = folium.Map(location=[0, 0], zoom_start=4)

# List to store coordinates for lines
coords = []

# Parse and add each location to the map
for item in data:
    for subitem in item["_"]:
        if subitem["_"]:
            paragraph = subitem["paragraph"]
            # one marker color per paragraph that produced locations
            color = next(colors)
            for loc in subitem["_"]:
                geocoded = loc["geocoded"]
                location_name = loc["location"]
                try:
                    lat, lon = [float(coord) for coord in geocoded.split(',')]
                except (ValueError, AttributeError):
                    # not exactly two numbers, or not a string at all: skip this location
                    print("skipping", location_name, geocoded, file=sys.stderr)
                    continue
                marker = folium.Marker([lat, lon], icon=folium.Icon(color=color, icon="cloud"))
                marker.add_child(folium.Tooltip(text=location_name, permanent=True))
                marker.add_child(folium.Popup(paragraph, max_width=500))
                marker.add_to(m)

# Save the map to an HTML file
m.save('map.html')
96 | 


--------------------------------------------------------------------------------
/aipl/main.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import traceback
  4 | import argparse
  5 | 
  6 | from aipl import AIPL, Table, UserAbort, AIPLException, parse, repl
  7 | 
  8 | def parse_args(args):
  9 |     parser = argparse.ArgumentParser(description='AIPL interpreter')
 10 |     parser.add_argument('--debug', '-d', action='store_true', help='abort on exception')
 11 |     parser.add_argument('--test', '-t', action='store_true', help='enable test mode')
 12 |     parser.add_argument('--interactive', '-i', action='store_true', help='interactive REPL')
 13 |     parser.add_argument('--step', action='store', default='', help='call aipl.step_<func>(cmd, input) before each step')
 14 |     parser.add_argument('--step-breakpoint', '-x', action='store_const', dest='step', const='breakpoint', help='breakpoint() before each step')
 15 |     parser.add_argument('--step-rich', '-v', action='store_const', dest='step', const='rich', help='output rich table before each step')
 16 |     parser.add_argument('--step-vd', '--vd', action='store_const', dest='step', const='vd', help='open VisiData with input before each step')
 17 |     parser.add_argument('--dry-run', '-n', action='store_true', help='do not execute @expensive operations')
 18 |     parser.add_argument('--cache-db', '-c', action='store', default='aipl-cache.sqlite', dest='cachedbfn', help='sqlite database for caching operators')
 19 |     parser.add_argument('--no-cache', action='store_const', dest='cachedbfn', const='', help='sqlite database for caching operators')
 20 |     parser.add_argument('--output-db', '-o', action='store', default='aipl-cache.sqlite', dest='outdbfn', help='sqlite database accessible to !db operators')
 21 |     parser.add_argument('--split', '--separator', '-s', action='store', default='\n', dest='separator', help='separator to split input on')
 22 |     parser.add_argument('script_or_global', nargs='*', help='scripts to run, or k=v global parameters')
 23 |     return parser.parse_args(args)
 24 | 
 25 | 
 26 | 
 27 | def main():
 28 | 
 29 |     args = parse_args(None)
 30 |     global_parameters = {}
 31 |     scripts = []
 32 |     inputs = []
 33 | 
 34 |     for arg in args.script_or_global:
 35 |         if '=' in arg:
 36 |             key, value = arg.split('=', maxsplit=1)
 37 |             global_parameters[key] = value
 38 |         else:
 39 |             scripts.append(arg)
 40 | 
 41 |     if not scripts:  # nothing to run -> REPL
 42 |         args.interactive = True
 43 | 
 44 |     aipl = AIPL(**vars(args))
 45 | 
 46 |     # dup stdin/stdout if necessary
 47 | 
 48 |     if not sys.stdin.isatty():
 49 |         try:
 50 |             fin = open('/dev/tty')
 51 |             aipl.stdin = open(os.dup(0))
 52 |             os.dup2(fin.fileno(), 0)
 53 |             stdin_contents = aipl.stdin.read()
 54 |             fin.close()
 55 |         except OSError as e:
 56 |             aipl.stdin = sys.stdin
 57 |             stdin_contents = ''
 58 |     else:
 59 |         aipl.stdin = sys.stdin
 60 |         stdin_contents = ''
 61 | 
 62 |     if not sys.stdout.isatty():
 63 |         try:
 64 |             fout = open('/dev/tty', mode='w')
 65 |             aipl.stdout = open(os.dup(1), 'w')  # for dumping to stdout from interface
 66 |             os.dup2(fout.fileno(), 1)
 67 |             fout.close() # close file descriptors for original stdin/stdout
 68 |         except OSError as e:
 69 |             aipl.stdout = sys.stdout
 70 |     else:
 71 |         aipl.stdout = sys.stdout
 72 | 
 73 |     aipl.globals.update(global_parameters)
 74 | 
 75 |     # add input from stdin
 76 |     input_text = stdin_contents.strip()
 77 | 
 78 |     if args.separator:
 79 |         inputlines = input_text.split(args.separator)
 80 |     else:
 81 |         inputlines = [input_text]
 82 | 
 83 |     inputs.append(aipl.new_input(*inputlines))
 84 | 
 85 |     try:
 86 |         for fn in scripts:
 87 |             inputs = aipl.run(open(fn).read(), inputs)
 88 | 
 89 |         if not scripts or args.interactive:
 90 |             repl(aipl, inputs)
 91 |     except UserAbort as e:
 92 |         print(f'aborted', e, file=sys.stderr)
 93 |         sys.exit(2)
 94 |     except AIPLException as e:
 95 |         print(e, file=sys.stderr)
 96 |         sys.exit(1)
 97 |     finally:
 98 |         if aipl.cost_usd:
 99 |             print(f'total cost: ${aipl.cost_usd:.02f}', file=sys.stderr)
100 | 


--------------------------------------------------------------------------------
/aipl/test_parse.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import textwrap
  3 | 
  4 | from .parser import parse
  5 | 
def test_single_line():
    'Several !ops on one line parse into separate commands.'
    assert ops(parse("!one !two !three\n")) == ["one", "two", "three"]

def test_simple_varname():
    '`>name` attached directly to the op is recorded in varnames.'
    command = parse("!split>output\n")
    assert command[0].opname == "split"
    assert command[0].kwargs == {}
    assert command[0].args == []
    assert command[0].varnames == ["output"]

def test_varname_afterwards():
    '`>name` may also come after the arguments.'
    command = parse("!op arg >var")
    assert command[0].opname == "op"
    assert command[0].args == ["arg"]
    assert command[0].varnames == ["var"]

def test_global():
    '`>>name` is recorded in globals, not in varnames.'
    command = parse("!op arg >>global_name")
    assert command[0].opname == "op"
    assert command[0].args == ["arg"]
    assert command[0].varnames == []
    assert command[0].globals == ["global_name"]

def test_split_newlines():
    'A backslash-n in a kwarg value is unescaped to a real newline.'
    command = parse("!split sep=\\n\n")
    assert command[0].opname == "split"
    assert command[0].kwargs == {"sep": "\n"}

def test_trailing_empty():
    'A blank line between commands yields no extra command and no kwargs.'
    commands = parse("!split\n\n!ravel\n")

    assert ops(commands) == ["split", "ravel"]

    assert commands[0].kwargs == {}
    assert commands[1].kwargs == {}

def test_no_final_newline():
    'A command without a trailing newline still parses.'
    commands = parse("!split")
    assert ops(commands) == ["split"]


def test_no_final_newline_prompt():
    'Prompt text without a trailing newline is still captured.'
    commands = parse("!split\nsome text")
    assert commands[0].opname == "split"
    assert commands[0].prompt == "some text"
 51 | 
 52 | 
def test_random_spaces():
    'The multi-line prompt goes to the last command on the line; see the expected string for how indentation is trimmed.'
    commands = parse("!a !b  \n c  d\n  \n d\n  e\n")
    assert ops(commands) == ["a", "b"]
    assert commands[0].args == []
    assert commands[0].kwargs == {}
    assert commands[1].prompt == "c  d\n\nd\n e"


def test_args():
    'Bare words after the op become positional args, in order.'
    commands = parse("!fn arg1 arg2 arg3")
    assert commands[0].args == ["arg1", "arg2", "arg3"]


def test_args_with_kwargs():
    'Positional args and key=value kwargs may be interleaved.'
    commands = parse("!fn arg1 key=abc arg2 key2=def arg3")
    assert commands[0].args == ["arg1", "arg2", "arg3"]
    assert commands[0].kwargs == {"key": "abc", "key2": "def"}


def test_nested_parse():
    'An indented block under !!def is kept as that command\'s prompt; the dash in an op name becomes an underscore.'
    commands = parse(textwrap.dedent('''
    !!def split-join
     !split

     !join

    !split-join
    '''))

    assert ops(commands) == ["def", "split_join"]
    assert commands[0].prompt == "!split\n\n!join"

def test_quoted():
    'Double-quoted args are unquoted and their escapes processed.'
    commands = parse(r'!fn "arg1" "\"\n"')
    assert commands[0].args == ["arg1", '"\n']

def test_single_quoted():
    'Single-quoted args are unquoted and their escapes processed.'
    commands = parse(r"!fn 'arg1' '\'\n'")
    assert commands[0].args == ["arg1", "'\n"]

def test_numbers():
    'Numeric-looking args are converted to int or float.'
    commands = parse("!fn 1 2.0 3.0e10 -3 -2e-7")
    assert commands[0].args == [1, 2.0, 3.0e10, -3, -2e-7]
 96 | 
def test_input_cols():
    '`<name` arguments are collected in input_cols, in order.'
    commands = parse("!split <b sep=: <a")
    assert commands[0].opname == "split"
    assert commands[0].args == []
    assert commands[0].kwargs == {"sep": ":"}
    assert commands[0].input_cols == ["b", "a"]

def test_input_globals():
    '`<<name` arguments are collected in input_tables, in order.'
    commands = parse("!split <<b sep=: <<a")
    assert commands[0].opname == "split"
    assert commands[0].args == []
    assert commands[0].kwargs == {"sep": ":"}
    assert commands[0].input_tables == ["b", "a"]

def test_inline_prompt():
    'A `<<` followed by a space makes the rest of the line the prompt.'
    commands = parse("!split sep=: << a:b:c")
    assert commands[0].opname == "split"
    assert commands[0].args == []
    assert commands[0].kwargs == {"sep": ":"}
    assert commands[0].prompt == "a:b:c"

def test_multiple_commands():
    'An inline prompt belongs to the last command on the line.'
    commands = parse("!a !b !c << a:b:c")
    assert commands[0].opname == "a"
    assert commands[1].opname == "b"
    assert commands[2].opname == "c"
    assert commands[2].args == []
    assert commands[2].kwargs == {}
    assert commands[2].prompt == "a:b:c"

def test_inline_prompt_with_newline():
    'An inline prompt continues onto the following lines.'
    commands = parse("!split sep=: << a:b:c\nd:e:f\ng: :h\n")
    assert commands[0].opname == "split"
    assert commands[0].args == []
    assert commands[0].kwargs == {"sep": ":"}
    assert commands[0].prompt == "a:b:c\nd:e:f\ng: :h"
133 | 
134 | 
def ops(commands):
    'Return the opname of each parsed command, in order.'
    names = []
    for cmd in commands:
        names.append(cmd.opname)
    return names
137 | 


--------------------------------------------------------------------------------
/docs/operators.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## An AIPL script is essentially a list of operators.
 3 | 
 4 | When an AIPL script is executed, each `!operator` is applied in sequence over all rows in the input.
 5 | 
 6 | Each operator uses 0 ("nonary"), 1 ("unary"), or 2 ("binary") operands on which to perform a certain operation and produce a resulting value.
 7 | Some operators may be "ambinary", and are able to be applied as either unary or binary operators.
 8 | 
 9 | ## The input
10 | 
11 | There is always a "most recent" value, which is the toplevel result of the previously applied operator.
12 | This value is commonly referred to as the "input", and it provides the set of default operands for the next unary operator, or the default "left" operands for the next binary operator.
13 | 
14 | ##### Note
15 | An operator that does not produce a result is a "tap", presumably having some desirable side-effect.  It may use its input operands but must not modify them.
16 | 
17 | ## The output
18 | 
19 | The operator is applied across the input rows, and these outputs are aggregated into a single "result", which immediately becomes the next "input".
20 | 
21 | The resulting table can be assigned to a name with `>>`, e.g. `!join>>foo`. It can be referred to by name anywhere later in the script.
22 | 
23 | A single `>`, e.g. `>bar`, is used to assign a name to the bottommost column(s) of scalars, which are then available for formatting as `{bar}` in arguments and elsewhere.
24 | 
25 | ## Tacit dataflow
26 | 
27 | This completely tacit dataflow is great for unary (and nonary) operators.
28 | 
29 | For binary operators, the second or "right" operand can be passed as a special argument, e.g. `!cross <<foo`, where `foo` must have been a previous result named with `>>`.
30 | 
31 | Alternatively, the text on the lines below the operator, commonly called "the prompt", will be passed as the second operand.  For unary operators, if there is any non-whitespace text in the prompt, the prompt will override the default input and be passed as the first operand instead.  The result of this operator becomes the input, so the previous result must be named or it will be lost forever!!
32 | 
33 | A lone `<` as an argument signifies that everything until the end of the line is taken to be the prompt.
34 | 
35 | If `<` is at the end of a !command line, then a prompt is expected, and the REPL will read text until EOF.
36 | (In non-REPL mode, `<\n!` would force the input operand to be an empty string.)
37 | 
38 | ## Tacit looping
39 | 
40 | The input may have as many as 99 dimensions, but operand(s) can only have 0 or 1 dimensions (until actual matrix operations are implemented, but then the limit would be 2).
41 | 
42 | Each operator must specify the dimensionality of its operands (using defop kwargs `rankin` and `rankin2`).
43 | 
44 | ### Unary operators
45 | When a unary operator is applied to an input with higher dimensionality, the operator will be applied recursively to each of the input's values.
46 | The result will have the same "outer" structure as the input, while the "innermost" values will be the output values of the operator.
47 | 
48 | Each row containing the input value with the lowest dimensionality will be augmented with the output of the operator applied to it.
49 | 
50 | A column will be added to the tables containing those rows, such that the row and table values will now be these most recent results.
51 | 
52 | ### Binary operators
53 | 
54 | The left operand is the operator's "prompt" in the script or the current "input". The right operand must be a scalar or a toplevel output (for now).
55 | If needed, looping over the right operand must be done manually by the operator.
56 | 
57 | ## Rows and columns
58 | 
59 | All scalars and vectors are actually projections of "rows" and "tables", respectively.  The "value" of a row is a (boxed) scalar or another table.  A "simple row" has a scalar value.  The value of a table is a vector of the values of its rows.  A simple table has a value that is a vector of scalars.
60 | 
61 | A simple row is like `0-plus` or `0.5` dimension.  A simple table is like `1-plus` or `1.5` dimension.
62 | 
63 | An opaque row can have other potential data besides its value.
64 | A column knows how to get a particular projection of data from an opaque row.
65 | 
66 | A table is a list of opaque rows and a list of columns.
67 | The table can generate a list of virtual rows, one for each opaque row, which appear as mappings from column names to values.
68 | 
69 | A row can be part of many tables.
70 | Each column is on only one table, but a specific opaque row can be part of many tables.
71 | 
72 | The opaque row contains all the data, so both columns and tables can be lightweight objects.
73 | If rows are augmented but never need to be copied, then generating both column- and row-wise subsets of tables is a lightweight process.
74 | 
75 | ## Tacit context of previous results
76 | 
77 | [TBW]
78 | 


--------------------------------------------------------------------------------
/about/23-faq.md:
--------------------------------------------------------------------------------
 1 | # FAQ
 2 | 
 3 | ## Q: why would I not just write some code?
 4 | 
 5 | First of all, AIPL is code.  In fact it's entirely based on Python, and allows you to start writing code by using the `!python` operator.
 6 | However, there are reasons why you might not want to just write Python code in the first place.
 7 | 
 8 | The biggest being: you shouldn't over-engineer prototypes.
 9 | 
10 | Yes, if you were going to put some AI workflow into production, for "real" users, you would probably want to write some "real" code!
11 | You might need to consider factors like implementing real-time responses, handling large data volumes, or incorporating custom code within a loop. But that's going to cost real time, real money, and real skill.
12 | 
13 | Before reaching that stage, you need to know how your idea can be done with AI in its current form.
14 | You may need to explore which of the available models might be better or cheaper, figure out how exactly you would have to organize the pipeline so that you can get the results you need, and engineer the literal prompts themselves.
15 | You might even have to scrap the idea altogether if you can't get GPT (or whatever LLM) to respond accurately--and if that's the case, you want to find that out quickly, before investing any real resources.
16 | 
17 | You want something quick-and-dirty to experiment with. You want to be able to whip up a prototype in a couple hours.
18 | 
19 | But you need something bigger than prompting directly to ChatGPT within the browser. It's fine for testing one thing, and you can do the pre- and post-processing yourself by hand.  For anything greater than N=1, though, you're already wanting something more reproducible.
20 | 
21 | For instance, here's a script to summarize any number of webpages: https://github.com/saulpw/aipl/blob/develop/examples/summarize.aipl
22 | 
23 | To do this in Python would involve being explicit about iteration, caching, error-handling, and the result would be a more unwieldy script, with the requisite quoting and/or escaping, code out-of-order and perhaps code scattered across multiple files, even the boilerplate--these things add friction for someone who knows Python, and make it impossible for a non-coder.
24 | 
25 | At the tiny sets (N=10 or N=100) we use to validate our ideas, we want our focus to be on the experiments themselves as much as possible.
26 | 
27 | There's a progression of computational tools: from calculators, to spreadsheets, to notebooks, to scripts, to programs, to systems.  Each level gives you more power and flexibility, but requires more attention and skill.
28 | 
29 | In this context, ChatGPT is only a calculator, while Python is used to create programs and systems. Python notebooks are useful but have their quirks, and don't scale well without explicit adjustments.
30 | 
31 | AIPL fills the gap between notebooks and programs, functioning as a platform for scripts. Scripts are less flexible than full programs but are easier to write, modify, and maintain. They are self-contained in a single text file, making them easy to share and understand. AIPL scripts provide a clear flow of operations and include required content inline.
32 | 
33 | It's like asking, why would I write a bash script, why would I not just write some code?  And sure!  You might want to do that.  But maybe you start with `cut | sort | uniq | sort -n` and see if that gets the job done in a fraction of the time.
34 | 
35 | ## Q: What about the name "AIPL"?
36 | 
37 | "AIPL" pays homage to APL, the original array language by Ken Iverson in the 1960s, which inspired some [core design decisions in AIPL](../23-design.md).
38 | Though AIPL is a generic pipeline language and not at all limited to AI, its first use case was for LLM experiments, and some features (like inline prompts) are particularly useful in the LLM world.
39 | Also when GPT-4 suggested "Array-Inspired Pipeline Language" (I fed it the README and asked for a list of 5 backronyms), it pretty much nailed it, and that sealed the deal.
40 | 
41 | ## Q: What's the basic concept of AIPL?
42 | 
43 | AIPL is "just" a thin layer over Python, offering various operators for data processing and calling LLMs. New operators are regularly added. Users can even create their own commands using a function decorator (@defop(...)).
44 | 
45 | The role of AIPL is to execute these Python code snippets consistently across all data in predictable ways. It handles input, output, caching, logging, error handling, and the parallelism. And it turns out that infrastructure is 90% of the work of building a pipeline.
46 | 
47 | Users are then able to focus on the literal essence of their work: finding the data, arranging the appropriate operators in the correct order with appropriate parameters and prompts. We'll take care of the rest.
48 | 
49 | AIPL can be used for prototypes up to about a million rows, and the resulting .aipl script is the ultimate reference of the 'secret sauce'.
50 | 
51 | Then if/when the need arises for scaling or for a real-time usage pattern, converting the AIPL prototype into a "real" pipeline requires less effort than it would have taken to develop the initial implementation. It is a low-risk process, especially if users write their own runner in Python using another library. [porting cookbooks for $]
52 | 


--------------------------------------------------------------------------------
/aipl/parser.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Callable
  2 | import textwrap
  3 | import sys
  4 | from dataclasses import dataclass
  5 | import ast
  6 | from collections import defaultdict
  7 | from lark import Lark, Transformer, Discard, Token
  8 | 
# Lark grammar for AIPL scripts.
#
# - A script (`start`) is a sequence of `line`s; a line is either empty or
#   one or more commands followed by a `prompt`.
# - A `command` is a `!` or `!!` sign, an operator name and a list of args.
# - An `arg` is one of: `key=literal`, a bare or quoted literal, `>name`,
#   `>>name`, `<name` or `<<name`.
# - A `prompt` starts either inline after `<< ` or on the next line, and
#   consists of lines that do not begin with `!` or `#`.
# - Lines beginning with `#` are comments and are ignored.
#
# The rule and terminal names matter: ToAst defines a method of the same
# name for each node it transforms.
aipl_grammar = Lark(r'''
start: line*

ws: [ _WS ]
_WS: /[ \t]+/

line: commands prompt | _EMPTY_LINE

commands: (command)+
command: command_sign OPNAME arg_list ws

OPNAME: IDENTIFIER

?command_sign: /!!?/

_EMPTY_LINE: "\n"

varname: ">" IDENTIFIER
globalname: ">>" IDENTIFIER
input_col: "<" IDENTIFIER
input_table: "<<" IDENTIFIER

arg_list: arg*

arg: ws (KEY "=" literal | literal | varname | globalname | input_col | input_table)

?literal: BARE_STRING | ESCAPED_STRING
BARE_STRING: /[^ \t\n!"'><]\S*/

ESCAPED_STRING: /(["']).*?(?<!\\)\1/

KEY: IDENTIFIER

IDENTIFIER: /[A-Za-z0-9_-]+/

prompt: ("<< " | "\n") STRING_LINE*
STRING_LINE: /[^!#\n][^\n]*(\n|$)/m | "\n"

COMMENT_LINE: /^#[^\n]*\n/m
%ignore COMMENT_LINE
''', propagate_positions=True)
 50 | 
 51 | 
 52 | @dataclass
 53 | class Command:
 54 |     opname:str
 55 |     op: Callable|None
 56 |     varnames:List[str]
 57 |     globals: List[str]
 58 |     input_tables: List[str]
 59 |     input_cols: List[str]
 60 |     immediate:bool
 61 |     args:list
 62 |     kwargs:dict
 63 |     linenum:int
 64 |     prompt:str
 65 | 
 66 |     def __str__(self):
 67 |         return f'-> {self.opname} (line {self.linenum-1})'  # first line is implicit !!python
 68 | 
 69 | 
 70 | class ToAst(Transformer):
 71 |     def line(self, tree):
 72 |         if len(tree) == 0:
 73 |             return tree
 74 |         (commands, prompt) = tree
 75 |         if prompt:
 76 |             commands[-1].prompt = prompt
 77 |         return commands
 78 | 
 79 |     def commands(self, tree):
 80 |         return list(tree)
 81 | 
 82 |     def start(self, tree):
 83 |         output = []
 84 |         for line in tree:
 85 |             output.extend(line)
 86 |         return output
 87 | 
 88 |     def command(self, tree):
 89 |         command_sign, opname, arguments = tree
 90 | 
 91 |         return Command(
 92 |             opname=opname,
 93 |             op=None,
 94 |             linenum=command_sign.line,
 95 |             immediate=command_sign.value == '!!',
 96 |             varnames=arguments['varnames'],
 97 |             globals=arguments['globalnames'],
 98 |             input_cols=arguments['input_cols'],
 99 |             input_tables=arguments['input_tables'],
100 |             args=arguments['args'],
101 |             kwargs=dict(arguments['kwargs']),
102 |             prompt=None,
103 |         )
104 | 
105 |     def OPNAME(self, token):
106 |         return clean_to_id(token.value)
107 | 
108 |     def command_prompt(self, tree):
109 |         command, prompt = tree
110 |         if prompt:
111 |             command.kwargs['prompt'] = prompt
112 |         return command
113 | 
114 |     def arg_list(self, arg_list):
115 |         arguments = defaultdict(list)
116 | 
117 |         for key, arg in arg_list:
118 |             arguments[key].append(arg)
119 | 
120 |         return arguments
121 | 
122 |     def varname(self, tree):
123 |         return ('varnames', tree[0])
124 | 
125 |     def globalname(self, tree):
126 |         return ('globalnames', tree[0])
127 | 
128 |     def input_table(self, tree):
129 |         return ("input_tables", tree[0])
130 | 
131 |     def input_col(self, tree):
132 |         return ("input_cols", tree[0])
133 | 
134 |     def arg(self, tree):
135 |         if isinstance(tree[0], tuple):
136 |             return tree[0]
137 | 
138 |         if isinstance(tree[0], Token) and tree[0].type == 'KEY':
139 |             return ('kwargs', (clean_to_id(tree[0].value), tree[1]))
140 | 
141 |         return ('args', tree[0])
142 | 
143 |     def prompt(self, lines):
144 |         prompt = textwrap.dedent(''.join(token.value for token in lines)).strip()
145 |         if not prompt:
146 |             return None
147 |         return prompt
148 | 
149 |     def ws(self, tree):
150 |         return Discard
151 | 
152 |     def BARE_STRING(self, token):
153 |         return trynum(token.value)
154 | 
155 |     def IDENTIFIER(self, token):
156 |         return token.value
157 | 
158 |     def ESCAPED_STRING(self, token):
159 |         return ast.literal_eval(token.value)
160 | 
def parse(program_text):
    """Parse AIPL source text and return the result of the ToAst transform."""
    # a newline is appended before the text is handed to the grammar
    tree = aipl_grammar.parse(program_text + "\n")
    transformer = ToAst()
    return transformer.transform(tree)
164 | 
165 | 
166 | def trynum(x:str) -> int|float|str:
167 |     try:
168 |         return int(x)
169 |     except Exception:
170 |         try:
171 |             return float(x)
172 |         except Exception:
173 |             return x.replace('\\n', '\n').replace('\\t', '\t').replace('\\\\', '\\')
174 | 
175 | 
def clean_to_id(s:str) -> str:
    """Turn dashes into underscores and drop leading/trailing `!` characters."""
    dash_to_underscore = str.maketrans('-', '_')
    return s.translate(dash_to_underscore).strip('!')
178 | 
179 | 
if __name__ == '__main__':
    # Debugging aid: parse every file named on the command line, then print
    # its parse tree followed by the commands produced from it.
    for file in sys.argv[1:]:
        print("Parsing: ", file)
        # read inside a context manager so the file handle is closed promptly
        with open(file) as fp:
            source = fp.read()
        # prepend `!!python` to the input to correctly handle any leading python code
        # see also: AIPL.run() method in interpreter.py
        parse_tree = aipl_grammar.parse('!!python\n' + source)
        print("Parse tree: ", parse_tree.pretty())
        for command in ToAst().transform(parse_tree):
            print(command)
189 | 


--------------------------------------------------------------------------------
/aipl/clients.py:
--------------------------------------------------------------------------------
  1 | from aipl import defop, expensive, stderr, AIPLException
  2 | import openai
  3 | import os
  4 | 
  5 | # from the horse's mouth, 2023-05-30
  6 | openai_pricing = {
  7 |     "gpt-4": 0.06,
  8 |     "gpt-4-32k": 0.12,
  9 |     "gpt-3.5-turbo": 0.002,
 10 |     "gpt-3.5-turbo-16k": 0.002,
 11 |     "text-ada-001": 0.0016,
 12 |     "text-babbage-001": 0.0024,
 13 |     "text-curie-001": 0.0120,
 14 |     "text-davinci-003": 0.1200
 15 | }
 16 | 
 17 | # base price covers the first 25 tokens, then it's the per-token price (2023-06-06)
 18 | gooseai_models = {
 19 |     "fairseq-13b": {
 20 |         "pricing": {
 21 |             "base": 0.001250,
 22 |             "token": 0.000036
 23 |         },
 24 |         "encoding": ""
 25 |     },
 26 |     "gpt-neo-20b": {
 27 |         "pricing": {
 28 |             "base": 0.002650, 
 29 |             "token": 0.000063
 30 |         },
 31 |         "encoding": "gpt2"
 32 |     }
 33 | }
 34 | 
 35 | def _parse_msg(s:str):
 36 |     if s.startswith('@@@s'):
 37 |         return dict(role='system', content=s)
 38 |     elif s.startswith('@@@a'):
 39 |         return dict(role='assistant', content=s)
 40 |     else:  # if s.startswith('@@@u'):
 41 |         return dict(role='user', content=s)
 42 | 
 43 | 
 44 | def count_tokens(s:str, model:str=''):
 45 |     try:
 46 |         import tiktoken
 47 |         enc = tiktoken.encoding_for_model(model)
 48 |         return len(enc.encode(s))
 49 |     except ModuleNotFoundError as e:
 50 | #        stderr(str(e))
 51 |         return len(s)//4
 52 |     except KeyError as e:
 53 |         # just estimate
 54 |         return len(s)//4
 55 | 
 56 | 
 57 | class StandardClient:
 58 |     def compute_cost(self, aipl, resp, model):
 59 |         if self.client_type == 'openai':
 60 |             used = resp['usage']['total_tokens']
 61 |             result = resp['choices'][0]['message']['content']
 62 |             cost = openai_pricing[model]*used/1000
 63 |             if aipl: # makes it easier to run unit tests
 64 |                 aipl.cost_usd += cost
 65 | 
 66 |             stderr(f'Used {used} tokens (estimate {len(result)//4} tokens).  Cost: ${cost:.03f}')
 67 |         elif self.client_type == 'selfhosted':
 68 |             stderr('Used TODO tokens. Cost: $¯\\_(ツ)_/¯')
 69 | 
 70 |     def completion(self, aipl, v:str, **kwargs) -> str:
 71 |         'Send chat messages to GPT.  Lines beginning with @@@s or @@@a are sent as system or assistant messages respectively (default user).  Passes all [named args](https://platform.openai.com/docs/guides/chat/introduction) directly to API.'
 72 |         model = kwargs.get('model') or self.default_model
 73 |         temperature = kwargs.get('temperature') or 0
 74 |         params = dict(
 75 |             temperature=float(temperature),
 76 |             top_p=1,
 77 |             frequency_penalty=0,
 78 |             presence_penalty=0,
 79 |             model=model
 80 |         )
 81 |         params.update(kwargs)
 82 | 
 83 |         # TODO: there must be a less hacky way of doing these
 84 |         params['temperature'] = float(params['temperature'])
 85 |         if 'client' in params:
 86 |             del params['client']
 87 | 
 88 |         role = 'user'
 89 |         def _get_role_msg(s:str):
 90 |             if s.startswith('@@@s'):
 91 |                 return 'system', s[4:]
 92 |             elif s.startswith('@@@a'):
 93 |                 return 'assistant', s[4:]
 94 |             elif s.startswith('@@@u'):
 95 |                 return 'user', s[4:]
 96 |             else:
 97 |                 return role, s
 98 | 
 99 |         msgs = []
100 |         for m in v.splitlines():
101 |             role, msg = _get_role_msg(m)
102 |             if msgs and msgs[-1]['role'] == role:
103 |                 msgs[-1]['content'] += '\n' + msg
104 |             else:
105 |                 msgs.append(dict(role=role, content=msg))
106 | 
107 |         resp = openai.ChatCompletion.create(
108 |             messages=msgs,
109 |             **params
110 |         )
111 |         try:
112 |             result = resp['choices'][0]['message']['content']
113 |         except:
114 |             raise AIPLException(resp)
115 |         self.compute_cost(aipl, resp, model)
116 | 
117 |         return result
118 | 
119 | 
class GooseClient(StandardClient):
    'Client for GooseAI; completions are requested from its HTTP API with `requests`.'

    def __init__(self):
        api_key = os.environ.get('GOOSE_AI_KEY')
        if api_key is None:
            raise AIPLException(f'''GOOSE_AI_KEY envvar must be set to use gooseai client type''')
        self.client_type = 'gooseai'
        self.default_model = 'gpt-neo-20b'
        # also point the openai library at GooseAI
        openai.api_key = api_key
        openai.api_base = "https://api.goose.ai/v1"

    def completion(self, aipl, v, **kwargs):
        'Send *v* as the prompt to a GooseAI engine and return the completion text; the cost is added to *aipl.cost_usd* when *aipl* is given.'
        import requests

        model = kwargs.get('model') or self.default_model
        if 'GOOSE_AI_KEY' not in os.environ:
            raise AIPLException(f'''GOOSE_AI_KEY envvar must be set for !llm to use {model}''')
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {os.environ["GOOSE_AI_KEY"]}'
        }
        # temperature defaults to 0; every kwarg is sent along in the request body
        # TODO: GooseAI supports multiple prompt completions in parallel
        payload = {'prompt': v, 'temperature': 0, **kwargs}
        url = f'https://api.goose.ai/v1/engines/{model}/completions'
        reply = requests.post(url, headers=headers, json=payload).json()
        if 'error' in reply:
            raise AIPLException(f'''GooseAI returned an error: {reply["error"]}''')

        response = reply['choices'][0]['text']
        model_info = gooseai_models[model]
        # Only output tokens are charged
        used = count_tokens(response, model_info['encoding'])
        # GooseAI's base cost provides the first 25 tokens, then each token after is charged at the token rate
        pricing = model_info['pricing']
        cost = pricing['token'] * max(0, used-25) + pricing['base']
        if aipl:
            aipl.cost_usd += cost
        stderr(f'Used {used} tokens (estimate {len(v)//4} tokens).  Cost: ${cost:.03f}')
        return response
159 | 
160 | 
class OpenAIClient(StandardClient):
    'Client for the OpenAI chat API; both credential envvars must be present.'

    def __init__(self):
        required = ('OPENAI_API_KEY', 'OPENAI_API_ORG')
        if not all(name in os.environ for name in required):
            raise AIPLException('''OPENAI_API_KEY and OPENAI_API_ORG envvars must be set for openai client type''')
        self.default_model = 'gpt-3.5-turbo'
        self.client_type = 'openai'
167 | 
168 | 
class SelfHostedChatClient(StandardClient):
    'Client for a self-hosted endpoint reached through the openai library (endpoint taken from LLM_CLIENT_ENDPOINT).'

    def __init__(self):
        if 'LLM_CLIENT_ENDPOINT' not in os.environ:
            raise AIPLException('''LLM_CLIENT_ENDPOINT envvar must be set for selfhosted client type''')
        openai.api_base = os.environ['LLM_CLIENT_ENDPOINT']
        self.client_type = 'selfhosted'
        # Always define default_model (None when DEFAULT_SELFHOSTED_MODEL is
        # unset) so that reading it later cannot raise AttributeError.
        self.default_model = os.environ.get('DEFAULT_SELFHOSTED_MODEL')
177 | 
178 | 
if __name__ == "__main__":
    # Manual smoke test: ask each client type for a short completion of the
    # same prompt and print the result.
    max_tokens = 10
    prompt = '''A lesser-known robot character from sci-fi is'''

    for label, client_cls in (
        ('openai', OpenAIClient),
        ('gooseai', GooseClient),
        ('selfhosted', SelfHostedChatClient),
    ):
        print(f'{label}\n', prompt, client_cls().completion(None, prompt, max_tokens=max_tokens))
186 | 


--------------------------------------------------------------------------------
/about/23-design.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Design
  3 | 
  4 | AIPL is intended as a simple platform for quick proof of concept AI-based data pipelines to be implemented and tested.
  5 | 
  6 | ## Why?
  7 | 
  8 | The recent developments in LLMs and AI are a whole new level of capabilities (and costs).
  9 | I wanted to see what all the fuss was about, so I tried to do some basic things with [langchain](https://github.com/hwchase17/langchain) but it was this big complicated system.
 10 | So instead I implemented some small workflows myself with raw Python, and it turned out that AI is actually pretty straightforward.
 11 | This is a small hackable platform that makes it easy to experiment and get small-scale results.
 12 | It's called AIPL.
 13 | 
 14 | ### Emphasize the Dataflow
 15 | 
 16 | An AIPL script represents the essence of a data pipeline, with only the high-level operations and their parameters and prompts.
 17 | No boilerplate or quoting or complicated syntax.
 18 | Not even much of a language--just commands executed in order.
 19 | This keeps the focus on data flow and the high-level operations--the actual links in the chain.
 20 | It can be expanded or optimized or parallelized as needed.
 21 | 
 22 | ### Very Little Overhead
 23 | 
 24 | AIPL is array-oriented and concatenative, drawing inspiration from APL and Forth, both of which have powerful operators and very simple syntax.
 25 | Passing data implicitly between operators allows for efficient representation of data flows, and avoids [one of the hardest problems in computer science](https://www.namingthings.co/).
 26 | And the implicit looping of array languages makes it easier to scale interactivity.
 27 | 
 28 | ### Take Advantage of Python Ecosystem
 29 | 
 30 | AIPL is also intended to be practical (if only at small scale), so operators are easy to write using the existing cadre of Python libraries, and allow options and parameters passed to them verbatim.
 31 | 
 32 | ### Keep It Simple
 33 | 
 34 | The implementation is intentionally homespun, to remove layers of abstraction and reduce the friction of setup and operation.
 35 | It doesn't parallelize anything yet but it still should be able to handle hundreds of items even as it is, enough to prove a concept.
 36 | I expect it to be straightforward to scale it to mag 5 (up to a million items) if something takes off.
 37 | 
 38 | ### Learn and Explore
 39 | 
 40 | At the very least, AIPL should be a useful tool to learn, explore, and prototype small-scale data pipelines that have expensive operations like API calls and LLM generation.
 41 | 
 42 | ## What is "implicit looping"?
 43 | 
 44 | It's a concept borrowed from APL.
 45 | 
 46 | Yes, APL, that language from the 60s that looks like this:
 47 | 
 48 | ```
 49 | avg ← +⌿ ÷ ≢
 50 | ```
 51 | 
 52 | Now before you run away screaming, there are 3 big ideas in APL, and they are why Iverson won the Turing Award in 1979:
 53 | 
 54 | 1. implicit looping and tacit programming
 55 | 
 56 |     - brilliant, removes a large amount of code.  beyond just the loops too
 57 | 
 58 | 2. symbols
 59 | 
 60 | APL uses a special set of non-text symbols, a custom alphabet that nearly predates ASCII itself.
 61 | This is why it looks like alien gibberish to the uninitiated, and why APL has all but died out.
 62 | [Iverson's paper and talk for the Turing Award is entitled [Notation as a Tool of Thought](https://dl.acm.org/doi/pdf/10.1145/358896.358899),
 63 | so "notation" is ironically the focus *and* the fallacy of APL.]
 64 | 
 65 | The symbology is math-based (as APL is a language for teaching and doing linear algebra), and is elegantly designed, but the idea is unfortunately a non-starter for modern adoption.
 66 | 
 67 | People think in *words* or word-like chunks, and while letters of the alphabet make up the words, they are only symbols, and they carry zero semantic content.
 68 | Learning a new alphabet (and one with combinatoric semantics) is a huge barrier to learning a new language.
 69 | 
 70 | Now, I agree with Iverson's fundamental premise, that a sub-verbal facility with a core set of operations, is a tremendous tool for thought.
 71 | But the actual terse and non-verbal notation ultimately prevented APL from being in common use.
 72 | 
 73 | 3. vocabulary
 74 | 
 75 | APL defines an elegant core set of operators that are just the right level of abstraction for math, particularly linear algebra.
 76 | This is why APL-like languages are still used in the finance world: you can get a lot done quickly, and with very little code, when you know the domain and the operators are at the right level of abstraction and you can fit them in your head.
 77 | 
 78 | The real art of software is in developing a set of legos that are easily explainable and interoperate well together, conforming to an intuitive yet precisely-defined connection spec.
 79 | When done well, these legos are composable without anything else necessary to bind one's input to the other's output.  Then tacit programming becomes not just possible but desired.
 80 | 
 81 | ---
 82 | 
 83 | So AIPL borrows implicit looping and tacit programming from APL, and lets go of its alien symbology.
 84 | AIPL also borrows some of APL's vocabulary, but since data pipelines are a much different domain than math (and much more has been developed in the data domain over the past 50 years), we need to develop a different set of operators.
 85 | 
 86 | So AIPL is also a *vocabulary discovery platform*.
 87 | It is easy to add new operations in Python.
 88 | AIPL is really just a skin over Python; a notebook in script form.
 89 | 
 90 | ## For "port-able" prototypes
 91 | 
 92 | Once you have the operators in the right order and with the right parameters, it's "just" engineering to call them at the right time, with the right batch size, in the right format, caching the results, etc.
 93 | You have a "script", like for a movie.
 94 | 
 95 | You still have to do all the engineering; you have to put it into production.
 96 | But with the script, you know what's required, and you can start to plan out the process of development.
 97 | 
 98 | You can *port* it.
 99 | 
100 | Don't over-engineer your experiments and your prototypes.
101 | Just put legos together in a logical order and see how the whole chain works.
102 | Tune, iterate, and discover quickly if your idea is viable or not.
103 | 
104 | # The data table
105 | 
106 | Operators take 0, 1, or 2 "operands with dimensionality", and any number of scalar (int/float/str) parameters.
107 | 
108 | These "operands with dimensionality" are used like arrays in traditional array languages.
109 | However, those have multidimensional arrays of scalars, whereas AIPL operands are more like nestable database tables.
110 | 
111 | These tables have:
112 | 
113 | - a list of "rows"
114 | - named columns that can be reordered and removed without updating each row
115 | - homogeneous types within a column (possibly NULL)
116 | - heterogeneous types within a row
117 | 
118 | Every operator consumes 0, 1, or 2 operands and produces exactly 1 operand.
119 | (Some operators have only side-effects and don't actually do anything to the data; these take 1 operand and return the same.)
120 | 
121 | The simplest operator implementation takes 0/1/2 tables and returns a table.
122 | The return table may be one of the unmodified input operands, otherwise it must be a new table.
123 | The rows may be referenced for efficiency.
124 | 
125 | These operators must use the consistent pattern for iterating over the table's dimensions correctly, only "changing" the proper dimension (by default the last dimension).
126 | 
127 | Tables are more complex than simple vectors.
128 | But ideally, an operator could be defined only by its smallest operation, and a decorator(?) would do the consistent iteration.
129 | 
130 | 


--------------------------------------------------------------------------------
/aipl/table.py:
--------------------------------------------------------------------------------
  1 | from typing import Mapping, List
  2 | from copy import copy
  3 | 
  4 | from aipl import AIPLException
  5 | from .utils import fmtargs, fmtkwargs, stderr, strify
  6 | 
# Unique sentinel object, distinguishable from any real value by identity.
# NOTE(review): what it signals is decided by the code that uses it, which is
# not visible in this part of the file.
UNWORKING = object()
# The column name '_'; presumably the name of the "current value" column --
# TODO confirm against its users.
CURRENT_COLNAME='_'
  9 | 
 10 | class Row(dict):
 11 |     pass
 12 | 
 13 | 
 14 | class Column:
 15 |     def __init__(self, key, name=''):
 16 |         self.name = name or key
 17 |         self.key = key
 18 | 
 19 |     @property
 20 |     def hidden(self) -> bool:
 21 |         return self.name.startswith('_')
 22 | 
 23 |     def get_value(self, row:Row):
 24 |         if isinstance(self.key, (list, tuple)):
 25 |             obj = row
 26 |             for k in self.key:
 27 |                 obj = obj.get(k)
 28 |                 if obj is None:
 29 |                     return None
 30 |             return obj
 31 | 
 32 |         return row.get(self.key)
 33 | 
 34 |     def __str__(self):
 35 |         return f'[Column {self.name}]'
 36 | 
 37 |     def __repr__(self):
 38 |         return f"<Column {self.name} {self.key}>"
 39 | 
 40 |     def deepname(self, table):
 41 |         if table.rows:
 42 |             r = self.get_value(table.rows[0])
 43 |             if isinstance(r, Table):
 44 |                 return f'{self.name}:{r.deepcolnames}'
 45 | 
 46 |         return self.name or self.key
 47 | 
 48 | 
 49 | class SubColumn(Column):
 50 |     'Use for tables that have nested rows from other tables in the row dict at *self.key*'
 51 |     def __init__(self, key, origcol):
 52 |         super().__init__(key, origcol.name)
 53 |         self.origcol = origcol
 54 | 
 55 |     def get_value(self, row:dict):
 56 |         return self.origcol.get_value(row[self.key])
 57 | 
 58 | 
 59 | class LazyRow(Mapping):
 60 |     def __init__(self, table:'Table', row:Row):
 61 |         self._row = row
 62 |         self._table = table
 63 | 
 64 |     def __copy__(self):
 65 |         return LazyRow(self._table, self._row)
 66 | 
 67 |     def __len__(self):
 68 |         return len(self._table.columns)
 69 | 
 70 |     def __iter__(self):
 71 |         assert isinstance(self.value, Table), f"can't iterate {type(self.value).__name__}"
 72 |         return iter(self.value)
 73 | 
 74 |     def __getitem__(self, k):
 75 |         obj = self
 76 |         while True:
 77 |             c = obj._table.get_column(k)
 78 |             if c:
 79 |                 return c.get_value(obj._row)
 80 | 
 81 |             obj = obj.parent_row
 82 | 
 83 |             if obj is None:
 84 |                 raise KeyError(k)
 85 | 
 86 |     @property
 87 |     def value(self):
 88 |         return self._table.columns[-1].get_value(self._row)
 89 | 
 90 |     def items(self):
 91 |         return self._asdict().items()
 92 | 
 93 |     def keys(self):
 94 |         return self._asdict().keys()
 95 | 
 96 |     def _asdict(self, named_only=False):
 97 |         'if named_only=False, add current_col as "{CURRENT_COLNAME}" if it is hidden.  otherwise ignore it too'
 98 |         d = {}
 99 | 
100 |         for c in self._table.columns:
101 |             if c.hidden:
102 |                 if named_only or c is not self._table.current_col:
103 |                     continue
104 | 
105 |                 k = CURRENT_COLNAME
106 |             else:
107 |                 k = c.name
108 | 
109 |             v = c.get_value(self._row)
110 | 
111 |             if v is None:
112 |                 continue
113 |             elif isinstance(v, Table):
114 |                 if v.rank == 0:
115 |                     v = v.scalar
116 |                 else:
117 |                     v = [r._asdict() for r in v]
118 |             elif not isinstance(v, (int, float, str)):
119 |                 v = str(v)
120 | 
121 |             if k in d:
122 |                 del d[k]
123 |             d[k] = v
124 | 
125 |         return d
126 | 
127 |     @property
128 |     def parent_row(self) -> 'LazyRow':
129 |         return self._row.get('__parent', None)
130 | 
131 |     def __repr__(self):
132 |         return f"<LazyRow row={self._asdict()} parent={self.parent_row!r}>"
133 | 
134 | 
135 | class Table:
136 |     def __init__(self, rows:List[Mapping|LazyRow]=[], parent:'Table|None'=None):
137 |         self.rows = []  # list of dict
138 |         self.columns = []  # list of Column
139 |         self.parent = parent
140 |         self.scalar = None
141 | 
142 |         if isinstance(rows, (list, tuple)):  # should be sequence-but-not-string
143 |             for row in rows:
144 |                 self.append(row)
145 |         else:
146 |             self.scalar = rows
147 | 
148 |     def __len__(self):
149 |         return len(self.rows)
150 | 
151 |     def __bool__(self):
152 |         return len(self.rows) > 0
153 | 
154 |     def __copy__(self) -> 'Table':
155 |         'Returns structural copy of table with all columns and no rows.'
156 |         ret = Table()
157 | 
158 |         for c in self.columns:
159 |             ret.add_column(copy(c))
160 | 
161 |         ret.rows = []
162 |         ret.scalar = self.scalar
163 |         return ret
164 | 
165 |     def axis(self, rank:int=0):
166 |         if self.rank > rank:
167 |             firstrowval = self.columns[-1].get_value(self.rows[0])
168 |             return firstrowval.axis(rank)
169 | 
170 |         return self
171 | 
172 |     @property
173 |     def values(self):
174 |         if self.scalar is not None:
175 |             return [self.scalar]
176 |         return [r.value for r in self]
177 | 
178 |     @property
179 |     def shape(self) -> List[int]:
180 |         if self.scalar is not None:
181 |             return []
182 |         dims = [len(self.rows)]
183 |         if self.rows:
184 |             if self.columns:
185 |                 firstrowval = self.current_col.get_value(self.rows[0])
186 |                 if isinstance(firstrowval, Table):
187 |                     dims += firstrowval.shape
188 |         return dims
189 | 
190 |     @property
191 |     def rank(self) -> int:
192 |         return len(self.shape)
193 | 
194 |     @property
195 |     def colnames(self):
196 |         return [c.name for c in self.columns]
197 | 
198 |     @property
199 |     def colkeys(self):
200 |         return [c.key for c in self.columns]
201 | 
202 |     @property
203 |     def current_col(self) -> Column:
204 |         return self.columns[-1]
205 | 
206 |     @property
207 |     def deepcolnames(self) -> str:
208 |         return ','.join(f'{c.deepname(self)}' for c in self.columns if not c.hidden or c is self.current_col) or "no cols"
209 | 
210 |     def __getitem__(self, k:int) -> LazyRow:
211 |         if k >= len(self.rows):
212 |             raise IndexError('table index out of range')
213 |         return LazyRow(self, self.rows[k])
214 | 
215 |     def _asdict(self):
216 |         if self.scalar is not None:
217 |             return self.scalar
218 |         return [r._asdict() for r in self]
219 | 
220 |     def __repr__(self):
221 |         if self.scalar is not None:
222 |             return str(self.scalar)
223 | 
224 |         shapestr = 'x'.join(map(str, self.shape))
225 |         contentstr = ''
226 |         if self.rows:
227 |             contentstr += strify(self[0], maxlen=20)
228 |         if len(self.rows) > 1:
229 |             contentstr += ' ...'
230 |         return f'<Table [{shapestr} {self.deepcolnames}] {contentstr}>'
231 | 
232 |     def __iter__(self):
233 |         if self.scalar is not None:
234 |             yield self.scalar
235 |         else:
236 |             for r in self.rows:
237 |                 yield LazyRow(self, r)
238 | 
239 |     def add_new_columns(self, row:Row):
240 |         for k in row.keys():
241 |             if not k.startswith('__'):
242 |                 self.add_column(Column(k))
243 | 
244 |     def add_column(self, col:Column):
245 |         assert not col.name.startswith('__')
246 |         if self.rows:
247 |             assert col.get_value(self.rows[0]) is not UNWORKING
248 |         if col.name in self.colnames:
249 |             return
250 | 
251 |         if col.name.startswith('_cost'):
252 |             self.columns.insert(0, col)
253 |         else:
254 |             self.columns.append(col)
255 | 
256 |     def get_column(self, name:str) -> Column:
257 |         if name == CURRENT_COLNAME:
258 |             return self.columns[-1]
259 | 
260 |         for c in self.columns:
261 |             if c.name == name:
262 |                 return c
263 | 
264 |         return None
265 | 
266 |     def append(self, row:dict):
267 |         if isinstance(row, LazyRow):
268 |             self.rows.append(row._row)
269 |         elif isinstance(row, Mapping):
270 |             self.rows.append(row)
271 |             self.add_new_columns(row)
272 |         else:
273 |             raise TypeError(f"row must be Mapping or LazyRow not {type(row)}")
274 | 


--------------------------------------------------------------------------------
/docs/tutorial.md:
--------------------------------------------------------------------------------
  1 | # AIPL tutorial
  2 | 
  3 | Okay so you heard about this AIPL thing and you're curious to see if the claims hold true.
  4 | Are array languages really as powerful as they say?
  5 | Can you really prototype an AI pipeline (or any data pipeline) in a few minutes?
  6 | 
  7 | Well let's put it to the test.
  8 | For this little experiment, I wanted to see how good GPT is at solving the 8 puzzles from [Hanukkah of Data](https://hanukkah.bluebird.sh/5783).
  9 | Now, I've already tried pasting one of them into the ChatGPT web interface, so I have an idea how this could work.
 10 | (First, if you haven't yet, [install AIPL]()).
 11 | 
 12 |     # read Hanukkah of Data puzzle from the web
 13 |     !read
 14 |     https://hanukkah.bluebird.sh/5783/1
 15 |     !extract-text
 16 |     !print
 17 | 
 18 | Okay so first off, there's no boilerplate here.  An AIPL script is just a list of commands (called "operators" hereafter), in order, one after the other.
 19 | Each of these operates on the input, and generates an output which becomes the next input.
 20 | 
 21 | Here's the toplevel syntax of AIPL:
 22 | 
 23 |     - Lines that start with `#` are comments, and ignored.
 24 |     - Lines that start with `!` are AIPL command lines, which contain one or more operators and their arguments.
 25 |     - All operators start with `!`.
 26 |     - All lines after a command line, if there are any, are the "verbatim input", which is passed verbatim to the operator instead of the previous input.
 27 | 
 28 | So in this case, `!read` is the operator that reads a URL or file into memory, and it's passed the URL of the first puzzle.
 29 | `!extract-text` takes HTML and, um, extracts the text from it.  `!print` prints it to stdout.
 30 | 
 31 | We can now run this script:
 32 | 
 33 |     aipl hod5783.aipl
 34 | 
 35 | and it should work, no questions asked.
 36 | 
 37 | ## inspecting the pipeline
 38 | 
 39 | If you want to see what is happening at each step, you can use `--step rich`:
 40 | 
 41 |     aipl hod5783.aipl --step rich
 42 | 
 43 | And then before every command, it will dump the input table using the [rich]() text library.
 44 | 
 45 | ## going bigger
 46 | 
 47 | Okay, that's pretty cool, but ultimately we're going to want to do this for all 8 puzzles.
 48 | 
 49 |     !split
 50 |     1 2 3 4 5 6 7 8
 51 | 
 52 |     !format
 53 |     https://hanukkah.bluebird.sh/5783/{_}
 54 | 
 55 |     !read
 56 |     !extract-text
 57 |     !print
 58 | 
 59 | The `!split` operator splits its input into a list, just like in Python.
 60 | 
 61 | `!format` formats its input using [Python string formatting](), and can refer to previous elements by their names (discussed later), or the immediate previous input with `_`.
 62 | 
 63 | Now `!read` takes that formatted link (since it has no verbatim input anymore), and then `!extract-text` and `!print` work as before.  If we run it again, we now get the text of all 8 puzzles.
 64 | 
 65 | ## array languages
 66 | 
 67 | So, uh, that was kind of too easy.  What's going on here?  Where's the loop?  Is this even programming, bro?
 68 | 
 69 | Okay, so, the thing about array languages, is that they automatically iterate over their input.  It's called "[loopless programming]()".
 70 | Think of the input as an N-dimensional (jagged) list: a list of elements, each of which may also be a list, etc.
 71 | 
 72 | The scalar operators we've seen so far--which take a scalar value, usually a string, and return a scalar value, also usually a string--loop over the "last" dimension, or the deepest list.
 73 | After a scalar operator is applied, its output has the same structure as the input.
 74 | 
 75 | The `!split` operator, on the other hand, is not a scalar operator.  It takes a string, but it returns a list of strings--so the input grows by one dimension.
 76 | 
 77 | In array-land the number of dimensions of an operand is called its "rank", with rank of 0 meaning "scalar".
 78 | So in our above example, the `!split` takes its verbatim input (a 1-dimensional list of 1 string), and splits
 79 | it into a 2-dimensional list: a list of 1 element, which is a list of 8 strings.
 80 | Every subsequent operator just operates over all the scalar elements in the list of lists.
 81 | 
 82 | ## using GPT
 83 | 
 84 | Okay, so we've extracted the text, now what?  Well, let's see if GPT can solve the puzzle:
 85 | 
 86 |     !format
 87 |     I have a database with 4 tables (field names given inside parens):
 88 | 
 89 |     - customers (customerid,name,address,citystatezip,birthdate,phone)
 90 |     - orders (orderid,customerid,ordered,shipped,items,total)
 91 |     - products (sku,desc,wholesale_cost)
 92 |     - orders_items (orderid,sku,qty,unit_price)
 93 | 
 94 |     Here is a database puzzle to be solved using the above schema:
 95 | 
 96 |     """
 97 |     {_}
 98 |     """
 99 | 
100 |     Give only a SQLite SELECT query to answer the question.
101 | 
102 |     !llm model=gpt-3.5-turbo
103 | 
104 |     !print
105 | 
106 | I wrote a prompt and inserted the extracted text with heavy delimiters, as recommended by the prompt experts. (Who knows if this does anything?  I sure don't.)  But we're clearly asking for a SQL answer from GPT.
107 | 
108 | Note here that AIPL operators can take arguments, both positional and keyword, like in Python.
109 | These arguments don't have to be quoted--only if they have spaces or quotes or a few other characters (which can be escaped like in C or Python as usual).
110 | 
111 | To run this script, you'll need to make sure you have the `OPENAI_API_KEY` and `OPENAI_API_ORG` environment variables set.
112 | 
113 | Okay, so if you run this script, you can see the output that GPT-3.5 gives.  Well that's nice, but what if we want to save it?  What we want is to do this instead of `!print`:
114 | 
115 |     !save hod-{puznum}.sql
116 | 
117 | In addition to `!format` formatting its "verbatim" input, arguments are also automatically formatted.
118 | So where could `puznum` come from?
119 | 
120 | ## context stays available
121 | 
122 | Now here's where AIPL is different than other array languages.
123 | 
124 | All the way at the beginning of the script, we had that `!split` which gave us the list of puzzle numbers.
125 | If we change that to this:
126 | 
127 |     !split>puznum
128 | 
129 | Then the list of puzzle numbers will be named `puznum`, and be carried forward as additional context to future results.
130 | So even though the puzzle number gets converted to a URL, which gets converted to HTML, then to text, etc, the earlier named contexts are still available for use in formatting.
131 | 
132 | You can see this if you view the intermediate outputs with `--step rich`:
133 | 
134 | ______
135 | 
136 | ## expensive operations are cached
137 | 
138 | Note that the second time through, it ran a lot faster!
139 | This is because AIPL automatically caches the results of expensive operations in a sqlite db called `aipl-cache.sqlite`--you can use [VisiData](https://visidata.org) or another tool to inspect it.
140 | Since the LLM prompt hasn't changed, AIPL uses the previously cached output, to save both time and money.
141 | (Most of the time, in development, you are going to be trying things over and over, so this is a great convenience.)
142 | When you want to deliberately not use the cache, you can use the `--no-cache` CLI flag.
143 | 
144 | ## inputs are actually tables
145 | 
146 | You may have already noticed that the operands actually look like whole *tables*, instead of lists.
147 | This is because under the hood, they are tables.
148 | For purposes of looping like an array language, it's the rightmost or most-recently-added column
149 | which is automatically looped into and over.
150 | 
151 | But the other columns are still available, and certain operators besides `!format` can take advantage of them.
152 | Like if we put `!json` before the `!save`:
153 | 
154 |     !json
155 |     !save hod-solutions.json
156 | 
157 | `!json` converts the entire table to one single JSON blob, including the immediate output, and all previously named columns.
158 | 
159 | ## cross-joining
160 | 
161 | Okay, so GPT-3.5 isn't so great at solving the puzzle (at least with the prompt we've given it).
162 | Maybe GPT-4 would do better?
163 | 
164 | In this script, we could manually replace the model, and run it again.  But what if we wanted to run a new prompt with both models and compare the results?  Or on 10 different models?
165 | 
166 | With the `!cross` operator (and one more language feature), we can.  Let's put this at the top of the script:
167 | 
168 |     !split>model>>models
169 |     gpt-3.5-turbo
170 |     gpt-4.0
171 | 
172 | And replace the `!llm` with this:
173 | 
174 |     !cross <<models
175 |     !llm model={model}
176 | 
177 | What's happening here?  `!cross` is our first binary operator: it takes **two** inputs, whereas our previous operators are all unary (only taking one input).  `!cross` returns the cross product of these two inputs. This will result in two subtables, each one with one of the models and the prompt.
178 | 
179 | So we make a new input and use `>>models` to remember the whole table for later.  (`>model` names the column of values, so we can refer to it in arg formatting).
180 | 
181 | Then we go about our usual business constructing this main table.
182 | Just before running `!llm`, we use `!cross` to do the cross-join, and we use `<<models` to pass the second input.
183 | (We can also use `<<foo` to replace the one input of a unary operator; or we could pass the second input of a binary operator using the "verbatim input", which is not really useful for `!cross`.  But it could be useful for other binary operators.)
184 | 
185 | ## what next?
186 | 
187 | These are the basics of AIPL; you may want to learn about other available operators, but otherwise you now know the fundamental structure that everything fits into.
188 | 
189 | - If you want to drop into Python, you can either just use `!python` which executes its input, or you can write an operator; it's pretty easy.
190 |   See [the docs on Operators](docs/operators.md).
191 | 
192 | 
193 | 
194 | 


--------------------------------------------------------------------------------
/README.aipl:
--------------------------------------------------------------------------------
  1 | !!def aipl-ops
  2 |  !python-input>op
  3 |     aipl.operators.values()
  4 |  !format
  5 |     - `!{op.opname}` (in={op.rankin} out={op.rankout})
  6 |        {op.__doc__}
  7 |  !sort
  8 |  !join sep=\n
  9 | 
 10 | !read>summarize_aipl
 11 | examples/summarize.aipl
 12 | 
 13 | !unbox>>summarize_aipl
 14 | 
 15 | !aipl-ops>opdocs
 16 | !unbox>>opdocs
 17 | 
 18 | !sh
 19 | aipl --help
 20 | 
 21 | !format>opdocs
 22 | {opdocs}
 23 | 
 24 | 
 25 | !format
 26 | 
 27 |     # AIPL (Array-Inspired Pipeline Language)
 28 | 
 29 |     [![Tests](https://github.com/saulpw/aipl/workflows/aipl-testing/badge.svg)](https://github.com/saulpw/aipl/actions/workflows/main.yml)
 30 |     [![discord](https://img.shields.io/discord/880915750007750737?label=discord)](https://visidata.org/chat)
 31 | 
 32 |     A tiny DSL to make it easier to explore and experiment with AI pipelines.
 33 | 
 34 |     ## Features
 35 | 
 36 |     - array language semantics (implicit looping over input)
 37 |     - tacit dataflow (output from previous command becomes input to next command)
 38 |     - entire pipeline defined in same file and in execution order for better understanding
 39 |       - including inline prompt templates
 40 |     - persistent cache of expensive operations into a sqlite db
 41 | 
 42 |     ### `summarize.aipl`
 43 | 
 44 |     Here's a prime example, a multi-level summarizer in the "map-reduce" style of langchain:
 45 | 
 46 |     ```
 47 |     {summarize_aipl}
 48 |     ```
 49 | 
 50 |     ## Usage
 51 | 
 52 |     ```
 53 |     {stdout}
 54 |     ```
 55 | 
 56 |     ## Command Syntax
 57 | 
 58 |     This is the basic syntax:
 59 | 
 60 |     - comments start with `#` as the first character of a line, and ignore the whole line.
 61 |     - commands start with `!` as the first character of a line.
 62 |     - everything else is given as the verbatim input to the previous `!` command.
 63 | 
 64 |     Commands can take positional and/or keyword arguments, separated by whitespace.
 65 | 
 66 |     - `!cmd arg1 key=value arg2`
 67 | 
 68 |     Keyword arguments have an `=` between the key and the value, and non-keyword arguments are those without a `=` in them.
 69 | 
 70 |     - `!cmd` will call the Python function registered to the `cmd` operator with the arguments given, as an operator on the current value.
 71 | 
 72 |     - Any text following the command line is dedented (and stripped) and added verbatim as a `prompt=` keyword argument.
 73 |     - Argument values may include Python formatting like `{{input}}` which will be replaced by values from the current row (falling back to parent rows, and ultimately the provided globals).
 74 |     - Prompt values, on the other hand, are not automatically formatted. `!format` goes over every leaf row and returns the formatted prompt as its output.
 75 |     - !literal will set its prompt as the toplevel input, without formatting.
 76 | 
 77 |     The AIPL syntax will continue to evolve and be clarified over time as it's used and developed.
 78 | 
 79 |     Notes:
 80 | 
 81 |     - an AIPL source file documents an entire pipeline from newline-delimited inputs on stdin (or via `!literal`) to the end of the pipeline (often `!print`).
 82 |     - commands always run consecutively and across all inputs.
 83 |     - the initial input is a single string (read from stdin).
 84 | 
 85 |     ## List of operators
 86 | 
 87 |     {opdocs}
 88 | 
 89 | 
 90 |     ## Defining a new operator
 91 | 
 92 |     It's pretty easy to define a new operator that can be used right away.
 93 |     For instance, here's how the `!join` operator might be defined:
 94 | 
 95 |     ```
 96 |     @defop('join', rankin=1, rankout=0)
 97 |     def op_join(aipl:AIPL, v:List[str], sep=' ') -> str:
 98 |         'Concatenate text values with *sep* into a single string.'
 99 |         return sep.join(v)
100 |     ```
101 | 
102 |     - `@defop(...)` registers the decorated function as the named operator.
103 |        - `rankin`/`rankout` indicate what the function takes as input, and what it returns:
104 |          - `0`: a scalar (number or string)
105 |          - `0.5`: a whole row (a mapping of key/value pairs)
106 |          - `1`: a vector of scalar values (e.g. `List[str]` as above)
107 |          - `1.5`: a whole Table (an array of rows)
108 |          - `None`: nothing (the operator is an input "source" if rankin is None; it is a pass-through if rankout is None)
109 |        - `arity` is how many operands it takes (only `0` and `1` supported currently)
110 | 
111 |     The join operator is `rankin=1 rankout=0` which means that it takes a list of strings and outputs a single string.
112 | 
113 |     - Add the `@expensive` decorator to operators that actually go to the network or use an LLM; this will persistently cache the results in a local sqlite database.
114 |        - running the same inputs through a pipeline multiple times won't keep refetching the same data impolitely, and won't run up a large bill during development.
115 | 
116 |     ## Design
117 | 
118 |     AIPL is intended as a simple platform for quick proof of concept data pipelines to be implemented and tested.
119 | 
120 |     ## Why?
121 | 
122 |     The recent developments in LLMs and AI are a whole new level of capabilities (and costs).
123 |     I wanted to see what all the fuss was about, so I tried to do some basic things with [langchain](https://github.com/hwchase17/langchain) but it was this big complicated system.
124 |     So instead I implemented some small workflows myself with raw Python, and it turned out that AI is actually pretty straightforward.
125 |     This is just a small hackable platform that makes it easy to experiment and get small-scale results.
126 |     For now it's called AIPL.
127 | 
128 |     ### Emphasize the Dataflow
129 | 
130 |     An AIPL script represents the essence of a data pipeline, with only the high-level operations and their parameters and prompts.
131 |     No boilerplate or quoting or complicated syntax.
132 |     Not even much of a language--just commands executed in order.
133 |     This keeps the focus on data flow and the high-level operations--the actual links in the chain.
134 |     It can be expanded or optimized or parallelized as needed.
135 | 
136 |     ### Very Little Overhead
137 | 
138 |     AIPL is array-oriented and concatenative, drawing inspiration from APL and Forth, both of which have powerful operators and very simple syntax.
139 |     Passing data implicitly between operators allows for efficient representation of data flows, and avoids [one of the hardest problems in computer science](https://www.namingthings.co/).
140 |     And the implicit looping of array languages makes it easier to scale interactivity.
141 | 
142 |     ### Take Advantage of Python Ecosystem
143 | 
144 |     AIPL is also intended to be practical (if only at small scale), so operators are easy to write using the existing cadre of Python libraries, and allow options and parameters passed to them verbatim.
145 | 
146 |     ### Keep It Simple
147 | 
148 |     The implementation is intentionally homespun, to remove layers of abstraction and reduce the friction of setup and operation.
149 |     It doesn't parallelize anything yet but it still should be able to handle hundreds of items even as it is, enough to prove a concept.
150 |     I expect it to be straightforward to scale it to mag 5 (up to a million items) if something takes off.
151 | 
152 |     ### Learn and Explore
153 | 
154 |     At the very least, AIPL should be a useful tool to learn, explore, and prototype small-scale data pipelines that have expensive operations like API calls and LLM generation.
155 | 
156 | 
157 |     # Architecture
158 | 
159 |     The fundamental data structure is a Table: an array of hashmaps ("rows"), with named Columns that key into each Row to get its value.
160 | 
161 |     A value can be a string or a number or another Table.
162 | 
163 |     The value of a row is the value in the rightmost column of its table.
164 |     The rightmost column of a table is a vector of values representing the whole table.
165 | 
166 |     A simple vector has only strings or numbers.
167 |     A simple table has a simple rightmost value vector and is Rank 1 (a bare scalar is Rank 0).
168 |     Each nesting of tables in the rightmost value vector increases its Rank by 1.
169 | 
170 |     ## operators
171 |     Each operator consumes 0 or 1 or 2 operands (its `arity`), and produces one result, which becomes the operand for the next operator.
172 | 
173 |     Each operator has an "in rank" and an "out rank", which is the rank of the operands they input and output.
174 | 
175 |     By default, each operator is applied across the deepest nested table.
176 |     The result of each operator is then placed in the deepest nested table (or its parent).
177 | 
178 |     ### rankin=0: one scalar at a time
179 | 
180 |     With `rankin=0` and `rankout` of:
181 | 
182 |     - -1: no change (like 'print')
183 |     - 0: scalar operation (like 'translate')
184 |     - 0.5: scalar to simple row (like 'url-split')
185 |     - 1: scalar to simple vector (like 'split-text')
186 |     - 1.5: scalar to table (like 'extract-links')
187 | 
188 |     ### rankin=0.5: consume whole row
189 | 
190 |     With `rankin=0.5`, and `rankout` of:
191 | 
192 |     - -1: no change to row (like 'dbinsert')
193 |     - 0: add a new value to row (like 'pyexpr')
194 |     - 0.5: replace or remove row (like 'filter')
195 |     - 1: transform whole vector (like 'sort' or 'normalize')
196 |     - 1.5: row to table
197 | 
198 |     ### rankin=1: consume the rightmost column
199 | 
200 |     With `rankin=1`, and `rankout` of:
201 | 
202 |     - -1: no change to row (like 'dbinsert')
203 |     - 0: reduce to scalar (like 'join')
204 |     - 0.5: reduce to simple row (like 'stats')
205 |     - 1: transform whole vector (like 'normalize'); or return None to remove column
206 |     - 1.5: vector to table
207 | 
208 |     ### rankin=1.5: consume whole table
209 | 
210 |     With `rankin=1.5`, and `rankout` of:
211 | 
212 |     - -1: no change to table
213 |     - 0: reduce table to scalar
214 |     - 0.5: reduce table to single row (like 'collapse')
215 |     - 1: reduce table to single vector ??
216 |     - 1.5: replace table with returned table (like 'sort')
217 | 
218 |     ## arguments and formatting
219 | 
220 |     In addition to operands, operators also take parameters, both positional and named (`args` and `kwargs` in Python).
221 |     These cannot have spaces, but they can have Python format strings like `{{input}}`.
222 | 
223 |     The identifiers available to Python format strings come from a chain of contexts:
224 | 
225 |     - column names in the current table are replaced with the value in the current row (for rankin=0 or 0.5).
226 |        - from each nested table, in priority from innermost to outermost
227 |     - rows will also defer to their "parent" row if they don't have the column
228 | 
229 |     # Future
230 | 
231 |     ## new operators
232 | 
233 |     - `!dbtable`: use entire table as input
234 |     - `!dbquery`: sql template -> table
235 | 
236 |     ## single-step debugging
237 | 
238 |     - show results of each step in e.g. VisiData
239 |     - output as Pandas dataframe
240 | 
241 |     ## simple website scraping
242 | 
243 |     - recursively apply `!extract-links` and `!fetch-url` to scrape an entire website
244 |       - need operators to remove already-scraped urls and urls outside a particular domain/urlbase
245 | 
246 |     ## License
247 | 
248 |     I don't know yet.
249 | 
250 |     You can use this and play with it, and if you want to do anything more serious with it, please get in touch.
251 |     The [rest](https://bluebird.sh) [of my](https://xd.saul.pw) [work](https://visidata.org) is fiercely open source, but I also appreciate a good capitalist scheme.
252 |     Come chat with me on Discord [saul.pw/chat](https://saul.pw/chat) or Mastodon [@saulpw@fosstodon.org](https://fosstodon.org/@saulpw) and let's jam.
253 | 
254 |     If you want to get updates about what I'm playing with, you can [sign up for my AI mailing list](https://landing.mailerlite.com/webforms/landing/y9b3w8).
255 | 
256 | !print
257 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # AIPL (Array-Inspired Pipeline Language)
  2 | 
  3 | [![Tests](https://github.com/saulpw/aipl/workflows/aipl-testing/badge.svg)](https://github.com/saulpw/aipl/actions/workflows/main.yml)
  4 | [![discord](https://img.shields.io/discord/880915750007750737?label=discord)](https://visidata.org/chat)
  5 | 
  6 | A tiny DSL to make it easier to explore and experiment with AI pipelines.
  7 | 
  8 | ## Features
  9 | 
 10 | - array language semantics (implicit looping over input)
 11 | - tacit dataflow (output from previous command becomes input to next command)
 12 | - entire pipeline defined in same file and in execution order for better understanding
 13 |   - including inline prompt templates
 14 | - persistent cache of expensive operations into a sqlite db
 15 | 
 16 | ### `summarize.aipl`
 17 | 
 18 | Here's a prime example, a multi-level summarizer in the "map-reduce" style of langchain:
 19 | 
 20 | ```
 21 | #!/usr/bin/env bin/aipl
 22 | 
 23 | # fetch url, split webpage into chunks, summarize each chunk, then summarize the summaries.
 24 | 
 25 | # the inputs are urls
 26 | !read
 27 | 
 28 | # extract text from html
 29 | !extract-text
 30 | 
 31 | # split into chunks of lines that can fit in the context window
 32 | !split maxsize=8000 sep=\n
 33 | 
 34 | # have GPT summarize each chunk
 35 | !format
 36 | 
 37 | Please read the following section of a webpage (500-1000 words) and provide a
 38 | concise and precise summary in a few sentences, optimized for keywords and main
 39 | content topics. Write only the summary, and do not include phrases like "the
 40 | article" or "this webpage" or "this section" or "the author". Ensure the tone
 41 | is precise and concise, and provide an overview of the entire section:
 42 | 
 43 | """
 44 | {_}
 45 | """
 46 | 
 47 | !llm model=gpt-3.5-turbo
 48 | 
 49 | # join the section summaries together
 50 | !join sep=\n-
 51 | 
 52 | # have GPT summarize the combined summaries
 53 | 
 54 | !format
 55 | 
 56 | Based on the summaries of each section provided, create a one-paragraph summary
 57 | of approximately 100 words. Begin with a topic sentence that introduces the
 58 | overall content topic, followed by several sentences describing the most
 59 | relevant subsections. Provide an overview of all section summaries and include
 60 | a conclusion or recommendations only if they are present in the original
 61 | webpage. Maintain a precise and concise tone, and make the overview coherent
 62 | and readable, while preserving important keywords and main content topics.
 63 | Remove all unnecessary text like "The document" and "the author".
 64 | 
 65 | """
 66 | {_}
 67 | """
 68 | 
 69 | !llm model=gpt-3.5-turbo
 70 | 
 71 | !print
 72 | 
 73 | ```
 74 | 
 75 | ## Usage
 76 | 
 77 | ```
 78 | usage: aipl [-h] [--debug] [--test] [--interactive] [--step STEP] [--step-breakpoint] [--step-rich] [--step-vd] [--dry-run] [--cache-db CACHEDBFN] [--no-cache]
 79 |             [--output-db OUTDBFN] [--split SEPARATOR]
 80 |             [script_or_global ...]
 81 | 
 82 | AIPL interpreter
 83 | 
 84 | positional arguments:
 85 |   script_or_global      scripts to run, or k=v global parameters
 86 | 
 87 | options:
 88 |   -h, --help            show this help message and exit
 89 |   --debug, -d           abort on exception
 90 |   --test, -t            enable test mode
 91 |   --interactive, -i     interactive REPL
 92 |   --step STEP           call aipl.step_<func>(cmd, input) before each step
 93 |   --step-breakpoint, -x
 94 |                         breakpoint() before each step
 95 |   --step-rich, -v       output rich table before each step
 96 |   --step-vd, --vd       open VisiData with input before each step
 97 |   --dry-run, -n         do not execute @expensive operations
 98 |   --cache-db CACHEDBFN, -c CACHEDBFN
 99 |                         sqlite database for caching operators
100 |   --no-cache            sqlite database for caching operators
101 |   --output-db OUTDBFN, -o OUTDBFN
102 |                         sqlite database accessible to !db operators
103 |   --split SEPARATOR, --separator SEPARATOR, -s SEPARATOR
104 |                         separator to split input on
105 | 
106 | ```
107 | 
108 | ## Command Syntax
109 | 
110 | This is the basic syntax:
111 | 
112 | - comments start with `#` as the first character of a line, and ignore the whole line.
113 | - commands start with `!` as the first character of a line.
114 | - everything else is part of the prompt template for the previous `!` command.
115 | 
116 | Commands can take positional and/or keyword arguments, separated by whitespace.
117 | 
118 | - `!cmd arg1 key=value arg2`
119 | 
120 | Keyword arguments have an `=` between the key and the value, and non-keyword arguments are those without a `=` in them.
121 | 
122 | - `!cmd` will call the Python function registered to the `cmd` operator with the arguments given, as an operator on the current value.
123 | 
124 | - Any text following the command line is dedented (and stripped) and added verbatim as a `prompt=` keyword argument.
125 | - Argument values may include Python formatting like `{input}` which will be replaced by values from the current row (falling back to parent rows, and ultimately the provided globals).
126 | - Prompt values, on the other hand, are not automatically formatted. `!format` goes over every leaf row and returns the formatted prompt as its output.
127 | - `!literal` will set its prompt as the toplevel input, without formatting.
128 | 
129 | The AIPL syntax will continue to evolve and be clarified over time as it's used and developed.
130 | 
131 | Notes:
132 | 
133 | - an AIPL source file documents an entire pipeline from newline-delimited inputs on stdin (or via `!literal`) to the end of the pipeline (often `!print`).
134 | - commands always run consecutively and across all inputs.
135 | - the initial input is a single string (read from stdin).
136 | 
137 | ## List of operators
138 | 
139 | - `!abort` (in=None out=None)
140 |    Abort the current chain.
141 | - `!cluster` (in=1 out=1)
142 |    Cluster rows by embedding into n clusters; add label column.
143 | - `!columns` (in=1.5 out=1.5)
144 |    Create new table containing only these columns.
145 | - `!comment` (in=None out=None)
146 |    Do nothing (ignoring args and prompt).
147 | - `!cross` (in=0.5 out=1.5)
148 |    Construct cross-product of current input with given global table
149 | - `!global` (in=100 out=1.5)
150 |    Save toplevel input into globals.
151 | - `!unbox` (in=1.5 out=1.5)
152 |    None
153 | - `!csv-parse` (in=None out=1.5)
154 |    Converts a .csv into a table of rows.
155 | - `!dbopen` (in=None out=0)
156 |    Open connection to database.
157 | - `!dbquery` (in=0.5 out=1.5)
158 |    Query database table.
159 | - `!dbdrop` (in=None out=None)
160 |    Drop database table.
161 | - `!dbinsert` (in=0.5 out=None)
162 |    Insert each row into database table.
163 | - `!option` (in=None out=None)
164 |    Set option=value.
165 | - `!debug` (in=None out=None)
166 |    set debug flag and call breakpoint() before each command
167 | - `!def` (in=0 out=None)
168 |    Define composite operator from cmds in prompt (must be indented).
169 | - `!extract-text-all` (in=0 out=0)
170 |    Extract all text from HTML
171 | - `!extract-text` (in=0 out=0)
172 |    Extract meaningful text from HTML
173 | - `!extract-links` (in=0 out=1.5)
174 |    Extract (linktext, title, href) from <a> tags in HTML
175 | - `!filter` (in=1.5 out=1.5)
176 |    Return copy of table, keeping only rows whose value is Truthy.
177 | - `!format` (in=0.5 out=0)
178 |    Format prompt text (right operand) as a Python string template, substituting values from row (left operand) and global context.
179 | - `!groupby` (in=1.5 out=1.5)
180 |    Group rows into tables, by set of columns given as args.
181 | - `!require-input` (in=100 out=100)
182 |    Ensure there is any input at all; if not, display the prompt and read input from the user.
183 | - `!join` (in=1 out=0)
184 |    Join inputs with sep into a single output scalar.
185 | - `!json` (in=100 out=0)
186 |    Convert Table into a json blob.
187 | - `!json-parse` (in=0 out=1.5)
188 |    Convert a json blob into a Table.
189 | - `!literal` (in=None out=0)
190 |    Set prompt as top-level input, without formatting.
191 | - `!llm` (in=0 out=0)
192 |    Send chat messages to `model` (default: gpt-3.5-turbo).  Lines beginning with @@@s or @@@a are sent as system or assistant messages respectively (default user).  Passes all named args directly to API.
193 | - `!llm-embedding` (in=0 out=0.5)
194 |    Get a [text embedding](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) for a string from `model`: a measure of text-relatedness, to be used with e.g. !cluster.
195 | - `!match` (in=0 out=0)
196 |    Return a bool with whether value matched regex. Used with !filter.
197 | - `!metrics-accuracy` (in=1.5 out=0)
198 |    None
199 | - `!metrics-precision` (in=1.5 out=0)
200 |    None
201 | - `!metrics-recall` (in=1.5 out=0)
202 |    None
203 | - `!name` (in=1.5 out=1.5)
204 |    Rename current input column to given name.
205 | - `!nop` (in=None out=None)
206 |    No operation.
207 | - `!pdf-extract` (in=0 out=0)
208 |    Extract contents of pdf to value.
209 | - `!print` (in=0 out=None)
210 |    Print to stdout.
211 | - `!python` (in=None out=None)
212 |    exec() Python toplevel statements.
213 | - `!python-expr` (in=0.5 out=0)
214 |    Add columns for Python expressions.
215 | - `!python-input` (in=0 out=1.5)
216 |    eval() Python expression and use as toplevel input table.
217 | - `!ravel` (in=100 out=1.5)
218 |    All of the leaf scalars in the value column become a single 1-D array.
219 | - `!read` (in=0 out=0)
220 |    Return contents of local filename.
221 | - `!read-bytes` (in=0 out=0)
222 |    Return contents of URL or local filename as bytes.
223 | - `!ref` (in=1.5 out=1.5)
224 |    Move column on table to end of columns list (becoming the new .value)
225 | - `!regex-capture` (in=0 out=0.5)
226 |    Capture from prompt regex into named matching groups.
227 | - `!regex-translate` (in=0 out=0)
228 |    Translate input according to regex translation rules in prompt, one per line, with regex and output separated by whitespace:
229 |         Dr\.? Doctor
230 |         Jr\.? Junior
231 |     
232 | - `!replace` (in=0 out=0)
233 |    Replace `find` in all leaf values with `repl`.
234 | - `!sample` (in=1.5 out=1.5)
235 |    Sample n random rows from the input table.
236 | - `!save` (in=0 out=None)
237 |    Save to given filename.
238 | - `!sh` (in=0 out=1.5)
239 |    Run the command described by args.  Return (retcode, stderr, stdout) columns.
240 | - `!shtty` (in=None out=0.5)
241 |    Run the command described by args.  Return (retcode, stderr, stdout) columns.
242 | - `!sort` (in=1.5 out=1.5)
243 |    Sort the table by the given columns.
244 | - `!grade-up` (in=1.5 out=1)
245 |    Assign ranks to unique elements in an array, incrementally increasing each by its corresponding rank value.
246 | - `!split` (in=0 out=1)
247 |    Split text into chunks based on sep, keeping each chunk below maxsize.
248 | - `!split-into` (in=0 out=0.5)
249 |    Split text by sep into the given column names.
250 | - `!take` (in=1.5 out=1.5)
251 |    Return a table with first n rows of `t`
252 | - `!test-input` (in=100 out=1.5)
253 |    In test mode, replace input with prompt.
254 | - `!test-equal` (in=0 out=None)
255 |    In test mode, error if value is not equal to prompt.
256 | - `!test-json` (in=100 out=None)
257 |    Error if value Column is not equal to json blob in prompt.
258 | - `!url-split` (in=0 out=0.5)
259 |    Split url into components (scheme, netloc, path, params, query, fragment).
260 | - `!url-defrag` (in=0 out=0)
261 |    Remove fragment from url.
262 | - `!xml-xpath` (in=0 out=1)
263 |    Return a vector of XMLElements from parsing entries in value.
264 | - `!xml-xpaths` (in=0 out=0.5)
265 |    Return a vector of XMLElements from parsing entries in value; kwargs become column_name=xpath.
266 | - `!aipl-ops` (in=0 out=0)
267 |    None
268 | 
269 | 
270 | ## Defining a new operator
271 | 
272 | It's pretty easy to define a new operator that can be used right away.
273 | For instance, here's how the `!join` operator might be defined:
274 | 
275 | ```
276 | @defop('join', rankin=1, rankout=0)
277 | def op_join(aipl:AIPL, v:List[str], sep=' ') -> str:
278 |     'Concatenate text values with *sep* into a single string.'
279 |     return sep.join(v)
280 | ```
281 | 
282 | - `@defop(...)` registers the decorated function as the named operator.
283 |    - `rankin`/`rankout` indicate what the function takes as input, and what it returns:
284 |      - `0`: a scalar (number or string)
285 |      - `0.5`: a whole row (a mapping of key/value pairs)
286 |      - `1`: a vector of scalar values (e.g. `List[str]` as above)
287 |      - `1.5`: a whole Table (an array of rows)
288 |      - `None`: nothing (the operator is an input "source" if rankin is None; it is a pass-through if rankout is None)
289 |    - `arity` is how many operands it takes (only `0` and `1` supported currently)
290 | 
291 | The join operator is `rankin=1 rankout=0` which means that it takes a list of strings and outputs a single string.
292 | 
293 | - Add the `@expensive` decorator to operators that actually go to the network or use an LLM; this will persistently cache the results in a local sqlite database.
294 |    - running the same inputs through a pipeline multiple times won't keep refetching the same data impolitely, and won't run up a large bill during development.
295 | 
296 | # Architecture
297 | 
298 | The fundamental data structure is a Table: an array of hashmaps ("rows"), with named Columns that key into each Row to get its value.
299 | 
300 | A value can be a string or a number or another Table.
301 | 
302 | The value of a row is the value in the rightmost column of its table.
303 | The rightmost column of a table is a vector of values representing the whole table.
304 | 
305 | A simple vector has only strings or numbers.
306 | A simple table has a simple rightmost value vector and is Rank 0.
307 | Each nesting of tables in the rightmost value vector increases its Rank by 1.
308 | 
309 | ## operators
310 | Each operator consumes 0 or 1 or 2 operands (its `arity`), and produces one result, which becomes the operand for the next operator.
311 | 
312 | Each operator has an "in rank" and an "out rank", which is the rank of the operands they input and output.
313 | 
314 | By default, each operator is applied across the deepest nested table.
315 | The result of each operator is then placed in the deepest nested table (or its parent).
316 | 
317 | ### rankin=0: one scalar at a time
318 | 
319 | With `rankin=0` and `rankout` of:
320 | 
321 | - -1: no change (like 'print')
322 | - 0: scalar operation (like 'translate')
323 | - 0.5: scalar to simple row (like 'url-split')
324 | - 1: scalar to simple vector (like 'split-text')
325 | - 1.5: scalar to table (like 'extract-links')
326 | 
327 | ### rankin=0.5: consume whole row
328 | 
329 | With `rankin=0.5`, and `rankout` of:
330 | 
331 | - -1: no change to row (like 'dbinsert')
332 | - 0: add a new value to row (like 'pyexpr')
333 | - 0.5: replace or remove row (like 'filter')
334 | - 1: transform whole vector (like 'sort' or 'normalize')
335 | - 1.5: row to table
336 | 
337 | ### rankin=1: consume the rightmost column
338 | 
339 | With `rankin=1`, and `rankout` of:
340 | 
341 | - -1: no change to row (like 'dbinsert')
342 | - 0: reduce to scalar (like 'join')
343 | - 0.5: reduce to simple row (like 'stats')
344 | - 1: transform whole vector (like 'normalize'); or return None to remove column
345 | - 1.5: vector to table
346 | 
347 | ### rankin=1.5: consume whole table
348 | 
349 | With `rankin=1.5`, and `rankout` of:
350 | 
351 | - -1: no change to table
352 | - 0: reduce table to scalar
353 | - 0.5: reduce table to single row (like 'collapse')
354 | - 1: reduce table to single vector ??
355 | - 1.5: replace table with returned table (like 'sort')
356 | 
357 | ## arguments and formatting
358 | 
359 | In addition to operands, operators also take parameters, both positional and named (`args` and `kwargs` in Python).
360 | These cannot have spaces, but they can have Python format strings like `{input}`.
361 | 
362 | The identifiers available to Python format strings come from a chain of contexts:
363 | 
364 | - column names in the current table are replaced with the value in the current row (for rankin=0 or 0.5).
365 |    - from each nested table, in priority from innermost to outermost
366 | - rows will also defer to their "parent" row if they don't have the column
367 | 
368 | ## More information
369 | 
370 | Come chat with us on Discord [bluebird.sh/chat](https://bluebird.sh/chat) or Mastodon [@saulpw@fosstodon.org](https://fosstodon.org/@saulpw).
371 | 
372 | If you want to get updates about what I'm playing with, you can [sign up for my AI mailing list](https://landing.mailerlite.com/webforms/landing/y9b3w8).
373 | 
374 | ## License
375 | 
376 | Licensed under MIT.
377 | 
378 | 


--------------------------------------------------------------------------------
/aipl/interpreter.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Mapping, Callable
  2 | from copy import copy
  3 | from dataclasses import dataclass
  4 | from functools import wraps
  5 | from itertools import cycle
  6 | import time
  7 | import inspect
  8 | 
  9 | from aipl import Error, AIPLException, InnerPythonException
 10 | from .table import Table, LazyRow, Column
 11 | from .db import Database
 12 | from .utils import stderr, fmtargs, fmtkwargs, AttrDict
 13 | from .parser import clean_to_id, Command
 14 | from . import parser
 15 | 
 16 | 
 17 | Scalar = int|float|str
 18 | 
 19 | 
 20 | class UserAbort(BaseException):
 21 |     'UserAbort not caught by internal error handling; will always exit.'
 22 | 
 23 | 
 24 | def rank(v):
 25 |     if isinstance(v, LazyRow):
 26 |         return rank(v.value)
 27 |     if isinstance(v, Table):
 28 |         return v.rank
 29 |     else:
 30 |         return 0
 31 | 
 32 | 
 33 | class AIPL:
 34 |     operators = {}  # opname:str -> func(aipl, ..., *args, *kwargs)
 35 |     aliases = {}  # aliasname:str -> builtinopname:str
 36 |     next_unique_key:int = 0
 37 |     cost_usd:float = 0.0
 38 | 
 39 |     def __init__(self, **kwargs):
 40 |         self.tables = {}  # named tables
 41 |         self.globals = dict(  # base context, imports go into here for later use in the whole script
 42 |             aipl=self,
 43 |             defop=defop,
 44 |             stderr=stderr,
 45 |             Table=Table,
 46 |         )
 47 |         self.options = AttrDict(kwargs)
 48 |         self.forced_input = None  # via !test-input
 49 |         self.output_db = Database(self.options.outdbfn)
 50 |         self.cache_db = None
 51 |         if self.options.cachedbfn:
 52 |             self.cache_db = Database(self.options.cachedbfn)
 53 | 
 54 | 
 55 |     @property
 56 |     def unique_key(self) -> str:
 57 |         r = self.next_unique_key
 58 |         self.next_unique_key += 1
 59 |         return f'_{r}'
 60 | 
 61 |     def step_breakpoint(self, cmd:Command, *inputs:List[Table]):
 62 |         breakpoint()
 63 | 
 64 |     def get_op(self, opname:str):
 65 |         while opname in self.aliases:
 66 |             opname = self.aliases[opname].opname
 67 | 
 68 |         return self.operators.get(opname, None)
 69 | 
 70 |     def parse(self, source:str) -> List[Command]:
 71 |         'Generate list of Commands from source text'
 72 | 
 73 |         ast = parser.parse(source)
 74 | 
 75 |         commands = []
 76 |         for command in ast:
 77 |             command.op = self.get_op(command.opname)
 78 | 
 79 |             if not command.op:
 80 |                 raise AIPLException(
 81 |                     f'[line {command.linenum}] no such operator "!{command.opname}"')
 82 | 
 83 |             if command.immediate:
 84 |                 result = self.run_cmdlist([command], [])
 85 |                 if isinstance(result, Error):
 86 |                     if isinstance(result.exception, InnerPythonException):
 87 |                         result.exception.command = command
 88 |                     raise result.exception
 89 | 
 90 |                 if command.varnames:
 91 |                     last_variable = command.varnames[-1]
 92 |                     self.globals[last_variable] = result
 93 |                     stderr(f'(global) {last_variable} = result of {command.line}')
 94 |             else:
 95 |                 commands.append(command)
 96 |         return commands
 97 | 
 98 |     def new_input(self, *inputlines):
 99 |         argkey = self.unique_key
100 |         return Table([{argkey:line} for line in inputlines])
101 | 
102 |     def run_test(self, script:str, *inputlines):
103 |         inputs = [self.new_input(*inputlines)]
104 |         return self.run(script, inputs)[-1]
105 | 
106 |     def run(self, script:str, inputs:list[Table]=None):
107 |         # lines before first cmdline are Python, to be executed immediately.
108 |         # also add nop at end to do final single-steps.
109 |         cmds = self.parse('!!python\n' + script + '\n!nop')
110 | 
111 |         return self.run_cmdlist(cmds, inputs)
112 | 
113 |     def pre_command(self, cmd:Command, t:Table=Table(), *args):
114 |         stderr(t, str(cmd))
115 | 
116 |     def run_cmdlist(self, cmds:List[Command], inputs:List[Table]):
117 |         for cmd in cmds:
118 |             if self.forced_input is not None:
119 |                 inputs.append(self.forced_input)
120 |                 self.forced_input = None
121 | 
122 |             input_tables = [self.tables[arg] for arg in cmd.input_tables]
123 | 
124 |             operands = [inputs[-1]] if inputs else []
125 |             if cmd.prompt is not None:
126 |                 input_tables.append(Table(cmd.prompt))
127 | 
128 |             if input_tables:
129 |                 operands[cmd.op.arity-len(input_tables):] = input_tables
130 | 
131 |             for input_col_name in cmd.input_cols:
132 |                 t = operands[-1]
133 |                 col = t.get_column(input_col_name)
134 |                 if col not in t.columns:
135 |                     raise AIPLException(f'no such column {input_col_name!r}')
136 |                 t.columns.remove(col)
137 |                 t.add_column(col)
138 | 
139 |             self.pre_command(cmd, *operands)
140 | 
141 |             if self.options.step:
142 |                 for stepfuncname in self.options.step.split(','):
143 |                     stepfunc = getattr(self, 'step_'+stepfuncname, None)
144 |                     if stepfunc:
145 |                         stepfunc(cmd, *operands)
146 |                     else:
147 |                         stderr(f'no aipl.step_{stepfuncname}!')
148 | 
149 |             try:
150 |                 annotated_result = self.eval_op(cmd, *operands, contexts=[self.globals, self.tables])
151 |                 result = annotated_result['result']
152 |                 if cmd.op.rankout is None:
153 |                     continue # just keep former inputs
154 |                 elif isinstance(result, Table):
155 |                     inputs = [result]
156 |                 else:
157 |                     k = cmd.varnames[-1] if cmd.varnames else self.unique_key
158 |                     inputs = [Table([{k:result}])]
159 | 
160 |                 for g in cmd.globals:
161 |                     self.tables[g] = inputs[-1]
162 | 
163 |             except AIPLException as e:
164 |                 raise AIPLException(f'AIPL Error (line {cmd.linenum} !{cmd.opname}): {e}') from e
165 |             except Exception as e:
166 |                 raise Exception(f'AIPL Error (line {cmd.linenum} !{cmd.opname}): {e}') from e
167 | 
168 |         for result in inputs:
169 |             if isinstance(result, Error):
170 |                 if isinstance(result.exception, InnerPythonException):
171 |                     result.exception.command = command
172 |                 raise result.exception
173 | 
174 |         return inputs
175 | 
176 |     def call_cmd(self, cmd:Command, contexts:List[Mapping], *inputs, newkey=''):
177 |         operands = [prep_input(arg, rank)
178 |                       for arg,rank in zip(inputs,
179 |                                           [cmd.op.rankin, cmd.op.rankin2])
180 |                    ]
181 |         args = fmtargs(cmd.args, contexts)
182 |         kwargs = fmtkwargs(cmd.kwargs, contexts)
183 | 
184 | 
185 |         try:
186 |             if self.options.step and 'break' in self.options.step.split(','):
187 |                 breakpoint()
188 |             start_t = time.time()
189 |             ret = cmd.op(self, *operands, *args, **kwargs)
190 |         except Exception as e:
191 |             if self.options.debug or self.options.test:
192 |                 raise
193 |             return Error(cmd.linenum, cmd.opname, e)
194 | 
195 |         end_t = time.time()
196 | 
197 |         if cmd.op.rankout is not None and cmd.varnames:
198 |             varname = cmd.varnames[-1]
199 |         else:
200 |             varname = newkey or self.unique_key
201 | 
202 |         result = prep_output(self,
203 |                            inputs[0] if inputs else None,
204 |                            ret,
205 |                            cmd.op.rankout,
206 |                            cmd.op.outcols.split(),
207 |                            varname)
208 | 
209 |         annotated_ret = dict(result=result, cost_usd=self.cost_usd, cost_ms=int((end_t-start_t)*1000))
210 |         self.cost_usd = 0
211 |         return annotated_ret
212 | 
213 |     def eval_op(self, cmd:Command, *operands:List[Table|LazyRow], contexts=[], newkey='') -> dict:
214 |         'Recursively evaluate cmd.op(t) with cmd args formatted with contexts.  Return dict(result:Table, cost_usd:float, cost_ms:int)'
215 | 
216 |         if cmd.op.arity == 0:
217 |             return self.call_cmd(cmd, contexts, newkey=newkey)
218 | 
219 |         else:
220 |             if len(operands) < cmd.op.arity:
221 |                 operands = list(operands) + [Table() for i in range(cmd.op.arity-len(operands))]
222 | 
223 |             t = operands[0]
224 |             if rank(t) <= cmd.op.rankin:
225 |                 return self.call_cmd(cmd, contexts, *operands, newkey=newkey)
226 | 
227 |             if isinstance(t, Table):
228 |                 ret = copy(t)
229 |             else:
230 |                 ret = copy(t.value)
231 | 
232 |             # !op>var1>var2 names the deepest column "var2" and the column one-level up (for rankout==1) "var1"
233 |             if cmd.op.rankout is not None and len(cmd.varnames) > cmd.op.rankout and rank(t) == int(cmd.op.rankin+1):
234 |                 newkey = cmd.varnames[0] or self.unique_key
235 |             else:
236 |                 newkey = newkey or self.unique_key
237 | 
238 |             start_t = time.time()
239 |             cost_usd = 0
240 |             for row in t:
241 |                 annotated_x = self.eval_op(cmd, row, *operands[1:], contexts=contexts+[row], newkey=newkey)
242 |                 x = annotated_x['result']
243 | 
244 |                 if x is None:
245 |                     continue
246 | 
247 |                 subresult = update_dict(row._row, x, newkey)
248 |                 cost_usd += annotated_x['cost_usd']
249 |                 subresult.setdefault('_costs', Table()).append(dict(usd=annotated_x['cost_usd'], ms=annotated_x['cost_ms']))
250 |                 ret.rows.append(subresult)
251 | 
252 |                 ret.add_column(Column('_costs'))
253 | 
254 |                 if isinstance(x, Mapping):
255 |                     for k in x.keys():
256 |                         ret.add_column(Column(k, k))
257 |                 else:
258 |                     ret.add_column(Column(newkey))
259 | 
260 |             end_t = time.time()
261 | 
262 |             return dict(result=ret, cost_usd=cost_usd, cost_ms=int((end_t-start_t)*1000))
263 | 
264 | 
def update_dict(d:dict, elem, key:str='') -> dict:
    'Merge elem into d (in place) when elem is a dict; otherwise store it as d[key].  Return the same d.'
    if not isinstance(elem, dict):
        d[key] = elem
        return d

    d.update(elem)
    return d
272 | 
273 | 
def prep_input(operand:LazyRow|Table|Error, rankin:int|float) -> Scalar|List[Scalar]|Table|LazyRow:
    '''Convert *operand* into the form an operator with input rank *rankin* receives.

    An Error operand is passed through unchanged; rankin=None yields None.
    Otherwise: 0 gives a scalar, 0.5 the row itself, 1 the list of values,
    and 1.5 or more the table.  Raises Exception for any other rank.
    '''
    if isinstance(operand, Error):
        return operand
    if rankin is None:
        return None

    is_row = isinstance(operand, LazyRow)
    is_table = isinstance(operand, Table)

    if rankin == 0:
        if is_table and operand.rank == 0:
            return operand.scalar
        if is_row:
            return operand.value
        assert False, type(operand)

    if rankin == 0.5:
        assert isinstance(operand, LazyRow)
        return operand

    if rankin == 1:
        # a row contributes the vector held in its value; a table is the vector itself
        if is_row:
            vector = operand.value
        elif is_table:
            vector = operand
        else:
            return None
        assert vector.rank == 1
        return vector.values

    if rankin >= 1.5:
        if is_row:
            return operand.value
        if is_table:
            return operand
        return None

    raise Exception("Unexpected rankin")
305 | 
def ziplift(a:Table, b:Table):
    'Zip `a` with `b`, repeating the shorter one cyclically so that it lasts as long as the longer one.'
    left, right = iter(a), iter(b)
    size_a, size_b = len(a), len(b)

    if size_a > size_b:
        right = cycle(right)
    elif size_b > size_a:
        left = cycle(left)

    return zip(left, right)
316 | 
def prep_output(aipl,
                in_row:LazyRow|Table,
                out:Scalar|List[Scalar]|LazyRow|Table,
                rankout:int|float,
                outcols:List[str],
                varname:str) -> Scalar|List[Scalar]|Table|LazyRow:
    '''Convert the raw return value *out* of an operator into its result form.

    in_row: the operand the operator was applied to (LazyRow, Table, or None
        for an operator that took no operand); used to link output rows back
        to their parent.
    rankout: None returns None; 0 and 0.5 return *out* unchanged; 1 wraps the
        values of *out* into a one-column Table named *varname*; 1.5 or more
        returns *out* if it is already a Table, else builds a Table from its
        items (dict items become rows, other items go under *varname*).
    outcols: explicit column names for a built table; when empty, the keys of
        the dict items are used, in first-seen order.

    Raises Exception for an unrecognised rankout.
    '''

    if rankout is None:
        return None

    if rankout == 0:
        assert not isinstance(out, (Table, LazyRow, dict))
        return out

    elif rankout == 0.5:
        return out

    elif rankout == 1:
        ret = Table()
        if in_row is None:
            # no operand to link back to (same treatment as the rankout>=1.5 branch)
            ret.rows = [{varname:v} for v in out]
        elif isinstance(in_row, LazyRow):
            ret.rows = [{'__parent': in_row, varname:v} for v in out]
        elif isinstance(in_row, Table):
            out = list(out)  # ziplift needs len()
            ret.rows = [{'__parent': parent_row, varname:v} for parent_row, v in ziplift(in_row, out)]
        else:
            assert False, 'unknown type for in_row'
        ret.add_column(Column(varname))
        return ret

    elif rankout >= 1.5:
        if isinstance(out, Table):
            return out

        if in_row is None or isinstance(in_row, Table):
            parent_table = None
            parent_row = None
        elif isinstance(in_row, LazyRow):
            parent_table = in_row._table
            parent_row = in_row
        else:
            raise Exception(f'unknown type for in_row: {type(in_row)}')

        rows = []
        # dict used as an insertion-ordered set: iterating a set of str keys
        # would give a column order that changes from run to run.
        all_keys = {}
        for v in out:
            new_row = {'__parent': parent_row} if parent_row is not None else {}
            if isinstance(v, dict):
                all_keys.update(dict.fromkeys(v))
                new_row.update(v)
            else:
                new_row[varname] = v
            rows.append(new_row)

        ret = Table(rows, parent=parent_table)
        # explicit outcols win; otherwise fall back to the keys seen in dict items
        for k in (outcols or all_keys):
            ret.add_column(Column(k))

        return ret

    else:
        raise Exception("Unexpected rankout")
387 | 
388 | 
# Mnemonic names accepted wherever a rank is expected, mapped to the
# numeric (or None) rank they stand for.
ranktypes = {
    'none': None,
    'all': 100,
    'scalar': 0,
    'row': 0.5,
    'vector': 1,
    'table': 1.5,
}
397 | 
def defop(operation:str|Callable|None=None,
          rankin:None|int|float|str=0,
          rankout:None|int|float|str=0,
          *,
          rankin2:None|int|float|str=None,
          outcols:str='',
          preprompt=lambda x: x,
          opname:str|None=None):
    '''
    Define a new operator and register it in AIPL.operators.

    Can be used as a decorator:

    @defop('op_name', rankin='vector')
    def myop(...):

    Or just as a function:
    defop(function, rankout='vector')
    defop(function, opname='alternative_name')

    aipl will be passed to the function if the first argument is called
    'aipl'.

    Parameters:
      operation: the operator name (decorator form), or the function to
                 register directly.
      rankin, rankout, rankin2: a rank value, or one of the string mnemonics
                 in `ranktypes` ('none', 'all', 'scalar', 'row', 'vector',
                 'table').  Values not found in `ranktypes` are used as given.
      outcols, preprompt: stored on the Operator as given.
      opname:    explicit name to register under; takes precedence over
                 `operation` and the function's own name.

    The registered name is `opname`, else `operation` if it is a string,
    else the function's __name__ (or str(f)), passed through clean_to_id.

    Returns the function unchanged when `operation` is callable; otherwise
    returns a decorator that registers the function and returns it unchanged.
    '''
    # replace string mnemonic with 'actual' rank
    rankin = ranktypes.get(rankin, rankin)
    rankout = ranktypes.get(rankout, rankout)
    rankin2 = ranktypes.get(rankin2, rankin2)

    # arity implied by the *resolved* input ranks, so that the 'none'
    # mnemonic counts the same as a literal None.
    if rankin is None:
        arity = 0
    elif rankin2 is None:
        arity = 1
    else:
        arity = 2

    def _decorator(f):
        if opname:
            name = opname
        elif isinstance(operation, str):
            name = operation
        else:
            name = getattr(f, '__name__', None) or str(f)
        name = clean_to_id(name)
        AIPL.operators[name] = Operator(
            rankin = rankin,
            rankout = rankout,
            rankin2 = rankin2,
            arity = arity,
            outcols = outcols,
            # NOTE(review): this stores the `opname` argument as given (None
            # unless passed explicitly), not the registered `name` -- confirm
            # that is intended.
            opname = opname,
            preprompt = preprompt,
            func = f)
        return f

    if callable(operation):
        return _decorator(operation)
    else:
        return _decorator
457 | 
458 | @dataclass
459 | class Operator:
460 |     rankin: int
461 |     rankout: int
462 |     rankin2: int|None
463 |     arity: int
464 |     outcols: str
465 |     opname: str
466 |     preprompt: Callable
467 |     func: Callable
468 | 
469 |     def __call__(self, aipl, *args, **kwargs):
470 |         if self._needs_aipl:
471 |             r = self.func(aipl, *args, **kwargs)
472 |         else:
473 |             r = self.func(*args, **kwargs)
474 | 
475 |         return r
476 | 
477 |     @property
478 |     def needs_prompt(self):
479 |         try:
480 |             return 'prompt' in inspect.signature(self.func).parameters
481 |         except ValueError:
482 |             return False
483 | 
484 |     @property
485 |     def _needs_aipl(self):
486 |         try:
487 |             return list(inspect.signature(self.func).parameters)[0] == 'aipl'
488 |         except ValueError:
489 |             return False
490 | 
491 | 
def alias(alias_name:str, builtin_name:str, dialect:str=''):
    '''
    Create an alias `alias_name` for the op `builtin_name`.

    The alias is stored in AIPL.aliases under clean_to_id(alias_name), with
    the target op name and `dialect` recorded alongside it.

    Raises AssertionError if the (normalized) alias already exists, and
    AIPLException if `builtin_name` is not a key of AIPL.operators.
    '''
    # Normalize once and use the same key for both the duplicate check and
    # the store; checking the raw name would miss duplicates whose spelling
    # changes under clean_to_id.
    alias_id = clean_to_id(alias_name)
    assert alias_id not in AIPL.aliases, f'alias {alias_id} is already defined'
    if builtin_name not in AIPL.operators:
        raise AIPLException(f"{builtin_name} is not a valid operator for alias")
    AIPL.aliases[alias_id] = AttrDict(opname=builtin_name, dialect=dialect)
498 | 


--------------------------------------------------------------------------------