├── .github └── workflows │ ├── deploy.yaml │ └── test.yaml.off ├── .gitignore ├── 00_xml.ipynb ├── 01_funccall.ipynb ├── 02_shell.ipynb ├── 03_download.ipynb ├── CHANGELOG.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── _quarto.yml ├── index.ipynb ├── nbdev.yml ├── pyproject.toml ├── samples ├── sample_core.py └── sample_styles.css ├── settings.ini ├── setup.py ├── styles.css └── toolslm ├── __init__.py ├── _modidx.py ├── download.py ├── funccall.py ├── md_hier.py ├── shell.py └── xml.py /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | permissions: 4 | contents: write 5 | pages: write 6 | 7 | on: 8 | push: 9 | branches: [ "main", "master" ] 10 | workflow_dispatch: 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | steps: [uses: fastai/workflows/quarto-ghp@master] 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml.off: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [workflow_dispatch, pull_request, push] 3 | 4 | jobs: 5 | test: 6 | runs-on: ubuntu-latest 7 | steps: [uses: fastai/workflows/nbdev-ci@master] 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .gitattributes 2 | _proc/ 3 | index_files/ 4 | sidebar.yml 5 | Gemfile.lock 6 | token 7 | _docs/ 8 | conda/ 9 | .last_checked 10 | .gitconfig 11 | *.bak 12 | *.log 13 | *~ 14 | ~* 15 | _tmp* 16 | tmp* 17 | tags 18 | 19 | # Byte-compiled / optimized / DLL files 20 | __pycache__/ 21 | *.py[cod] 22 | *$py.class 23 | 24 | # C extensions 25 | *.so 26 | 27 | # Distribution / packaging 28 | .Python 29 | env/ 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | 
*.egg-info/ 43 | .installed.cfg 44 | *.egg 45 | 46 | # PyInstaller 47 | # Usually these files are written by a python script from a template 48 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 49 | *.manifest 50 | *.spec 51 | 52 | # Installer logs 53 | pip-log.txt 54 | pip-delete-this-directory.txt 55 | 56 | # Unit test / coverage reports 57 | htmlcov/ 58 | .tox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | .hypothesis/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # dotenv 101 | .env 102 | 103 | # virtualenv 104 | .venv 105 | venv/ 106 | ENV/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | 121 | .vscode 122 | *.swp 123 | 124 | # osx generated files 125 | .DS_Store 126 | .DS_Store? 
127 | .Trashes 128 | ehthumbs.db 129 | Thumbs.db 130 | .idea 131 | 132 | # pytest 133 | .pytest_cache 134 | 135 | # tools/trust-doc-nbs 136 | docs_src/.last_checked 137 | 138 | # symlinks to fastai 139 | docs_src/fastai 140 | tools/fastai 141 | 142 | # link checker 143 | checklink/cookies.txt 144 | 145 | # .gitconfig is now autogenerated 146 | .gitconfig 147 | 148 | _docs 149 | 150 | /.quarto/ 151 | -------------------------------------------------------------------------------- /00_xml.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "efe78920", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#|default_exp xml" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "3d773712-12fe-440e-891f-36f59666dfde", 16 | "metadata": {}, 17 | "source": [ 18 | "# xml source" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "ff6f6471-8061-4fdd-85a1-25fdc27c5cf3", 24 | "metadata": {}, 25 | "source": [ 26 | "## Setup" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "033c76fd", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "#| export\n", 37 | "import hashlib,xml.etree.ElementTree as ET\n", 38 | "from collections import namedtuple\n", 39 | "\n", 40 | "from fastcore.utils import *\n", 41 | "from fastcore.meta import delegates\n", 42 | "from fastcore.xtras import hl_md\n", 43 | "from fastcore.xml import to_xml, Document, Documents, Document_content, Src\n", 44 | "from fastcore.script import call_parse\n", 45 | "try: from IPython import display\n", 46 | "except: display=None" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "2795f9fc", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "#| exports\n", 57 | "def json_to_xml(d:dict, # JSON dictionary to convert\n", 58 | " rnm:str # Root name\n", 59 | " )->str:\n", 60 
| " \"Convert `d` to XML.\"\n", 61 | " root = ET.Element(rnm)\n", 62 | " def build_xml(data, parent):\n", 63 | " if isinstance(data, dict):\n", 64 | " for key, value in data.items(): build_xml(value, ET.SubElement(parent, key))\n", 65 | " elif isinstance(data, list):\n", 66 | " for item in data: build_xml(item, ET.SubElement(parent, 'item'))\n", 67 | " else: parent.text = str(data)\n", 68 | " build_xml(d, root)\n", 69 | " ET.indent(root)\n", 70 | " return ET.tostring(root, encoding='unicode')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "id": "140a35a2", 76 | "metadata": {}, 77 | "source": [ 78 | "JSON doesn't map as nicely to XML as the data structure used in `fastcore.xml`, but for simple XML trees it can be convenient -- for example:" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "005a5be4", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/markdown": [ 90 | "```xml\n", 91 | "\n", 92 | " Howard\n", 93 | " \n", 94 | " Jeremy\n", 95 | " Peter\n", 96 | " \n", 97 | "
\n", 98 | " Queensland\n", 99 | " Australia\n", 100 | "
\n", 101 | "
\n", 102 | "```" 103 | ], 104 | "text/plain": [ 105 | "" 106 | ] 107 | }, 108 | "execution_count": null, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "a = dict(surname='Howard', firstnames=['Jeremy','Peter'],\n", 115 | " address=dict(state='Queensland',country='Australia'))\n", 116 | "hl_md(json_to_xml(a, 'person'))" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "7788c48c", 122 | "metadata": {}, 123 | "source": [ 124 | "## Including documents" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "479be4c9", 130 | "metadata": {}, 131 | "source": [ 132 | "According [to Anthropic](https://docs.anthropic.com/claude/docs/long-context-window-tips), \"*it's essential to structure your prompts in a way that clearly separates the input data from the instructions*\". They recommend using something like the following:\n", 133 | "\n", 134 | "```xml\n", 135 | "Here are some documents for you to reference for your task:\n", 136 | " \n", 137 | "\n", 138 | "\n", 139 | "\n", 140 | "(URL, file name, hash, etc)\n", 141 | "\n", 142 | "\n", 143 | "(the text content)\n", 144 | "\n", 145 | "\n", 146 | "\n", 147 | "```\n", 148 | "\n", 149 | "We will create some small helper functions to make it easier to generate context in this format, although we're use `` instead of `` to avoid conflict with that HTML tag. Although it's based on Anthropic's recommendation, it's likely to work well with other models too." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "a01dc320", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "#| exports\n", 160 | "doctype = namedtuple('doctype', ['src', 'content'])" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "6620a123", 166 | "metadata": {}, 167 | "source": [ 168 | "We'll use `doctype` to store our pairs." 
169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "ce853491", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "#| exports\n", 179 | "def _add_nls(s):\n", 180 | " \"Add newlines to start and end of `s` if missing\"\n", 181 | " if not s: return s\n", 182 | " if s[ 0]!='\\n': s = '\\n'+s\n", 183 | " if s[-1]!='\\n': s = s+'\\n'\n", 184 | " return s" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "id": "026d3b06", 190 | "metadata": {}, 191 | "source": [ 192 | "Since Anthropic's example shows newlines before and after each tag, we'll do the same." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "id": "26fddbc3", 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "'a'" 205 | ] 206 | }, 207 | "execution_count": null, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "to_xml(Src('a'))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "id": "1bac81ce", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "'a'" 226 | ] 227 | }, 228 | "execution_count": null, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "to_xml(Document('a'))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "40a7e0ba", 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "'a'" 247 | ] 248 | }, 249 | "execution_count": null, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "to_xml(Documents('a'))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "id": "932e8858", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "#| exports\n", 266 | "def mk_doctype(content:str, # The 
document content\n", 267 | " src:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided\n", 268 | " ) -> namedtuple:\n", 269 | " \"Create a `doctype` named tuple\"\n", 270 | " if src is None: src = hashlib.md5(content.encode()).hexdigest()[:8]\n", 271 | " return doctype(_add_nls(str(src).strip()), _add_nls(content.strip()))" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "id": "8800921b", 277 | "metadata": {}, 278 | "source": [ 279 | "This is a convenience wrapper to ensure that a `doctype` has the needed information in the right format." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "14f9e185", 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "doctype(src='\\n47e19350\\n', content='\\nThis is a \"sample\"\\n')" 292 | ] 293 | }, 294 | "execution_count": null, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "doc = 'This is a \"sample\"'\n", 301 | "mk_doctype(doc)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "15e454db", 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "#| exports\n", 312 | "def mk_doc(index:int, # The document index\n", 313 | " content:str, # The document content\n", 314 | " src:Optional[str]=None, # URL, filename, etc; defaults to `md5(content)` if not provided\n", 315 | " **kwargs\n", 316 | " ) -> tuple:\n", 317 | " \"Create an `ft` format tuple for a single doc in Anthropic's recommended format\"\n", 318 | " dt = mk_doctype(content, src)\n", 319 | " content = Document_content(NotStr(dt.content))\n", 320 | " src = Src(NotStr(dt.src))\n", 321 | " return Document(src, content, index=index, **kwargs)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "id": "a8b6ac26", 327 | "metadata": {}, 328 | "source": [ 329 | "We can now generate XML for one document in the suggested format:" 330 | 
] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "e7ed5a9a", 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/markdown": [ 341 | "```html\n", 342 | "\n", 343 | "47e19350\n", 344 | "\n", 345 | "This is a \"sample\"\n", 346 | "\n", 347 | "```" 348 | ], 349 | "text/plain": [ 350 | "document((src(('\\n47e19350\\n',),{}), document-content(('\\nThis is a \"sample\"\\n',),{})),{'index': 1, 'title': 'test'})" 351 | ] 352 | }, 353 | "execution_count": null, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "mk_doc(1, doc, title=\"test\")" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "id": "ba5ebfab", 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "#| exports\n", 370 | "def docs_xml(docs:list[str], # The content of each document\n", 371 | " srcs:Optional[list]=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided\n", 372 | " prefix:bool=True, # Include Anthropic's suggested prose intro?\n", 373 | " details:Optional[list]=None # Optional list of dicts with additional attrs for each doc\n", 374 | " )->str:\n", 375 | " \"Create an XML string containing `docs` in Anthropic's recommended format\"\n", 376 | " pre = 'Here are some documents for you to reference for your task:\\n\\n' if prefix else ''\n", 377 | " if srcs is None: srcs = [None]*len(docs)\n", 378 | " if details is None: details = [{}]*len(docs)\n", 379 | " docs = (mk_doc(i+1, d, s, **kw) for i,(d,s,kw) in enumerate(zip(docs,srcs,details)))\n", 380 | " return pre + to_xml(Documents(docs))" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "id": "85004124", 386 | "metadata": {}, 387 | "source": [ 388 | "Putting it all together, we have our final XML format:" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "1dac60f6", 395 | "metadata": {}, 396 | "outputs": 
[ 397 | { 398 | "name": "stdout", 399 | "output_type": "stream", 400 | "text": [ 401 | "Here are some documents for you to reference for your task:\n", 402 | "\n", 403 | "\n", 404 | "47e19350\n", 405 | "\n", 406 | "This is a \"sample\"\n", 407 | "\n", 408 | "doc.txt\n", 409 | "\n", 410 | "And another one\n", 411 | "\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "docs = [doc, 'And another one']\n", 417 | "srcs = [None, 'doc.txt']\n", 418 | "print(docs_xml(docs, srcs))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "id": "2a8a7a9a", 424 | "metadata": {}, 425 | "source": [ 426 | "## Context creation" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "id": "cd06b2dc", 432 | "metadata": {}, 433 | "source": [ 434 | "Now that we can generate Anthropic's XML format, let's make it easy for a few common cases." 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "id": "65317fc6", 440 | "metadata": {}, 441 | "source": [ 442 | "### File list to context" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "id": "3778e8ed", 448 | "metadata": {}, 449 | "source": [ 450 | "For generating XML context from files, we'll just read them as text and use the file names as `src`." 
451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "id": "0a168636", 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "#| exports\n", 461 | "def files2ctx(\n", 462 | " fnames:list[Union[str,Path]], # List of file names to add to context\n", 463 | " prefix:bool=True # Include Anthropic's suggested prose intro?\n", 464 | ")->str: # XML for LM context\n", 465 | " fnames = [Path(o) for o in fnames]\n", 466 | " contents = [o.read_text() for o in fnames]\n", 467 | " return docs_xml(contents, fnames, prefix=prefix)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "id": "1bf73d36", 474 | "metadata": {}, 475 | "outputs": [ 476 | { 477 | "data": { 478 | "text/markdown": [ 479 | "```xml\n", 480 | "Here are some documents for you to reference for your task:\n", 481 | "\n", 482 | "\n", 483 | "samples/sample_core.py\n", 484 | "\n", 485 | "import inspect\n", 486 | "empty = inspect.Parameter.empty\n", 487 | "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", 488 | "\n", 489 | "samples/sample_styles.css\n", 490 | "\n", 491 | ".cell { margin-bottom: 1rem; }\n", 492 | ".cell > .sourceCode { margin-bottom: 0; }\n", 493 | ".cell-output > pre { margin-bottom: 0; }\n", 494 | "\n", 495 | "```" 496 | ], 497 | "text/plain": [ 498 | "" 499 | ] 500 | }, 501 | "execution_count": null, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "fnames = ['samples/sample_core.py', 'samples/sample_styles.css']\n", 508 | "hl_md(files2ctx(fnames))" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "id": "191ddb2b", 514 | "metadata": {}, 515 | "source": [ 516 | "### Folder to context" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "id": "a0452a21", 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "#| exports\n", 527 | "@delegates(globtastic)\n", 528 | 
"def folder2ctx(\n", 529 | " folder:Union[str,Path], # Folder name containing files to add to context\n", 530 | " prefix:bool=True, # Include Anthropic's suggested prose intro?\n", 531 | " **kwargs # Passed to `globtastic`\n", 532 | ")->str: # XML for Claude context\n", 533 | " fnames = globtastic(folder, **kwargs)\n", 534 | " return files2ctx(fnames, prefix=prefix)" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "id": "efd52392", 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "name": "stdout", 545 | "output_type": "stream", 546 | "text": [ 547 | "\n", 548 | "samples/sample_core.py\n", 549 | "\n", 550 | "import inspect\n", 551 | "empty = inspect.Parameter.empty\n", 552 | "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", 553 | "\n" 554 | ] 555 | } 556 | ], 557 | "source": [ 558 | "print(folder2ctx('samples', prefix=False, file_glob='*.py'))" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "id": "0cd4bbeb-b07f-447d-abe8-2b4190d4aa63", 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "#| exports\n", 569 | "#| hide\n", 570 | "@call_parse\n", 571 | "@delegates(folder2ctx)\n", 572 | "def folder2ctx_cli(\n", 573 | " folder:str, # Folder name containing files to add to context\n", 574 | " **kwargs # Passed to `folder2ctx`\n", 575 | ")->str: # XML for Claude context\n", 576 | " print(folder2ctx(folder, **kwargs))" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "id": "95bc490c-bf9d-4146-a729-97f7221559af", 582 | "metadata": {}, 583 | "source": [ 584 | ":::{.callout-tip}\n", 585 | "\n", 586 | "After you install `toolslm`, `folder2ctx` becomes available from the command line. 
You can see how to use it with the following command:\n", 587 | "\n", 588 | "```bash\n", 589 | "folder2ctx -h\n", 590 | "```\n", 591 | ":::" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "id": "94ec4289", 597 | "metadata": {}, 598 | "source": [ 599 | "## Export -" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "id": "1e9ee5c1", 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "#|hide\n", 610 | "#|eval: false\n", 611 | "from nbdev.doclinks import nbdev_export\n", 612 | "nbdev_export()" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "id": "5d06a6ce", 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [] 622 | } 623 | ], 624 | "metadata": { 625 | "kernelspec": { 626 | "display_name": "python3", 627 | "language": "python", 628 | "name": "python3" 629 | } 630 | }, 631 | "nbformat": 4, 632 | "nbformat_minor": 5 633 | } 634 | -------------------------------------------------------------------------------- /01_funccall.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "efe78920", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#|default_exp funccall" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "3d773712-12fe-440e-891f-36f59666dfde", 16 | "metadata": {}, 17 | "source": [ 18 | "# funccall source" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "e5ad6b86", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "#| exports\n", 29 | "import inspect\n", 30 | "from collections import abc\n", 31 | "from fastcore.utils import *\n", 32 | "from fastcore.docments import docments\n", 33 | "from typing import get_origin, get_args, Dict, List, Optional, Tuple, Union\n", 34 | "from types import UnionType" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | 
"execution_count": null, 40 | "id": "aec123ab", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "#|hide\n", 45 | "from fastcore.test import *" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "a9f43047", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "#| export\n", 56 | "empty = inspect.Parameter.empty" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "1a7cdbc6", 62 | "metadata": {}, 63 | "source": [ 64 | "## Function calling" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "7ec35c95", 70 | "metadata": {}, 71 | "source": [ 72 | "Many LLMs do function calling (aka tool use) by taking advantage of JSON schema.\n", 73 | "\n", 74 | "We'll use [docments](https://fastcore.fast.ai/docments.html) to make getting JSON schema from Python functions as ergonomic as possible. Each parameter (and the return value) should have a type, and a docments comment with the description of what it is. Here's an example:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "id": "4a017af1", 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "def silly_sum(\n", 85 | " a:int, # First thing to sum\n", 86 | " b:int=1, # Second thing to sum\n", 87 | " c:list[int]=None, # A pointless argument\n", 88 | ") -> int: # The sum of the inputs\n", 89 | " \"Adds a + b.\"\n", 90 | " return a + b" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "1a3ff443", 96 | "metadata": {}, 97 | "source": [ 98 | "This is what `docments` makes of that:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "b3f2ebcf", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/markdown": [ 110 | "```json\n", 111 | "{ 'a': { 'anno': ,\n", 112 | " 'default': ,\n", 113 | " 'docment': 'First thing to sum'},\n", 114 | " 'b': {'anno': , 'default': 1, 'docment': 'Second thing to sum'},\n", 115 | " 'c': {'anno': 
list[int], 'default': None, 'docment': 'A pointless argument'},\n", 116 | " 'return': { 'anno': ,\n", 117 | " 'default': ,\n", 118 | " 'docment': 'The sum of the inputs'}}\n", 119 | "```" 120 | ], 121 | "text/plain": [ 122 | "{'a': {'docment': 'First thing to sum',\n", 123 | " 'anno': int,\n", 124 | " 'default': inspect._empty},\n", 125 | " 'b': {'docment': 'Second thing to sum', 'anno': int, 'default': 1},\n", 126 | " 'c': {'docment': 'A pointless argument', 'anno': list[int], 'default': None},\n", 127 | " 'return': {'docment': 'The sum of the inputs',\n", 128 | " 'anno': int,\n", 129 | " 'default': inspect._empty}}" 130 | ] 131 | }, 132 | "execution_count": null, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "d = docments(silly_sum, full=True)\n", 139 | "d" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "id": "745e44ea", 145 | "metadata": {}, 146 | "source": [ 147 | "Note that this is an [AttrDict](https://fastcore.fast.ai/basics.html#attrdict) so we can treat it like an object, *or* a dict:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "id": "35cb279d", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "('First thing to sum', int)" 160 | ] 161 | }, 162 | "execution_count": null, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "d.a.docment, d['a']['anno']" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "id": "e7bf4025", 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "#| exports\n", 179 | "def _types(t:type)->tuple[str,Optional[str]]:\n", 180 | " \"Tuple of json schema type name and (if appropriate) array item name.\"\n", 181 | " if t is empty: raise TypeError('Missing type')\n", 182 | " tmap = {int:\"integer\", float:\"number\", str:\"string\", bool:\"boolean\", list:\"array\", 
dict:\"object\"}\n", 183 | " tmap.update({k.__name__: v for k, v in tmap.items()})\n", 184 | " if getattr(t, '__origin__', None) in (list,tuple):\n", 185 | " args = getattr(t, '__args__', None)\n", 186 | " item_type = \"object\" if not args else tmap.get(t.__args__[0].__name__, \"object\")\n", 187 | " return \"array\", item_type\n", 188 | " # if t is a string like 'int', directly use the string as the key\n", 189 | " elif isinstance(t, str): return tmap.get(t, \"object\"), None\n", 190 | " # if t is the type itself and a container\n", 191 | " elif get_origin(t): return tmap.get(get_origin(t).__name__, \"object\"), None\n", 192 | " # if t is the type itself like int, use the __name__ representation as the key\n", 193 | " else: return tmap.get(t.__name__, \"object\"), None" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "id": "edf73046", 199 | "metadata": {}, 200 | "source": [ 201 | "This internal function is needed to convert Python types into JSON schema types." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "id": "ecb7bc52", 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "(('array', 'integer'), ('integer', None), ('integer', None))" 214 | ] 215 | }, 216 | "execution_count": null, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "_types(list[int]), _types(int), _types('int')" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "38b4650a", 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "(('array', 'integer'), ('object', None), ('object', None), ('array', 'string'))" 235 | ] 236 | }, 237 | "execution_count": null, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "_types(List[int]), _types(Optional[str]), _types(str | None), _types(Tuple[str, int])" 244 | ] 245 | }, 246 | { 247 
| "cell_type": "markdown", 248 | "id": "f4d0ac1e", 249 | "metadata": {}, 250 | "source": [ 251 | "Note the current behavior:\n", 252 | "\n", 253 | "- ignores all but the first argument for tuples\n", 254 | "- union types map to object which is a stand-in for arbitrary types\n", 255 | "\n", 256 | "These and other approximations may require further refinement in the future." 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "id": "c0e3c940", 262 | "metadata": {}, 263 | "source": [ 264 | "Will also convert custom types to the `object` type." 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "9969fd00", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "(('array', 'object'), ('object', None))" 277 | ] 278 | }, 279 | "execution_count": null, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "class Custom: a: int\n", 286 | "_types(list[Custom]), _types(Custom)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "id": "4d5dc245", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "#| exports\n", 297 | "def _param(name, info):\n", 298 | " \"json schema parameter given `name` and `info` from docments full dict.\"\n", 299 | " paramt,itemt = _types(info.anno)\n", 300 | " pschema = dict(type=paramt, description=info.docment or \"\")\n", 301 | " if itemt: pschema[\"items\"] = {\"type\": itemt}\n", 302 | " if info.default is not empty: pschema[\"default\"] = info.default\n", 303 | " return pschema" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "id": "5337d6bd", 309 | "metadata": {}, 310 | "source": [ 311 | "This private function converts a key/value pair from the `docments` structure into the `dict` that will be needed for the schema." 
312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "id": "2450ace6", 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "a // {'docment': 'First thing to sum', 'anno': , 'default': }\n" 325 | ] 326 | }, 327 | { 328 | "data": { 329 | "text/plain": [ 330 | "{'type': 'integer', 'description': 'First thing to sum'}" 331 | ] 332 | }, 333 | "execution_count": null, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "n,o = first(d.items())\n", 340 | "print(n,'//', o)\n", 341 | "_param(n, o)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "id": "ba6bcac4", 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "#| export\n", 352 | "custom_types = {Path}\n", 353 | "\n", 354 | "def _handle_type(t, defs):\n", 355 | " \"Handle a single type, creating nested schemas if necessary\"\n", 356 | " if t is NoneType: return {'type': 'null'}\n", 357 | " if t in custom_types: return {'type':'string', 'format':t.__name__}\n", 358 | " if isinstance(t, type) and not issubclass(t, (int, float, str, bool)) or inspect.isfunction(t):\n", 359 | " defs[t.__name__] = _get_nested_schema(t)\n", 360 | " return {'$ref': f'#/$defs/{t.__name__}'}\n", 361 | " return {'type': _types(t)[0]}" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "id": "16dbf080", 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "data": { 372 | "text/plain": [ 373 | "({'type': 'integer'}, {'type': 'string', 'format': 'Path'})" 374 | ] 375 | }, 376 | "execution_count": null, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "_handle_type(int, None), _handle_type(Path, None)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "id": "7fd6cd29", 389 | "metadata": {}, 390 | "outputs": [], 
391 | "source": [ 392 | "#| export\n", 393 | "def _is_container(t):\n", 394 | " \"Check if type is a container (list, dict, tuple, set, Union)\"\n", 395 | " origin = get_origin(t)\n", 396 | " return origin in (list, dict, tuple, set, Union) if origin else False\n", 397 | "\n", 398 | "def _is_parameterized(t):\n", 399 | " \"Check if type has arguments (e.g. list[int] vs list, dict[str, int] vs dict)\"\n", 400 | " return _is_container(t) and (get_args(t) != ())" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "id": "783747af", 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "assert _is_parameterized(list[int]) == True\n", 411 | "assert _is_parameterized(int) == False\n", 412 | "assert _is_container(list[int]) == True\n", 413 | "assert _is_container(dict[str, int]) == True\n", 414 | "assert _is_container(int) == False" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "d42c88dd", 420 | "metadata": {}, 421 | "source": [ 422 | "For union and optional types, `Union` covers older `Union[str]` syntax while `UnionType` covers 3.10+ `str | None` syntax." 
423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "7815799b", 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "(str | None, types.UnionType, (str, NoneType))" 435 | ] 436 | }, 437 | "execution_count": null, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "def _example_new_unioin(opt_tup: str | None):\n", 444 | " pass\n", 445 | "\n", 446 | "d = docments(_example_new_unioin, full=True)\n", 447 | "anno1 = first(d.items())[1].anno\n", 448 | "(anno1, get_origin(anno1), get_args(anno1))" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "id": "d745c902", 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "data": { 459 | "text/plain": [ 460 | "(typing.Optional[str], typing.Union, (str, NoneType))" 461 | ] 462 | }, 463 | "execution_count": null, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "def _example_old_union(opt_tup: Union[str, type(None)] =None):\n", 470 | " pass\n", 471 | "\n", 472 | "d = docments(_example_old_union, full=True)\n", 473 | "anno2 = first(d.items())[1].anno\n", 474 | "(anno2, get_origin(anno2), get_args(anno2))" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "id": "3c5701c7", 480 | "metadata": {}, 481 | "source": [ 482 | "Support for both union types is part of the broader container handling:" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "id": "c1153f02", 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "#| export\n", 493 | "def _handle_container(origin, args, defs):\n", 494 | " \"Handle container types like dict, list, tuple, set, and Union\"\n", 495 | " if origin is Union or origin is UnionType:\n", 496 | " return {\"anyOf\": [_handle_type(arg, defs) for arg in args]}\n", 497 | " if origin is dict:\n", 498 | " value_type = 
args[1].__args__[0] if hasattr(args[1], '__args__') else args[1]\n", 499 | " return {\n", 500 | " 'type': 'object',\n", 501 | " 'additionalProperties': (\n", 502 | " {'type': 'array', 'items': _handle_type(value_type, defs)}\n", 503 | " if hasattr(args[1], '__origin__') else _handle_type(args[1], defs)\n", 504 | " )\n", 505 | " }\n", 506 | " elif origin in (list, tuple, set):\n", 507 | " schema = {'type': 'array', 'items': _handle_type(args[0], defs)}\n", 508 | " if origin is set:\n", 509 | " schema['uniqueItems'] = True\n", 510 | " return schema\n", 511 | " return None" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "id": "5ee1c529", 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "#| export\n", 522 | "def _process_property(name, obj, props, req, defs):\n", 523 | " \"Process a single property of the schema\"\n", 524 | " p = _param(name, obj)\n", 525 | " props[name] = p\n", 526 | " if obj.default is empty: req[name] = True\n", 527 | "\n", 528 | " if _is_container(obj.anno) and _is_parameterized(obj.anno):\n", 529 | " p.update(_handle_container(get_origin(obj.anno), get_args(obj.anno), defs)) \n", 530 | " else:\n", 531 | " # Non-container type or container without arguments\n", 532 | " p.update(_handle_type(obj.anno, defs))" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "id": "38b0f97e", 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "#| export\n", 543 | "def _get_nested_schema(obj):\n", 544 | " \"Generate nested JSON schema for a class or function\"\n", 545 | " d = docments(obj, full=True)\n", 546 | " props, req, defs = {}, {}, {}\n", 547 | "\n", 548 | " for n, o in d.items():\n", 549 | " if n != 'return' and n != 'self':\n", 550 | " _process_property(n, o, props, req, defs)\n", 551 | "\n", 552 | " schema = dict(type='object', properties=props, title=obj.__name__ if isinstance(obj, type) else None)\n", 553 | " if req: schema['required'] = 
list(req)\n", 554 | " if defs: schema['$defs'] = defs\n", 555 | " return schema" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "id": "1bb9df6c", 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "# Test primitive types\n", 566 | "defs = {}\n", 567 | "assert _handle_type(int, defs) == {'type': 'integer'}\n", 568 | "assert _handle_type(str, defs) == {'type': 'string'}\n", 569 | "assert _handle_type(bool, defs) == {'type': 'boolean'}\n", 570 | "assert _handle_type(float, defs) == {'type': 'number'}\n", 571 | "\n", 572 | "# Test custom class\n", 573 | "class TestClass:\n", 574 | " def __init__(self, x: int, y: int): store_attr()\n", 575 | "\n", 576 | "result = _handle_type(TestClass, defs)\n", 577 | "assert result == {'$ref': '#/$defs/TestClass'}\n", 578 | "assert 'TestClass' in defs\n", 579 | "assert defs['TestClass']['type'] == 'object'\n", 580 | "assert 'properties' in defs['TestClass']" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "id": "b1d09435", 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "# Test primitive types in containers\n", 591 | "assert _handle_container(list, (int,), defs) == {'type': 'array', 'items': {'type': 'integer'}}\n", 592 | "assert _handle_container(tuple, (str,), defs) == {'type': 'array', 'items': {'type': 'string'}}\n", 593 | "assert _handle_container(set, (str,), defs) == {'type': 'array', 'items': {'type': 'string'}, 'uniqueItems': True}\n", 594 | "assert _handle_container(dict, (str,bool), defs) == {'type': 'object', 'additionalProperties': {'type': 'boolean'}}\n", 595 | "\n", 596 | "result = _handle_container(list, (TestClass,), defs)\n", 597 | "assert result == {'type': 'array', 'items': {'$ref': '#/$defs/TestClass'}}\n", 598 | "assert 'TestClass' in defs\n", 599 | "\n", 600 | "# Test complex nested structure\n", 601 | "ComplexType = dict[str, list[TestClass]]\n", 602 | "result = _handle_container(dict, (str, 
list[TestClass]), defs)\n", 603 | "assert result == {\n", 604 | " 'type': 'object',\n", 605 | " 'additionalProperties': {\n", 606 | " 'type': 'array',\n", 607 | " 'items': {'$ref': '#/$defs/TestClass'}\n", 608 | " }\n", 609 | "}" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "id": "a5fd37d5", 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "# Test processing of a required integer property\n", 620 | "props, req = {}, {}\n", 621 | "class TestClass:\n", 622 | " \"Test class\"\n", 623 | " def __init__(\n", 624 | " self,\n", 625 | " x: int, # First thing\n", 626 | " y: list[float], # Second thing\n", 627 | " z: str = \"default\", # Third thing\n", 628 | " ): store_attr()\n", 629 | "\n", 630 | "d = docments(TestClass, full=True)\n", 631 | "_process_property('x', d.x, props, req, defs)\n", 632 | "assert 'x' in props\n", 633 | "assert props['x']['type'] == 'integer'\n", 634 | "assert 'x' in req\n", 635 | "\n", 636 | "# Test processing of a required list property\n", 637 | "_process_property('y', d.y, props, req, defs)\n", 638 | "assert 'y' in props\n", 639 | "assert props['y']['type'] == 'array'\n", 640 | "assert props['y']['items']['type'] == 'number'\n", 641 | "assert 'y' in req\n", 642 | "\n", 643 | "# Test processing of an optional string property with default\n", 644 | "_process_property('z', d.z, props, req, defs)\n", 645 | "assert 'z' in props\n", 646 | "assert props['z']['type'] == 'string'\n", 647 | "assert props['z']['default'] == \"default\"\n", 648 | "assert 'z' not in req" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "id": "23f54386", 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "#| exports\n", 659 | "def get_schema(f:Union[callable,dict], pname='input_schema')->dict:\n", 660 | " \"Generate JSON schema for a class, function, or method\"\n", 661 | " if isinstance(f, dict): return f\n", 662 | " schema = _get_nested_schema(f)\n", 663 | " 
desc = f.__doc__\n", 664 | " assert desc, \"Docstring missing!\"\n", 665 | " d = docments(f, full=True)\n", 666 | " ret = d.pop('return')\n", 667 | " if ret.anno is not empty: desc += f'\\n\\nReturns:\\n- type: {_types(ret.anno)[0]}'\n", 668 | " return {\"name\": f.__name__, \"description\": desc, pname: schema}" 669 | ] 670 | }, 671 | { 672 | "cell_type": "markdown", 673 | "id": "a59df671", 674 | "metadata": {}, 675 | "source": [ 676 | "Putting this all together, we can now test getting a schema from `silly_sum`. The tool use spec doesn't support return annotations directly, so we put that in the description instead." 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "id": "e7311af9", 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "name": "stdout", 687 | "output_type": "stream", 688 | "text": [ 689 | "Adds a + b.\n", 690 | "\n", 691 | "Returns:\n", 692 | "- type: integer\n" 693 | ] 694 | }, 695 | { 696 | "data": { 697 | "text/plain": [ 698 | "{'name': 'silly_sum',\n", 699 | " 'input_schema': {'type': 'object',\n", 700 | " 'properties': {'a': {'type': 'integer', 'description': 'First thing to sum'},\n", 701 | " 'b': {'type': 'integer',\n", 702 | " 'description': 'Second thing to sum',\n", 703 | " 'default': 1},\n", 704 | " 'c': {'type': 'array',\n", 705 | " 'description': 'A pointless argument',\n", 706 | " 'items': {'type': 'integer'},\n", 707 | " 'default': None}},\n", 708 | " 'title': None,\n", 709 | " 'required': ['a']}}" 710 | ] 711 | }, 712 | "execution_count": null, 713 | "metadata": {}, 714 | "output_type": "execute_result" 715 | } 716 | ], 717 | "source": [ 718 | "s = get_schema(silly_sum)\n", 719 | "desc = s.pop('description')\n", 720 | "print(desc)\n", 721 | "s" 722 | ] 723 | }, 724 | { 725 | "cell_type": "markdown", 726 | "id": "d478ba6b", 727 | "metadata": {}, 728 | "source": [ 729 | "This also works with string annotations, e.g:" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 
null, 735 | "id": "80203962", 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "data": { 740 | "text/plain": [ 741 | "{'name': 'silly_test',\n", 742 | " 'description': 'Mandatory docstring',\n", 743 | " 'input_schema': {'type': 'object',\n", 744 | " 'properties': {'a': {'type': 'integer', 'description': 'quoted type hint'}},\n", 745 | " 'title': None,\n", 746 | " 'required': ['a']}}" 747 | ] 748 | }, 749 | "execution_count": null, 750 | "metadata": {}, 751 | "output_type": "execute_result" 752 | } 753 | ], 754 | "source": [ 755 | "def silly_test(\n", 756 | " a: 'int', # quoted type hint\n", 757 | "):\n", 758 | " \"Mandatory docstring\"\n", 759 | " return a\n", 760 | "\n", 761 | "get_schema(silly_test)" 762 | ] 763 | }, 764 | { 765 | "cell_type": "markdown", 766 | "id": "e3f36f8a", 767 | "metadata": {}, 768 | "source": [ 769 | "This also works with instance methods:" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "id": "05d33447", 776 | "metadata": {}, 777 | "outputs": [ 778 | { 779 | "data": { 780 | "text/plain": [ 781 | "{'name': 'sums',\n", 782 | " 'description': 'Adds a + b.\\n\\nReturns:\\n- type: integer',\n", 783 | " 'input_schema': {'type': 'object',\n", 784 | " 'properties': {'a': {'type': 'integer', 'description': 'First thing to sum'},\n", 785 | " 'b': {'type': 'integer',\n", 786 | " 'description': 'Second thing to sum',\n", 787 | " 'default': 1}},\n", 788 | " 'title': None,\n", 789 | " 'required': ['a']}}" 790 | ] 791 | }, 792 | "execution_count": null, 793 | "metadata": {}, 794 | "output_type": "execute_result" 795 | } 796 | ], 797 | "source": [ 798 | "class Dummy:\n", 799 | " def sums(\n", 800 | " self,\n", 801 | " a:int, # First thing to sum\n", 802 | " b:int=1 # Second thing to sum\n", 803 | " ) -> int: # The sum of the inputs\n", 804 | " \"Adds a + b.\"\n", 805 | " print(f\"Finding the sum of {a} and {b}\")\n", 806 | " return a + b\n", 807 | "\n", 808 | "get_schema(Dummy.sums)" 809 | ] 810 | }, 811 | { 
812 | "cell_type": "markdown", 813 | "id": "ae3fdfa4", 814 | "metadata": {}, 815 | "source": [ 816 | "`get_schema` also handles more complicated structures such as nested classes. This is useful for things like structured outputs." 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": null, 822 | "id": "ce3be915", 823 | "metadata": {}, 824 | "outputs": [ 825 | { 826 | "data": { 827 | "text/plain": [ 828 | "{'name': 'Conversation',\n", 829 | " 'description': 'A conversation between two speakers',\n", 830 | " 'input_schema': {'type': 'object',\n", 831 | " 'properties': {'turns': {'type': 'array',\n", 832 | " 'description': 'Turns of the conversation',\n", 833 | " 'items': {'$ref': '#/$defs/Turn'}}},\n", 834 | " 'title': 'Conversation',\n", 835 | " 'required': ['turns'],\n", 836 | " '$defs': {'Turn': {'type': 'object',\n", 837 | " 'properties': {'speaker_a': {'type': 'string',\n", 838 | " 'description': \"First speaker's message\"},\n", 839 | " 'speaker_b': {'type': 'string',\n", 840 | " 'description': \"Second speaker's message\"}},\n", 841 | " 'title': 'Turn',\n", 842 | " 'required': ['speaker_a', 'speaker_b']}}}}" 843 | ] 844 | }, 845 | "execution_count": null, 846 | "metadata": {}, 847 | "output_type": "execute_result" 848 | } 849 | ], 850 | "source": [ 851 | "class Turn:\n", 852 | " \"Turn between two speakers\"\n", 853 | " def __init__(\n", 854 | " self,\n", 855 | " speaker_a:str, # First speaker's message\n", 856 | " speaker_b:str, # Second speaker's message\n", 857 | " ): store_attr()\n", 858 | "\n", 859 | "class Conversation:\n", 860 | " \"A conversation between two speakers\"\n", 861 | " def __init__(\n", 862 | " self,\n", 863 | " turns:list[Turn], # Turns of the conversation\n", 864 | " ): store_attr()\n", 865 | "\n", 866 | "get_schema(Conversation)" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": null, 872 | "id": "386e514d", 873 | "metadata": {}, 874 | "outputs": [ 875 | { 876 | "data": { 877 | 
"text/plain": [ 878 | "{'name': 'DictConversation',\n", 879 | " 'description': 'A conversation between two speakers',\n", 880 | " 'input_schema': {'type': 'object',\n", 881 | " 'properties': {'turns': {'type': 'object',\n", 882 | " 'description': 'dictionary of topics and the Turns of the conversation',\n", 883 | " 'additionalProperties': {'type': 'array',\n", 884 | " 'items': {'$ref': '#/$defs/Turn'}}}},\n", 885 | " 'title': 'DictConversation',\n", 886 | " 'required': ['turns'],\n", 887 | " '$defs': {'Turn': {'type': 'object',\n", 888 | " 'properties': {'speaker_a': {'type': 'string',\n", 889 | " 'description': \"First speaker's message\"},\n", 890 | " 'speaker_b': {'type': 'string',\n", 891 | " 'description': \"Second speaker's message\"}},\n", 892 | " 'title': 'Turn',\n", 893 | " 'required': ['speaker_a', 'speaker_b']}}}}" 894 | ] 895 | }, 896 | "execution_count": null, 897 | "metadata": {}, 898 | "output_type": "execute_result" 899 | } 900 | ], 901 | "source": [ 902 | "class DictConversation:\n", 903 | " \"A conversation between two speakers\"\n", 904 | " def __init__(\n", 905 | " self,\n", 906 | " turns:dict[str,list[Turn]], # dictionary of topics and the Turns of the conversation\n", 907 | " ): store_attr()\n", 908 | "\n", 909 | "get_schema(DictConversation)" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": null, 915 | "id": "2c08ac6b", 916 | "metadata": {}, 917 | "outputs": [ 918 | { 919 | "data": { 920 | "text/plain": [ 921 | "{'name': 'SetConversation',\n", 922 | " 'description': 'A conversation between two speakers',\n", 923 | " 'input_schema': {'type': 'object',\n", 924 | " 'properties': {'turns': {'type': 'array',\n", 925 | " 'description': 'the unique Turns of the conversation',\n", 926 | " 'items': {'$ref': '#/$defs/Turn'},\n", 927 | " 'uniqueItems': True}},\n", 928 | " 'title': 'SetConversation',\n", 929 | " 'required': ['turns'],\n", 930 | " '$defs': {'Turn': {'type': 'object',\n", 931 | " 'properties': {'speaker_a': 
{'type': 'string',\n", 932 | " 'description': \"First speaker's message\"},\n", 933 | " 'speaker_b': {'type': 'string',\n", 934 | " 'description': \"Second speaker's message\"}},\n", 935 | " 'title': 'Turn',\n", 936 | " 'required': ['speaker_a', 'speaker_b']}}}}" 937 | ] 938 | }, 939 | "execution_count": null, 940 | "metadata": {}, 941 | "output_type": "execute_result" 942 | } 943 | ], 944 | "source": [ 945 | "class SetConversation:\n", 946 | " \"A conversation between two speakers\"\n", 947 | " def __init__(\n", 948 | " self,\n", 949 | " turns:set[Turn], # the unique Turns of the conversation\n", 950 | " ): store_attr()\n", 951 | "\n", 952 | "get_schema(SetConversation)" 953 | ] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "execution_count": null, 958 | "id": "8cf3f35c", 959 | "metadata": {}, 960 | "outputs": [], 961 | "source": [ 962 | "#| exports\n", 963 | "def PathArg(\n", 964 | " path: str # A filesystem path\n", 965 | "): return Path(path)" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "id": "169212a6", 971 | "metadata": {}, 972 | "source": [ 973 | "Paths are a special case, since they only take `*args` and `**kwargs` as params, but normally we'd use them in a schema by just passing a str. So we create a custom param type for that." 
974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": null, 979 | "id": "e9135dfa", 980 | "metadata": {}, 981 | "outputs": [ 982 | { 983 | "data": { 984 | "text/plain": [ 985 | "{'name': 'path_test',\n", 986 | " 'description': 'Mandatory docstring',\n", 987 | " 'input_schema': {'type': 'object',\n", 988 | " 'properties': {'a': {'type': 'object',\n", 989 | " 'description': 'a type hint',\n", 990 | " '$ref': '#/$defs/PathArg'},\n", 991 | " 'b': {'type': 'object',\n", 992 | " 'description': 'b type hint',\n", 993 | " '$ref': '#/$defs/PathArg'}},\n", 994 | " 'title': None,\n", 995 | " 'required': ['a', 'b'],\n", 996 | " '$defs': {'PathArg': {'type': 'object',\n", 997 | " 'properties': {'path': {'type': 'string',\n", 998 | " 'description': 'A filesystem path'}},\n", 999 | " 'title': None,\n", 1000 | " 'required': ['path']}}}}" 1001 | ] 1002 | }, 1003 | "execution_count": null, 1004 | "metadata": {}, 1005 | "output_type": "execute_result" 1006 | } 1007 | ], 1008 | "source": [ 1009 | "def path_test(\n", 1010 | " a: PathArg, # a type hint\n", 1011 | " b: PathArg # b type hint\n", 1012 | "):\n", 1013 | " \"Mandatory docstring\"\n", 1014 | " return a/b\n", 1015 | "\n", 1016 | "get_schema(path_test)" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "id": "c6d1d0c8", 1022 | "metadata": {}, 1023 | "source": [ 1024 | "Alternatively, use `Path` as usual, and handle the `format` key in the json to use that as a callable:" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": null, 1030 | "id": "bdb69462", 1031 | "metadata": {}, 1032 | "outputs": [ 1033 | { 1034 | "data": { 1035 | "text/plain": [ 1036 | "{'name': 'path_test2',\n", 1037 | " 'description': 'Mandatory docstring',\n", 1038 | " 'input_schema': {'type': 'object',\n", 1039 | " 'properties': {'a': {'type': 'string',\n", 1040 | " 'description': 'a type hint',\n", 1041 | " 'format': 'Path'},\n", 1042 | " 'b': {'type': 'string', 'description': 'b type hint', 
'format': 'Path'}},\n", 1043 | " 'title': None,\n", 1044 | " 'required': ['a', 'b']}}" 1045 | ] 1046 | }, 1047 | "execution_count": null, 1048 | "metadata": {}, 1049 | "output_type": "execute_result" 1050 | } 1051 | ], 1052 | "source": [ 1053 | "def path_test2(\n", 1054 | " a: Path, # a type hint\n", 1055 | " b: Path # b type hint\n", 1056 | "):\n", 1057 | " \"Mandatory docstring\"\n", 1058 | " return a/b\n", 1059 | "\n", 1060 | "get_schema(path_test2)" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "markdown", 1065 | "id": "369320d4", 1066 | "metadata": {}, 1067 | "source": [ 1068 | "### Additional `get_schema()` Test Cases" 1069 | ] 1070 | }, 1071 | { 1072 | "cell_type": "markdown", 1073 | "id": "a8052380", 1074 | "metadata": {}, 1075 | "source": [ 1076 | "Union types are approximately mapped to JSON schema 'anyOf' with two or more value types." 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": null, 1082 | "id": "6fc1d6f9", 1083 | "metadata": {}, 1084 | "outputs": [ 1085 | { 1086 | "data": { 1087 | "text/plain": [ 1088 | "{'name': '_union_test',\n", 1089 | " 'description': 'Mandatory docstring',\n", 1090 | " 'input_schema': {'type': 'object',\n", 1091 | " 'properties': {'opt_tup': {'type': 'object',\n", 1092 | " 'description': '',\n", 1093 | " 'default': None,\n", 1094 | " 'anyOf': [{'type': 'array'}, {'type': 'string'}, {'type': 'integer'}]}},\n", 1095 | " 'title': None}}" 1096 | ] 1097 | }, 1098 | "execution_count": null, 1099 | "metadata": {}, 1100 | "output_type": "execute_result" 1101 | } 1102 | ], 1103 | "source": [ 1104 | "def _union_test(opt_tup: Union[Tuple[int, int], str, int]=None):\n", 1105 | " \"Mandatory docstring\"\n", 1106 | " return \"\"\n", 1107 | "get_schema(_union_test)" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "markdown", 1112 | "id": "7641aca8", 1113 | "metadata": {}, 1114 | "source": [ 1115 | "The new (Python 3.10+) union syntax can also be used, producing an equivalent schema." 
1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "execution_count": null, 1121 | "id": "a1a11b3b", 1122 | "metadata": {}, 1123 | "outputs": [ 1124 | { 1125 | "data": { 1126 | "text/plain": [ 1127 | "{'name': '_new_union_test',\n", 1128 | " 'description': 'Mandatory docstring',\n", 1129 | " 'input_schema': {'type': 'object',\n", 1130 | " 'properties': {'opt_tup': {'type': 'object',\n", 1131 | " 'description': '',\n", 1132 | " 'default': None,\n", 1133 | " 'anyOf': [{'type': 'array'}, {'type': 'string'}, {'type': 'integer'}]}},\n", 1134 | " 'title': None}}" 1135 | ] 1136 | }, 1137 | "execution_count": null, 1138 | "metadata": {}, 1139 | "output_type": "execute_result" 1140 | } 1141 | ], 1142 | "source": [ 1143 | "def _new_union_test(opt_tup: Tuple[int, int] | str | int =None):\n", 1144 | " \"Mandatory docstring\"\n", 1145 | " pass\n", 1146 | "get_schema(_new_union_test)" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "markdown", 1151 | "id": "8d24cc0a", 1152 | "metadata": {}, 1153 | "source": [ 1154 | "Optional is a special case of union types, limited to two types, one of which is None (mapped to null in JSON schema):" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": null, 1160 | "id": "ac8f3d19", 1161 | "metadata": {}, 1162 | "outputs": [ 1163 | { 1164 | "data": { 1165 | "text/plain": [ 1166 | "{'name': '_optional_test',\n", 1167 | " 'description': 'Mandatory docstring',\n", 1168 | " 'input_schema': {'type': 'object',\n", 1169 | " 'properties': {'opt_tup': {'type': 'object',\n", 1170 | " 'description': '',\n", 1171 | " 'default': None,\n", 1172 | " 'anyOf': [{'type': 'array'}, {'type': 'null'}]}},\n", 1173 | " 'title': None}}" 1174 | ] 1175 | }, 1176 | "execution_count": null, 1177 | "metadata": {}, 1178 | "output_type": "execute_result" 1179 | } 1180 | ], 1181 | "source": [ 1182 | "def _optional_test(opt_tup: Optional[Tuple[int, int]]=None):\n", 1183 | " \"Mandatory docstring\"\n", 1184 | " pass\n", 1185 | 
"get_schema(_optional_test)" 1186 | ] 1187 | }, 1188 | { 1189 | "cell_type": "markdown", 1190 | "id": "c969721b", 1191 | "metadata": {}, 1192 | "source": [ 1193 | "Containers can also be used, both in their parameterized form (`List[int]`) or as their unparameterized raw type (`List`). In the latter case, the item type is mapped to `object` in JSON schema." 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "execution_count": null, 1199 | "id": "b2959197", 1200 | "metadata": {}, 1201 | "outputs": [ 1202 | { 1203 | "data": { 1204 | "text/plain": [ 1205 | "{'name': '_list_test',\n", 1206 | " 'description': 'Mandatory docstring',\n", 1207 | " 'input_schema': {'type': 'object',\n", 1208 | " 'properties': {'l': {'type': 'array',\n", 1209 | " 'description': '',\n", 1210 | " 'items': {'type': 'integer'}}},\n", 1211 | " 'title': None,\n", 1212 | " 'required': ['l']}}" 1213 | ] 1214 | }, 1215 | "execution_count": null, 1216 | "metadata": {}, 1217 | "output_type": "execute_result" 1218 | } 1219 | ], 1220 | "source": [ 1221 | "def _list_test(l: List[int]):\n", 1222 | " \"Mandatory docstring\"\n", 1223 | " pass\n", 1224 | "get_schema(_list_test)" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": null, 1230 | "id": "c8fbfea7", 1231 | "metadata": {}, 1232 | "outputs": [ 1233 | { 1234 | "data": { 1235 | "text/plain": [ 1236 | "{'name': '_raw_list_test',\n", 1237 | " 'description': 'Mandatory docstring',\n", 1238 | " 'input_schema': {'type': 'object',\n", 1239 | " 'properties': {'l': {'type': 'array',\n", 1240 | " 'description': '',\n", 1241 | " 'items': {'type': 'object'}}},\n", 1242 | " 'title': None,\n", 1243 | " 'required': ['l']}}" 1244 | ] 1245 | }, 1246 | "execution_count": null, 1247 | "metadata": {}, 1248 | "output_type": "execute_result" 1249 | } 1250 | ], 1251 | "source": [ 1252 | "def _raw_list_test(l: List):\n", 1253 | " \"Mandatory docstring\"\n", 1254 | " pass\n", 1255 | "get_schema(_raw_list_test)" 1256 | ] 1257 | }, 1258 
| { 1259 | "cell_type": "markdown", 1260 | "id": "5704c197", 1261 | "metadata": {}, 1262 | "source": [ 1263 | "The same applies to dictionary, which can similarly be parameterized with key/value types or specified as a raw type." 1264 | ] 1265 | }, 1266 | { 1267 | "cell_type": "code", 1268 | "execution_count": null, 1269 | "id": "b2e8c567", 1270 | "metadata": {}, 1271 | "outputs": [ 1272 | { 1273 | "data": { 1274 | "text/plain": [ 1275 | "{'name': '_dict_test',\n", 1276 | " 'description': 'Mandatory docstring',\n", 1277 | " 'input_schema': {'type': 'object',\n", 1278 | " 'properties': {'d': {'type': 'object',\n", 1279 | " 'description': '',\n", 1280 | " 'additionalProperties': {'type': 'integer'}}},\n", 1281 | " 'title': None,\n", 1282 | " 'required': ['d']}}" 1283 | ] 1284 | }, 1285 | "execution_count": null, 1286 | "metadata": {}, 1287 | "output_type": "execute_result" 1288 | } 1289 | ], 1290 | "source": [ 1291 | "def _dict_test(d: Dict[str, int]):\n", 1292 | " \"Mandatory docstring\"\n", 1293 | " pass\n", 1294 | "get_schema(_dict_test)" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "id": "b3138ac4", 1301 | "metadata": {}, 1302 | "outputs": [ 1303 | { 1304 | "data": { 1305 | "text/plain": [ 1306 | "{'name': '_raw_dict_test',\n", 1307 | " 'description': 'Mandatory docstring',\n", 1308 | " 'input_schema': {'type': 'object',\n", 1309 | " 'properties': {'d': {'type': 'object', 'description': ''}},\n", 1310 | " 'title': None,\n", 1311 | " 'required': ['d']}}" 1312 | ] 1313 | }, 1314 | "execution_count": null, 1315 | "metadata": {}, 1316 | "output_type": "execute_result" 1317 | } 1318 | ], 1319 | "source": [ 1320 | "def _raw_dict_test(d: Dict):\n", 1321 | " \"Mandatory docstring\"\n", 1322 | "get_schema(_raw_dict_test)" 1323 | ] 1324 | }, 1325 | { 1326 | "cell_type": "markdown", 1327 | "id": "9529d39a", 1328 | "metadata": {}, 1329 | "source": [ 1330 | "### Python tool" 1331 | ] 1332 | }, 1333 | { 1334 | "cell_type": 
"markdown", 1335 | "id": "7a69cad9", 1336 | "metadata": {}, 1337 | "source": [ 1338 | "In language model clients it's often useful to have a 'code interpreter' -- this is something that runs code, and generally outputs the result of the last expression (i.e like IPython or Jupyter). \n", 1339 | "\n", 1340 | "In this section we'll create the `python` function, which executes a string as Python code, with an optional timeout. If the last line is an expression, we'll return that -- just like in IPython or Jupyter, but without needing them installed." 1341 | ] 1342 | }, 1343 | { 1344 | "cell_type": "code", 1345 | "execution_count": null, 1346 | "id": "873000d7", 1347 | "metadata": {}, 1348 | "outputs": [], 1349 | "source": [ 1350 | "#| exports\n", 1351 | "import ast, time, signal, traceback\n", 1352 | "from fastcore.utils import *" 1353 | ] 1354 | }, 1355 | { 1356 | "cell_type": "code", 1357 | "execution_count": null, 1358 | "id": "4703296a", 1359 | "metadata": {}, 1360 | "outputs": [], 1361 | "source": [ 1362 | "#| exports\n", 1363 | "def _copy_loc(new, orig):\n", 1364 | " \"Copy location information from original node to new node and all children.\"\n", 1365 | " new = ast.copy_location(new, orig)\n", 1366 | " for field, o in ast.iter_fields(new):\n", 1367 | " if isinstance(o, ast.AST): setattr(new, field, _copy_loc(o, orig))\n", 1368 | " elif isinstance(o, list): setattr(new, field, [_copy_loc(value, orig) for value in o])\n", 1369 | " return new" 1370 | ] 1371 | }, 1372 | { 1373 | "cell_type": "markdown", 1374 | "id": "6c0d4922", 1375 | "metadata": {}, 1376 | "source": [ 1377 | "This is an internal function that's needed for `_run` to ensure that location information is available in the abstract syntax tree (AST), since otherwise python complains." 
1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": null, 1383 | "id": "1574585f", 1384 | "metadata": {}, 1385 | "outputs": [], 1386 | "source": [ 1387 | "#| exports\n", 1388 | "def _run(code:str, glb:dict=None, loc:dict=None):\n", 1389 | " \"Run `code`, returning final expression (similar to IPython)\"\n", 1390 | " tree = ast.parse(code)\n", 1391 | " last_node = tree.body[-1] if tree.body else None\n", 1392 | " \n", 1393 | " # If the last node is an expression, modify the AST to capture the result\n", 1394 | " if isinstance(last_node, ast.Expr):\n", 1395 | " tgt = [ast.Name(id='_result', ctx=ast.Store())]\n", 1396 | " assign_node = ast.Assign(targets=tgt, value=last_node.value)\n", 1397 | " tree.body[-1] = _copy_loc(assign_node, last_node)\n", 1398 | "\n", 1399 | " compiled_code = compile(tree, filename='', mode='exec')\n", 1400 | " glb = glb or {}\n", 1401 | " stdout_buffer = io.StringIO()\n", 1402 | " saved_stdout = sys.stdout\n", 1403 | " sys.stdout = stdout_buffer\n", 1404 | " try: exec(compiled_code, glb, loc)\n", 1405 | " finally: sys.stdout = saved_stdout\n", 1406 | " _result = glb.get('_result', None)\n", 1407 | " if _result is not None: return _result\n", 1408 | " return stdout_buffer.getvalue().strip()" 1409 | ] 1410 | }, 1411 | { 1412 | "cell_type": "markdown", 1413 | "id": "92ca7f47", 1414 | "metadata": {}, 1415 | "source": [ 1416 | "This is the internal function used to actually run the code -- we pull off the last AST to see if it's an expression (i.e something that returns a value), and if so, we store it to a special `_result` variable so we can return it." 
1417 | ] 1418 | }, 1419 | { 1420 | "cell_type": "code", 1421 | "execution_count": null, 1422 | "id": "15b72cb2", 1423 | "metadata": {}, 1424 | "outputs": [ 1425 | { 1426 | "data": { 1427 | "text/plain": [ 1428 | "479001600" 1429 | ] 1430 | }, 1431 | "execution_count": null, 1432 | "metadata": {}, 1433 | "output_type": "execute_result" 1434 | } 1435 | ], 1436 | "source": [ 1437 | "_run('import math;math.factorial(12)')" 1438 | ] 1439 | }, 1440 | { 1441 | "cell_type": "code", 1442 | "execution_count": null, 1443 | "id": "632a7ac1", 1444 | "metadata": {}, 1445 | "outputs": [ 1446 | { 1447 | "data": { 1448 | "text/plain": [ 1449 | "'2'" 1450 | ] 1451 | }, 1452 | "execution_count": null, 1453 | "metadata": {}, 1454 | "output_type": "execute_result" 1455 | } 1456 | ], 1457 | "source": [ 1458 | "_run('print(1+1)')" 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "markdown", 1463 | "id": "34f2e5c2", 1464 | "metadata": {}, 1465 | "source": [ 1466 | "We now have the machinery needed to create our `python` function." 
1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": null, 1472 | "id": "81857615", 1473 | "metadata": {}, 1474 | "outputs": [], 1475 | "source": [ 1476 | "#| exports\n", 1477 | "def python(code:str, # Code to execute\n", 1478 | " glb:Optional[dict]=None, # Globals namespace\n", 1479 | " loc:Optional[dict]=None, # Locals namespace\n", 1480 | " timeout:int=3600 # Maximum run time in seconds before a `TimeoutError` is raised\n", 1481 | " ): # Result of last node, if it's an expression, or `None` otherwise\n", 1482 | " \"\"\"Executes python `code` with `timeout` and returning final expression (similar to IPython).\n", 1483 | " Raised exceptions are returned as a string, with a stack trace.\"\"\"\n", 1484 | " def handler(*args): raise TimeoutError()\n", 1485 | " if glb is None: glb = inspect.currentframe().f_back.f_globals\n", 1486 | " if loc is None: loc=glb\n", 1487 | " signal.signal(signal.SIGALRM, handler)\n", 1488 | " signal.alarm(timeout)\n", 1489 | " try: return _run(code, glb, loc)\n", 1490 | " except Exception as e: return traceback.format_exc()\n", 1491 | " finally: signal.alarm(0)" 1492 | ] 1493 | }, 1494 | { 1495 | "cell_type": "markdown", 1496 | "id": "b6b9324f", 1497 | "metadata": {}, 1498 | "source": [ 1499 | "There's no builtin security here -- you should generally use this in a sandbox, or alternatively prompt before running code. It can handle multiline function definitions, and pretty much any other normal Python syntax." 
1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "code", 1504 | "execution_count": null, 1505 | "id": "69d74f4d", 1506 | "metadata": {}, 1507 | "outputs": [ 1508 | { 1509 | "data": { 1510 | "text/plain": [ 1511 | "120" 1512 | ] 1513 | }, 1514 | "execution_count": null, 1515 | "metadata": {}, 1516 | "output_type": "execute_result" 1517 | } 1518 | ], 1519 | "source": [ 1520 | "python(\"\"\"def factorial(n):\n", 1521 | " if n == 0 or n == 1: return 1\n", 1522 | " else: return n * factorial(n-1)\n", 1523 | "factorial(5)\"\"\")" 1524 | ] 1525 | }, 1526 | { 1527 | "cell_type": "markdown", 1528 | "id": "6c629442", 1529 | "metadata": {}, 1530 | "source": [ 1531 | "If the code takes longer than `timeout` then it returns an error string." 1532 | ] 1533 | }, 1534 | { 1535 | "cell_type": "code", 1536 | "execution_count": null, 1537 | "id": "fcb472b3", 1538 | "metadata": {}, 1539 | "outputs": [ 1540 | { 1541 | "name": "stdout", 1542 | "output_type": "stream", 1543 | "text": [ 1544 | "Traceback (most recent call last):\n", 1545 | " File \"/var/folders/5c/jls7k26j1tq6l03cl_kvpnwh0000gn/T/ipykernel_97636/2963369439.py\", line 14, in python\n", 1546 | " try: return _run(code, glb, loc)\n", 1547 | " ^^^^^^^^^^^^^^^^^^^^\n", 1548 | " File \"/var/folders/5c/jls7k26j1tq6l03cl_kvpnwh0000gn/T/ipykernel_97636/1858893181.py\", line 18, in _run\n", 1549 | " try: exec(compiled_code, glb, loc)\n", 1550 | " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", 1551 | " File \"\", line 1, in \n", 1552 | " File \"/var/folders/5c/jls7k26j1tq6l03cl_kvpnwh0000gn/T/ipykernel_97636/2963369439.py\", line 9, in handler\n", 1553 | " def handler(*args): raise TimeoutError()\n", 1554 | " ^^^^^^^^^^^^^^^^^^^^\n", 1555 | "TimeoutError\n", 1556 | "\n" 1557 | ] 1558 | } 1559 | ], 1560 | "source": [ 1561 | "print(python('import time; time.sleep(10)', timeout=1))" 1562 | ] 1563 | }, 1564 | { 1565 | "cell_type": "markdown", 1566 | "id": "d45684c1", 1567 | "metadata": {}, 1568 | "source": [ 1569 | "By default the caller's global 
namespace is used." 1570 | ] 1571 | }, 1572 | { 1573 | "cell_type": "code", 1574 | "execution_count": null, 1575 | "id": "72dfe290", 1576 | "metadata": {}, 1577 | "outputs": [ 1578 | { 1579 | "data": { 1580 | "text/plain": [ 1581 | "1" 1582 | ] 1583 | }, 1584 | "execution_count": null, 1585 | "metadata": {}, 1586 | "output_type": "execute_result" 1587 | } 1588 | ], 1589 | "source": [ 1590 | "python(\"a=1\")\n", 1591 | "a" 1592 | ] 1593 | }, 1594 | { 1595 | "cell_type": "markdown", 1596 | "id": "bf48557c", 1597 | "metadata": {}, 1598 | "source": [ 1599 | "Pass a different `glb` if needed." 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "code", 1604 | "execution_count": null, 1605 | "id": "55fb5613", 1606 | "metadata": {}, 1607 | "outputs": [ 1608 | { 1609 | "data": { 1610 | "text/plain": [ 1611 | "(1, 3)" 1612 | ] 1613 | }, 1614 | "execution_count": null, 1615 | "metadata": {}, 1616 | "output_type": "execute_result" 1617 | } 1618 | ], 1619 | "source": [ 1620 | "glb = {}\n", 1621 | "python(\"a=3\", glb)\n", 1622 | "a, glb['a']" 1623 | ] 1624 | }, 1625 | { 1626 | "cell_type": "markdown", 1627 | "id": "244c502e", 1628 | "metadata": {}, 1629 | "source": [ 1630 | "### Tool Calling" 1631 | ] 1632 | }, 1633 | { 1634 | "cell_type": "markdown", 1635 | "id": "186408f8", 1636 | "metadata": {}, 1637 | "source": [ 1638 | "Many LLM API providers offer tool calling where an LLM can choose to call a given tool. This is also helpful for structured outputs since the response from the LLM is contrained to the required arguments of the tool.\n", 1639 | "\n", 1640 | "This section will be dedicated to helper functions for calling tools. We don't want to allow LLMs to call just any possible function (that would be a security disaster!) so we create a namespace -- that is, a dictionary of allowable function names to call." 
#| export
def mk_ns(*funcs_or_objs):
    "Build a namespace dict of allowable callables from functions, classes, and instances."
    ns = {}
    for item in funcs_or_objs:
        if isinstance(item, type):
            # Classes contribute their static and class methods (resolved via getattr)
            for name, member in item.__dict__.items():
                if isinstance(member, (staticmethod, classmethod)): ns[name] = getattr(item, name)
        if isinstance(item, object):
            # Instances contribute bound methods, then raw staticmethod objects from their class
            for name, _ in inspect.getmembers(item, inspect.ismethod): ns[name] = getattr(item, name)
            for name, member in item.__class__.__dict__.items():
                if isinstance(member, staticmethod): ns[name] = member
        if callable(item) and hasattr(item, '__name__'): ns[item.__name__] = item
    return ns
| " @classmethod\n", 1717 | " def mults(cls, a, b): return a * b" 1718 | ] 1719 | }, 1720 | { 1721 | "cell_type": "code", 1722 | "execution_count": null, 1723 | "id": "ca50b957", 1724 | "metadata": {}, 1725 | "outputs": [ 1726 | { 1727 | "data": { 1728 | "text/plain": [ 1729 | "{'subs': ,\n", 1730 | " 'mults': >,\n", 1731 | " 'Dummy': __main__.Dummy}" 1732 | ] 1733 | }, 1734 | "execution_count": null, 1735 | "metadata": {}, 1736 | "output_type": "execute_result" 1737 | } 1738 | ], 1739 | "source": [ 1740 | "ns = mk_ns(Dummy); ns" 1741 | ] 1742 | }, 1743 | { 1744 | "cell_type": "code", 1745 | "execution_count": null, 1746 | "id": "59ef734f", 1747 | "metadata": {}, 1748 | "outputs": [ 1749 | { 1750 | "data": { 1751 | "text/plain": [ 1752 | "(-1, 6)" 1753 | ] 1754 | }, 1755 | "execution_count": null, 1756 | "metadata": {}, 1757 | "output_type": "execute_result" 1758 | } 1759 | ], 1760 | "source": [ 1761 | "ns['subs'](1, 2), ns['mults'](3, 2)" 1762 | ] 1763 | }, 1764 | { 1765 | "cell_type": "code", 1766 | "execution_count": null, 1767 | "id": "15871e6d", 1768 | "metadata": {}, 1769 | "outputs": [ 1770 | { 1771 | "data": { 1772 | "text/plain": [ 1773 | "{'__call__': >,\n", 1774 | " '__init__': >,\n", 1775 | " 'mults': >,\n", 1776 | " 'sums': >,\n", 1777 | " 'subs': )>}" 1778 | ] 1779 | }, 1780 | "execution_count": null, 1781 | "metadata": {}, 1782 | "output_type": "execute_result" 1783 | } 1784 | ], 1785 | "source": [ 1786 | "d = Dummy(10)\n", 1787 | "ns = mk_ns(d); ns" 1788 | ] 1789 | }, 1790 | { 1791 | "cell_type": "code", 1792 | "execution_count": null, 1793 | "id": "13cb7685", 1794 | "metadata": {}, 1795 | "outputs": [ 1796 | { 1797 | "data": { 1798 | "text/plain": [ 1799 | "(-1, 6, 5, 10)" 1800 | ] 1801 | }, 1802 | "execution_count": null, 1803 | "metadata": {}, 1804 | "output_type": "execute_result" 1805 | } 1806 | ], 1807 | "source": [ 1808 | "ns['subs'](1, 2), ns['mults'](3, 2), ns['sums'](3, 2), ns['__call__']()" 1809 | ] 1810 | }, 1811 | { 1812 | "cell_type": 
"code", 1813 | "execution_count": null, 1814 | "id": "2dfe13ae", 1815 | "metadata": {}, 1816 | "outputs": [ 1817 | { 1818 | "data": { 1819 | "text/plain": [ 1820 | "(None, -99)" 1821 | ] 1822 | }, 1823 | "execution_count": null, 1824 | "metadata": {}, 1825 | "output_type": "execute_result" 1826 | } 1827 | ], 1828 | "source": [ 1829 | "ns['__init__'](-99), ns['__call__']()" 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "code", 1834 | "execution_count": null, 1835 | "id": "85b4734f", 1836 | "metadata": {}, 1837 | "outputs": [], 1838 | "source": [ 1839 | "#| exports\n", 1840 | "def call_func(fc_name, fc_inputs, ns, raise_on_err=True):\n", 1841 | " \"Call the function `fc_name` with the given `fc_inputs` using namespace `ns`.\"\n", 1842 | " if not isinstance(ns, abc.Mapping): ns = mk_ns(*ns)\n", 1843 | " func = ns[fc_name]\n", 1844 | " try: return func(**fc_inputs)\n", 1845 | " except Exception as e:\n", 1846 | " if raise_on_err: raise e\n", 1847 | " else: return traceback.format_exc()" 1848 | ] 1849 | }, 1850 | { 1851 | "cell_type": "markdown", 1852 | "id": "ce9cce60", 1853 | "metadata": {}, 1854 | "source": [ 1855 | "Now when we an LLM responses with the tool to use and its inputs, we can simply use the same namespace it was given to look up the tool and call it." 
1856 | ] 1857 | }, 1858 | { 1859 | "cell_type": "code", 1860 | "execution_count": null, 1861 | "id": "f2ade8a8", 1862 | "metadata": {}, 1863 | "outputs": [ 1864 | { 1865 | "data": { 1866 | "text/plain": [ 1867 | "3" 1868 | ] 1869 | }, 1870 | "execution_count": null, 1871 | "metadata": {}, 1872 | "output_type": "execute_result" 1873 | } 1874 | ], 1875 | "source": [ 1876 | "call_func('sums', {'a': 1, 'b': 2}, ns=[sums])" 1877 | ] 1878 | }, 1879 | { 1880 | "cell_type": "code", 1881 | "execution_count": null, 1882 | "id": "9aace64a", 1883 | "metadata": {}, 1884 | "outputs": [ 1885 | { 1886 | "data": { 1887 | "text/plain": [ 1888 | "-1" 1889 | ] 1890 | }, 1891 | "execution_count": null, 1892 | "metadata": {}, 1893 | "output_type": "execute_result" 1894 | } 1895 | ], 1896 | "source": [ 1897 | "call_func('subs', {'a': 1, 'b': 2}, ns=mk_ns(d))" 1898 | ] 1899 | }, 1900 | { 1901 | "cell_type": "code", 1902 | "execution_count": null, 1903 | "id": "6c93c0ef", 1904 | "metadata": {}, 1905 | "outputs": [], 1906 | "source": [ 1907 | "assert \"unsupported operand type(s) for -: 'int' and 'str'\" in call_func('subs', {'a': 1, 'b': '3'}, ns=mk_ns(d), raise_on_err=False)" 1908 | ] 1909 | }, 1910 | { 1911 | "cell_type": "code", 1912 | "execution_count": null, 1913 | "id": "85489c3d", 1914 | "metadata": {}, 1915 | "outputs": [], 1916 | "source": [ 1917 | "test_fail(call_func, args=['subs', {'a': 1, 'b': '3'}], kwargs={'ns': mk_ns(d)})" 1918 | ] 1919 | }, 1920 | { 1921 | "cell_type": "code", 1922 | "execution_count": null, 1923 | "id": "b19298ac", 1924 | "metadata": {}, 1925 | "outputs": [], 1926 | "source": [ 1927 | "%%ai\n", 1928 | "How do I get the whole traceback of an error instead of just str(e) like above?" 
1929 | ] 1930 | }, 1931 | { 1932 | "cell_type": "markdown", 1933 | "id": "6ec89b42", 1934 | "metadata": {}, 1935 | "source": [ 1936 | "To get the whole traceback of an error instead of just `str(e)`, you can use the `traceback` module, which you've already imported in your code. Modify the `call_func` function to capture and return the full traceback when an error occurs:\n", 1937 | "\n", 1938 | "```python\n", 1939 | "#| exports\n", 1940 | "def call_func(fc_name, fc_inputs, ns, raise_on_err=True):\n", 1941 | " \"Call the function `fc_name` with the given `fc_inputs` using namespace `ns`.\"\n", 1942 | " if not isinstance(ns, abc.Mapping): ns = mk_ns(*ns)\n", 1943 | " func = ns[fc_name]\n", 1944 | " try: return func(**fc_inputs)\n", 1945 | " except Exception as e:\n", 1946 | " if raise_on_err: raise e\n", 1947 | " else: return traceback.format_exc()\n", 1948 | "```\n", 1949 | "\n", 1950 | "This replaces `str(e)` with `traceback.format_exc()`, which returns the full traceback as a string, including the error type, message, and the call stack that led to the error. This gives you much more context about where and why the error occurred." 1951 | ] 1952 | }, 1953 | { 1954 | "cell_type": "markdown", 1955 | "id": "591574b8-6b53-4908-8159-b87be42133f7", 1956 | "metadata": {}, 1957 | "source": [ 1958 | "### Async function calling" 1959 | ] 1960 | }, 1961 | { 1962 | "cell_type": "markdown", 1963 | "id": "96a3a7d3-31ef-4cc6-b47c-35eaa8bbff8b", 1964 | "metadata": {}, 1965 | "source": [ 1966 | "Since tools defined by MCP servers are async function, it is probably a good idea to have an async version of `call_func`." 
#| exports
async def call_func_async(fc_name, fc_inputs, ns, raise_on_err=True):
    """Awaits the function `fc_name` with the given `fc_inputs` using namespace `ns`.

    Delegates to `call_func`; if the result is a coroutine (async tool), awaits it.
    On error, re-raises if `raise_on_err`, else returns the formatted traceback string."""
    res = call_func(fc_name, fc_inputs, ns, raise_on_err=raise_on_err)
    if inspect.iscoroutine(res):
        try: res = await res
        except Exception:
            # Bare `raise` preserves the original traceback (unlike `raise e`)
            if raise_on_err: raise
            return traceback.format_exc()
    return res
2040 | }, 2041 | { 2042 | "cell_type": "code", 2043 | "execution_count": null, 2044 | "id": "a06776cf", 2045 | "metadata": {}, 2046 | "outputs": [], 2047 | "source": [ 2048 | "ex = False\n", 2049 | "try: await call_func_async('asums', {'a': 1, 'b': '2'}, ns=[asums], raise_on_err=True)\n", 2050 | "except: ex = True\n", 2051 | "assert ex" 2052 | ] 2053 | }, 2054 | { 2055 | "cell_type": "markdown", 2056 | "id": "94ec4289", 2057 | "metadata": {}, 2058 | "source": [ 2059 | "## Export -" 2060 | ] 2061 | }, 2062 | { 2063 | "cell_type": "code", 2064 | "execution_count": null, 2065 | "id": "1e9ee5c1", 2066 | "metadata": {}, 2067 | "outputs": [], 2068 | "source": [ 2069 | "#|hide\n", 2070 | "#|eval: false\n", 2071 | "from nbdev.doclinks import nbdev_export\n", 2072 | "nbdev_export()" 2073 | ] 2074 | }, 2075 | { 2076 | "cell_type": "code", 2077 | "execution_count": null, 2078 | "id": "9cf037e0", 2079 | "metadata": {}, 2080 | "outputs": [], 2081 | "source": [] 2082 | } 2083 | ], 2084 | "metadata": { 2085 | "kernelspec": { 2086 | "display_name": "python3", 2087 | "language": "python", 2088 | "name": "python3" 2089 | } 2090 | }, 2091 | "nbformat": 4, 2092 | "nbformat_minor": 5 2093 | } 2094 | -------------------------------------------------------------------------------- /02_shell.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "efe78920", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#|default_exp shell" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "3d773712-12fe-440e-891f-36f59666dfde", 16 | "metadata": {}, 17 | "source": [ 18 | "# shell source" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "1328ef69", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "#| exports\n", 29 | "import ast, time, signal, traceback\n", 30 | "from fastcore.utils import *" 31 | ] 32 | 
def exception2str(ex: Exception) -> str:
    "Render exception `ex`, including its traceback, as a single string."
    parts = traceback.format_exception(type(ex), ex, ex.__traceback__)
    return ''.join(parts)
"@patch\n", 109 | "def run_cell(self:TerminalInteractiveShell, cell, timeout=None):\n", 110 | " \"Wrapper for original `run_cell` which adds timeout and output capture\"\n", 111 | " if timeout:\n", 112 | " def handler(*args): raise TimeoutError()\n", 113 | " signal.signal(signal.SIGALRM, handler)\n", 114 | " signal.alarm(timeout)\n", 115 | " try:\n", 116 | " with capture_output() as io: result = self.orig_run(cell)\n", 117 | " result.stdout = io.stdout\n", 118 | " return result\n", 119 | " except TimeoutException as e:\n", 120 | " result = self.ExecutionResult(error_before_exec=None, error_in_exec=e)\n", 121 | " finally:\n", 122 | " if timeout: signal.alarm(0)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "id": "cdadbb12", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "#| exports\n", 133 | "def get_shell()->TerminalInteractiveShell:\n", 134 | " \"Get a `TerminalInteractiveShell` with minimal functionality\"\n", 135 | " sh = TerminalInteractiveShell()\n", 136 | " sh.logger.log_output = sh.history_manager.enabled = False\n", 137 | " dh = sh.displayhook\n", 138 | " dh.finish_displayhook = dh.write_output_prompt = dh.start_displayhook = lambda: None\n", 139 | " dh.write_format_data = lambda format_dict, md_dict=None: None\n", 140 | " sh.logstart = sh.automagic = sh.autoindent = False\n", 141 | " sh.autocall = 0\n", 142 | " sh.system = lambda cmd: None\n", 143 | " return sh" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "5ffbe57e", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "shell = get_shell()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "b03b78b3", 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "(2, '3\\n')" 166 | ] 167 | }, 168 | "execution_count": null, 169 | "metadata": {}, 170 | "output_type": "execute_result" 171 | } 172 | ], 173 | 
"source": [ 174 | "r = shell.run_cell('print(3); 1+1')\n", 175 | "r.result,r.stdout" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "id": "48849fc3", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "Traceback (most recent call last):\n", 189 | " File \"/Users/jhoward/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3577, in run_code\n", 190 | " exec(code_obj, self.user_global_ns, self.user_ns)\n", 191 | " File \"\", line 1, in \n", 192 | " raise Exception(\"blah\")\n", 193 | "Exception: blah\n", 194 | "\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "r = shell.run_cell('raise Exception(\"blah\")')\n", 200 | "print(exception2str(r.error_in_exec))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "ddabea6d", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "TimeoutError()" 213 | ] 214 | }, 215 | "execution_count": null, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "r = shell.run_cell('import time; time.sleep(10)', timeout=1)\n", 222 | "r.error_in_exec" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "id": "94ec4289", 228 | "metadata": {}, 229 | "source": [ 230 | "## Export -" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "1e9ee5c1", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "#|hide\n", 241 | "#|eval: false\n", 242 | "from nbdev.doclinks import nbdev_export\n", 243 | "nbdev_export()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "id": "207f9715", 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "python3", 258 | "language": "python", 259 | "name": 
"python3" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 5 264 | } 265 | -------------------------------------------------------------------------------- /03_download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "92c3dff2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#| default_exp download" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "1d533800", 16 | "metadata": {}, 17 | "source": [ 18 | "# Download helpers\n", 19 | "\n", 20 | "- Download and process LLM-ready documents" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "e58d8c43", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#| export\n", 31 | "from fastcore.utils import *\n", 32 | "from httpx import get\n", 33 | "from fastcore.meta import delegates\n", 34 | "from urllib.parse import urlparse, urljoin" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "30199708", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from IPython.display import Markdown,HTML\n", 45 | "from fastcore.test import *" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "95c4cab1", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "#| export\n", 56 | "def clean_md(text, rm_comments=True, rm_details=True):\n", 57 | " \"Remove comments and `
` sections from `text`\"\n", 58 | " if rm_comments: text = re.sub(r'\\n?\\n?', '', text, flags=re.DOTALL)\n", 59 | " if rm_details: text = re.sub(r'\\n?
.*?
#| export
def html2md(s:str, ignore_links=True):
    "Convert HTML string `s` to markdown via `html2text`."
    import html2text
    conv = html2text.HTML2Text(bodywidth=5000)
    conv.mark_code = True
    conv.ignore_images = True
    conv.ignore_links = ignore_links
    return conv.handle(s)
` tags\n", 119 | " multi=False, # Get all matches to `sel` or first one \n", 120 | " wrap_tag=None, #If multi, each selection wrapped with content\n", 121 | " ignore_links=True,\n", 122 | " ): # Cleaned markdown\n", 123 | " \"Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown\"\n", 124 | " page = get(url).text\n", 125 | " if sel:\n", 126 | " from bs4 import BeautifulSoup\n", 127 | " soup = BeautifulSoup(page, 'html.parser')\n", 128 | " if multi:\n", 129 | " page = [str(el) for el in soup.select(sel)]\n", 130 | " if not wrap_tag: page = \"\\n\".join(page)\n", 131 | " else: page = str(soup.select_one(sel))\n", 132 | " mds = map(lambda x: clean_md(html2md(x, ignore_links=ignore_links), rm_comments, rm_details=rm_details), tuplify(page))\n", 133 | " if wrap_tag: return '\\n'.join([f\"\\n<{wrap_tag}>\\n{o}\\n\" for o in mds])\n", 134 | " else: return'\\n'.join(mds)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "1d07c687", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# test single class selector\n", 145 | "listings = read_html('https://www.answer.ai/', sel='.listing-description')\n", 146 | "assert len(listings) < 500\n", 147 | "\n", 148 | "# Test multi class selector\n", 149 | "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True)\n", 150 | "assert len(listings) > 1000 # returns more than single so selecting multi\n", 151 | "\n", 152 | "# Test multi_wrap_tag\n", 153 | "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, wrap_tag='document')\n", 154 | "assert '' in listings and '' in listings " 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "20188898", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "'[My experience learning GPU programming, and implementing a new GPU education app in the 
#| export
def get_llmstxt(url, optional=False, n_workers=None):
    "Fetch an llms.txt file from `url` and expand it with `llms_txt.create_ctx()`; `None` if unavailable."
    if not url.endswith('llms.txt'): return None
    import llms_txt
    response = get(url)
    if response.status_code == 200:
        return llms_txt.create_ctx(response.text, optional=optional, n_workers=n_workers)
    return None
"outputs": [], 236 | "source": [ 237 | "#| export\n", 238 | "def split_url(url):\n", 239 | " \"Split `url` into base, path, and file name, normalising name to '/' if empty\"\n", 240 | " parsed = urlparse(url.strip('/'))\n", 241 | " base = f\"{parsed.scheme}://{parsed.netloc}\"\n", 242 | " path,spl,fname = parsed.path.rpartition('/')\n", 243 | " fname = spl+fname\n", 244 | " if not path and not fname: path='/'\n", 245 | " return base,path,fname" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "1a92b74e", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "[('https://claudette.answer.ai', '', '/path'),\n", 258 | " ('https://claudette.answer.ai', '/', ''),\n", 259 | " ('https://llmstxt.org', '/', ''),\n", 260 | " ('https://llmstxt.org', '/', '')]" 261 | ] 262 | }, 263 | "execution_count": null, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "urls = ('https://claudette.answer.ai/path/', 'https://claudette.answer.ai/', 'https://llmstxt.org', 'https://llmstxt.org/')\n", 270 | "\n", 271 | "[split_url(o) for o in urls]" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "id": "5337c0a2", 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "#| export\n", 282 | "def _tryget(url):\n", 283 | " \"Return response from `url` if `status_code!=404`, otherwise `None`\"\n", 284 | " res = get(url)\n", 285 | " return None if res.status_code==404 else url" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "id": "189f5b24", 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "#| export\n", 296 | "def find_docs(url):\n", 297 | " \"If available, return LLM-friendly llms.txt context or markdown file location from `url`\"\n", 298 | " base,path,fname = split_url(url)\n", 299 | " url = (base+path+fname).strip('/')\n", 300 | " if fname=='/llms.txt': 
return url\n", 301 | " if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)\n", 302 | " if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])\n", 303 | " res = _tryget(url+'/llms.txt')\n", 304 | " if res: return res\n", 305 | " res = _tryget(url+'/index.md')\n", 306 | " if res: return res\n", 307 | " res = _tryget(url+'/index.html.md')\n", 308 | " if res: return res\n", 309 | " res = _tryget(url+'/index-commonmark.md')\n", 310 | " if res: return res\n", 311 | " parsed_url = urlparse(url)\n", 312 | " if parsed_url.path == '/' or not parsed_url.path: return None\n", 313 | " return find_docs(urljoin(url, '..'))" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "5d1722d9", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "fl_url = 'https://answerdotai.github.io/fastlite'" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "id": "0b226407", 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "'https://answerdotai.github.io/fastlite/llms.txt'" 336 | ] 337 | }, 338 | "execution_count": null, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "find_docs(fl_url)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "14344890", 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "https://claudette.answer.ai/llms.txt\n", 358 | "https://claudette.answer.ai/llms.txt\n", 359 | "https://llmstxt.org/llms.txt\n", 360 | "https://llmstxt.org/llms.txt\n" 361 | ] 362 | } 363 | ], 364 | "source": [ 365 | "for o in urls: print(find_docs(o))" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "439546d4", 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "#| eval: false\n", 376 | 
"suffixes = [\"/\", \"/tmp\", \"/tmp/tmp/\"]\n", 377 | "for suff in suffixes:\n", 378 | " for o in urls: test_eq(find_docs(o), find_docs(o+suff))\n", 379 | "\n", 380 | "test_eq(find_docs(\"https://github.com\"), \"https://github.com/llms.txt\")\n", 381 | "test_eq(find_docs(\"https://github.com/AnswerDotAI\"), \"https://github.com/llms.txt\")\n", 382 | "test_eq(find_docs(\"https://github.com/AnswerDotAI/\"), \"https://github.com/llms.txt\")" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "id": "771d1208", 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "#| export\n", 393 | "def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):\n", 394 | " \"If available, return LLM-friendly llms.txt context or markdown file response for `url`\"\n", 395 | " url = find_docs(url)\n", 396 | " if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)\n", 397 | " else: res = get(url).text\n", 398 | " return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "id": "94ec4289", 404 | "metadata": {}, 405 | "source": [ 406 | "## Export -" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "id": "1e9ee5c1", 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "#|hide\n", 417 | "#|eval: false\n", 418 | "from nbdev.doclinks import nbdev_export\n", 419 | "nbdev_export()" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "0c01784b", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [] 429 | } 430 | ], 431 | "metadata": { 432 | "kernelspec": { 433 | "display_name": "python3", 434 | "language": "python", 435 | "name": "python3" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 5 440 | } 441 | -------------------------------------------------------------------------------- 
/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Release notes 2 | 3 | 4 | 5 | ## 0.2.1 6 | 7 | ### New Features 8 | 9 | - Optionally dont raise error on `call_func` ([#31](https://github.com/AnswerDotAI/toolslm/pull/31)), thanks to [@erikgaas](https://github.com/erikgaas) 10 | - dict support in `get_schema` ([#30](https://github.com/AnswerDotAI/toolslm/issues/30)) 11 | 12 | 13 | ## 0.2.0 14 | 15 | ### Breaking changes 16 | 17 | - Optional libs (http2text, beautifulsoup, llms_txt) are no longer automatically installed 18 | 19 | ### New Features 20 | 21 | - Lazily load optional modules ([#29](https://github.com/AnswerDotAI/toolslm/issues/29)) 22 | 23 | 24 | ## 0.1.3 25 | 26 | ### New Features 27 | 28 | - Pass glb,loc to python ([#28](https://github.com/AnswerDotAI/toolslm/issues/28)) 29 | 30 | ## 0.1.2 31 | 32 | ### New Features 33 | 34 | - Adds `call_func_async` ([#27](https://github.com/AnswerDotAI/toolslm/pull/27)), thanks to [@mikonapoli](https://github.com/mikonapoli) 35 | - Add arg ignore links ([#26](https://github.com/AnswerDotAI/toolslm/pull/26)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath) 36 | 37 | 38 | ## 0.1.1 39 | 40 | ### New Features 41 | 42 | - Add arg ignore links ([#26](https://github.com/AnswerDotAI/toolslm/pull/26)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath) 43 | 44 | ### Bugs Squashed 45 | 46 | - fix: prevent markdown heading detection inside code blocks ([#25](https://github.com/AnswerDotAI/toolslm/pull/25)), thanks to [@franckalbinet](https://github.com/franckalbinet) 47 | - Fix markdown hierarchy parsing for arbitrary header levels ([#22](https://github.com/AnswerDotAI/toolslm/pull/22)), thanks to [@erikgaas](https://github.com/erikgaas) 48 | 49 | 50 | ## 0.1.0 51 | 52 | ### Breaking changes 53 | 54 | - Replace `source` with `src` in context generation ([#17](https://github.com/AnswerDotAI/toolslm/issues/17)) 55 | 56 | 57 | ## 0.0.8 58 | 59 | ### New Features 60 
| 61 | - Escape and print context in `folder2ctx` et al ([#16](https://github.com/AnswerDotAI/toolslm/issues/16)) 62 | 63 | 64 | ## 0.0.7 65 | 66 | ### New Features 67 | 68 | - Add `dict2obj` to `md_hier` funcs ([#15](https://github.com/AnswerDotAI/toolslm/issues/15)) 69 | - Migrate call_func from claudette to toolslm ([#14](https://github.com/AnswerDotAI/toolslm/pull/14)), thanks to [@ncoop57](https://github.com/ncoop57) 70 | - Allow for getting schemas from nested structures ([#11](https://github.com/AnswerDotAI/toolslm/pull/11)), thanks to [@ncoop57](https://github.com/ncoop57) 71 | - Allow for `sel` to select and wrap multiple element results ([#10](https://github.com/AnswerDotAI/toolslm/pull/10)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath) 72 | 73 | ### Bugs Squashed 74 | 75 | - Using `get_schema` on class method results in type missing error ([#12](https://github.com/AnswerDotAI/toolslm/issues/12)) 76 | 77 | 78 | ## 0.0.6 79 | 80 | ### New Features 81 | 82 | - Add `read_docs` and `find_docs` ([#8](https://github.com/AnswerDotAI/toolslm/issues/8)) 83 | 84 | 85 | ## 0.0.5 86 | 87 | ### Bugs Squashed 88 | 89 | - XML tools assume all files have content ([#3](https://github.com/AnswerDotAI/toolslm/issues/3)) 90 | 91 | 92 | ## 0.0.4 93 | 94 | - Minor updates 95 | 96 | ## 0.0.2 97 | 98 | - Rename project 99 | 100 | 101 | ## 0.0.1 102 | 103 | - Initial alpha release 104 | 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # toolslm 2 | 3 | 4 | 5 | 6 | This is a work in progress… 7 | 8 | ## Install 9 | 10 | ``` sh 11 | pip install toolslm 12 | ``` 13 | 14 | ## How to use 15 | 16 | ### Context creation 17 | 18 | toolslm has some helpers to make it easier to generate XML context from 19 | files, for instance 20 | [`folder2ctx`](https://AnswerDotAI.github.io/toolslm/xml.html#folder2ctx): 21 | 22 | ``` python 23 | print(folder2ctx('samples', prefix=False, file_glob='*.py')) 24 | ``` 25 | 26 | 27 | samples/sample_core.py 28 | 29 | import inspect 30 | empty = inspect.Parameter.empty 31 | models = 
'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307' 32 | 33 | 34 | JSON doesn’t map as nicely to XML as the `ft` data structure from 35 | `fastcore.xml`, but for simple XML trees it can be convenient. The 36 | [`json_to_xml`](https://AnswerDotAI.github.io/toolslm/xml.html#json_to_xml) 37 | function handles that conversion: 38 | 39 | ``` python 40 | a = dict(surname='Howard', firstnames=['Jeremy','Peter'], 41 | address=dict(state='Queensland',country='Australia')) 42 | print(json_to_xml(a, 'person')) 43 | ``` 44 | 45 | 46 | Howard 47 | 48 | Jeremy 49 | Peter 50 | 51 |
52 | Queensland 53 | Australia 54 |
55 |
56 | -------------------------------------------------------------------------------- /_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | format: 5 | html: 6 | theme: cosmo 7 | css: styles.css 8 | toc: true 9 | keep-md: true 10 | commonmark: default 11 | 12 | website: 13 | twitter-card: true 14 | open-graph: true 15 | repo-actions: [issue] 16 | navbar: 17 | background: primary 18 | search: true 19 | sidebar: 20 | style: floating 21 | 22 | metadata-files: [nbdev.yml, sidebar.yml] 23 | -------------------------------------------------------------------------------- /index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "56e2fbc1", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#| hide\n", 11 | "from toolslm.xml import *" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "9c85d17d", 17 | "metadata": {}, 18 | "source": [ 19 | "# toolslm\n", 20 | "\n", 21 | "> Tools to make language models a bit easier to use" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "947109d0", 27 | "metadata": {}, 28 | "source": [ 29 | "This is a work in progress..." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "431900fc", 35 | "metadata": {}, 36 | "source": [ 37 | "## Install" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "6cf13202", 43 | "metadata": {}, 44 | "source": [ 45 | "```sh\n", 46 | "pip install toolslm\n", 47 | "```" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "id": "36346546", 53 | "metadata": {}, 54 | "source": [ 55 | "## How to use" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "id": "2a8a7a9a", 61 | "metadata": {}, 62 | "source": [ 63 | "### Context creation" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "3778e8ed", 69 | "metadata": {}, 70 | "source": [ 71 | "toolslm has some helpers to make it easier to generate XML context from files, for instance `folder2ctx`:" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "efd52392", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "\n", 85 | "samples/sample_core.py\n", 86 | "\n", 87 | "import inspect\n", 88 | "empty = inspect.Parameter.empty\n", 89 | "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "print(folder2ctx('samples', prefix=False, file_glob='*.py'))" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "id": "58206da8", 101 | "metadata": {}, 102 | "source": [ 103 | "JSON doesn't map as nicely to XML as the `ft` data structure from `fastcore.xml`, but for simple XML trees it can be convenient. The `json_to_xml` function handles that conversion:" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "9bcb985e", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "\n", 117 | " Howard\n", 118 | " \n", 119 | " Jeremy\n", 120 | " Peter\n", 121 | " \n", 122 | "
\n", 123 | " Queensland\n", 124 | " Australia\n", 125 | "
\n", 126 | "
\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "a = dict(surname='Howard', firstnames=['Jeremy','Peter'],\n", 132 | " address=dict(state='Queensland',country='Australia'))\n", 133 | "print(json_to_xml(a, 'person'))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "7a3b2c28", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "python3", 148 | "language": "python", 149 | "name": "python3" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 5 154 | } 155 | -------------------------------------------------------------------------------- /nbdev.yml: -------------------------------------------------------------------------------- 1 | project: 2 | output-dir: _docs 3 | 4 | website: 5 | title: "toolslm" 6 | site-url: "https://AnswerDotAI.github.io/toolslm" 7 | description: "Tools to make language models a bit easier to use" 8 | repo-branch: main 9 | repo-url: "https://github.com/AnswerDotAI/toolslm" 10 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name="toolslm" 7 | requires-python=">=3.9" 8 | dynamic = [ "keywords", "description", "version", "dependencies", "optional-dependencies", "readme", "license", "authors", "classifiers", "entry-points", "scripts", "urls"] 9 | 10 | [tool.uv] 11 | cache-keys = [{ file = "pyproject.toml" }, { file = "settings.ini" }, { file = "setup.py" }] 12 | -------------------------------------------------------------------------------- /samples/sample_core.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | empty = inspect.Parameter.empty 3 | models = 
'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307' 4 | -------------------------------------------------------------------------------- /samples/sample_styles.css: -------------------------------------------------------------------------------- 1 | .cell { margin-bottom: 1rem; } 2 | .cell > .sourceCode { margin-bottom: 0; } 3 | .cell-output > pre { margin-bottom: 0; } 4 | 5 | -------------------------------------------------------------------------------- /settings.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | repo = toolslm 3 | lib_name = toolslm 4 | version = 0.2.2 5 | min_python = 3.9 6 | license = apache2 7 | black_formatting = False 8 | requirements = fastcore>=1.5.47 httpx 9 | doc_path = _docs 10 | lib_path = toolslm 11 | nbs_path = . 12 | recursive = True 13 | tst_flags = notest 14 | put_version_in_init = True 15 | branch = main 16 | custom_sidebar = False 17 | doc_host = https://AnswerDotAI.github.io 18 | doc_baseurl = /toolslm 19 | git_url = https://github.com/AnswerDotAI/toolslm 20 | title = toolslm 21 | audience = Developers 22 | author = Jeremy Howard 23 | author_email = j@fast.ai 24 | copyright = 2024 onwards, Jeremy Howard 25 | description = Tools to make language models a bit easier to use 26 | keywords = nbdev jupyter notebook python 27 | language = English 28 | status = 3 29 | user = AnswerDotAI 30 | readme_nb = index.ipynb 31 | allowed_metadata_keys = 32 | allowed_cell_metadata_keys = 33 | jupyter_hooks = True 34 | clean_ids = True 35 | clear_all = False 36 | conda_user = fastai 37 | console_scripts = folder2ctx=toolslm.xml:folder2ctx_cli 38 | cell_number = True 39 | skip_procs = 40 | update_pyproject = True 41 | 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import 
ConfigParser 3 | import setuptools, shlex 4 | assert parse_version(setuptools.__version__)>=parse_version('36.2') 5 | 6 | # note: all settings are in settings.ini; edit there, not here 7 | config = ConfigParser(delimiters=['=']) 8 | config.read('settings.ini', encoding='utf-8') 9 | cfg = config['DEFAULT'] 10 | 11 | cfg_keys = 'version description keywords author author_email'.split() 12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split() 13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o) 14 | setup_cfg = {o:cfg[o] for o in cfg_keys} 15 | 16 | licenses = { 17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 18 | 'mit': ('MIT License', 'OSI Approved :: MIT License'), 19 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), 20 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), 21 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), 22 | } 23 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', 24 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] 25 | py_versions = '3.6 3.7 3.8 3.9 3.10'.split() 26 | 27 | requirements = shlex.split(cfg.get('requirements', '')) 28 | if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', '')) 29 | min_python = cfg['min_python'] 30 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) 31 | dev_requirements = (cfg.get('dev_requirements') or '').split() 32 | 33 | setuptools.setup( 34 | name = cfg['lib_name'], 35 | license = lic[0], 36 | classifiers = [ 37 | 'Development Status :: ' + statuses[int(cfg['status'])], 38 | 'Intended Audience :: ' + cfg['audience'].title(), 39 | 'Natural Language :: ' + cfg['language'].title(), 40 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] 
] if lic[1] else []), 41 | url = cfg['git_url'], 42 | packages = setuptools.find_packages(), 43 | include_package_data = True, 44 | install_requires = requirements, 45 | extras_require={ 'dev': dev_requirements }, 46 | dependency_links = cfg.get('dep_links','').split(), 47 | python_requires = '>=' + cfg['min_python'], 48 | long_description = open('README.md', encoding='utf-8').read(), 49 | long_description_content_type = 'text/markdown', 50 | zip_safe = False, 51 | entry_points = { 52 | 'console_scripts': cfg.get('console_scripts','').split(), 53 | 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'] 54 | }, 55 | **setup_cfg) 56 | 57 | 58 | -------------------------------------------------------------------------------- /styles.css: -------------------------------------------------------------------------------- 1 | .cell { 2 | margin-bottom: 1rem; 3 | } 4 | 5 | .cell > .sourceCode { 6 | margin-bottom: 0; 7 | } 8 | 9 | .cell-output > pre { 10 | margin-bottom: 0; 11 | } 12 | 13 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { 14 | margin-left: 0.8rem; 15 | margin-top: 0; 16 | background: none; 17 | border-left: 2px solid lightsalmon; 18 | border-top-left-radius: 0; 19 | border-top-right-radius: 0; 20 | } 21 | 22 | .cell-output > .sourceCode { 23 | border: none; 24 | } 25 | 26 | .cell-output > .sourceCode { 27 | background: none; 28 | margin-top: 0; 29 | } 30 | 31 | div.description { 32 | padding-left: 2px; 33 | padding-top: 5px; 34 | font-style: italic; 35 | font-size: 135%; 36 | opacity: 70%; 37 | } 38 | -------------------------------------------------------------------------------- /toolslm/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.2" 2 | -------------------------------------------------------------------------------- /toolslm/_modidx.py: -------------------------------------------------------------------------------- 1 | # Autogenerated by 
nbdev 2 | 3 | d = { 'settings': { 'branch': 'main', 4 | 'doc_baseurl': '/toolslm', 5 | 'doc_host': 'https://AnswerDotAI.github.io', 6 | 'git_url': 'https://github.com/AnswerDotAI/toolslm', 7 | 'lib_path': 'toolslm'}, 8 | 'syms': { 'toolslm.download': { 'toolslm.download._tryget': ('download.html#_tryget', 'toolslm/download.py'), 9 | 'toolslm.download.clean_md': ('download.html#clean_md', 'toolslm/download.py'), 10 | 'toolslm.download.find_docs': ('download.html#find_docs', 'toolslm/download.py'), 11 | 'toolslm.download.get_llmstxt': ('download.html#get_llmstxt', 'toolslm/download.py'), 12 | 'toolslm.download.html2md': ('download.html#html2md', 'toolslm/download.py'), 13 | 'toolslm.download.read_docs': ('download.html#read_docs', 'toolslm/download.py'), 14 | 'toolslm.download.read_html': ('download.html#read_html', 'toolslm/download.py'), 15 | 'toolslm.download.read_md': ('download.html#read_md', 'toolslm/download.py'), 16 | 'toolslm.download.split_url': ('download.html#split_url', 'toolslm/download.py')}, 17 | 'toolslm.funccall': { 'toolslm.funccall.PathArg': ('funccall.html#patharg', 'toolslm/funccall.py'), 18 | 'toolslm.funccall._copy_loc': ('funccall.html#_copy_loc', 'toolslm/funccall.py'), 19 | 'toolslm.funccall._get_nested_schema': ('funccall.html#_get_nested_schema', 'toolslm/funccall.py'), 20 | 'toolslm.funccall._handle_container': ('funccall.html#_handle_container', 'toolslm/funccall.py'), 21 | 'toolslm.funccall._handle_type': ('funccall.html#_handle_type', 'toolslm/funccall.py'), 22 | 'toolslm.funccall._is_container': ('funccall.html#_is_container', 'toolslm/funccall.py'), 23 | 'toolslm.funccall._is_parameterized': ('funccall.html#_is_parameterized', 'toolslm/funccall.py'), 24 | 'toolslm.funccall._param': ('funccall.html#_param', 'toolslm/funccall.py'), 25 | 'toolslm.funccall._process_property': ('funccall.html#_process_property', 'toolslm/funccall.py'), 26 | 'toolslm.funccall._run': ('funccall.html#_run', 'toolslm/funccall.py'), 27 | 
'toolslm.funccall._types': ('funccall.html#_types', 'toolslm/funccall.py'), 28 | 'toolslm.funccall.call_func': ('funccall.html#call_func', 'toolslm/funccall.py'), 29 | 'toolslm.funccall.call_func_async': ('funccall.html#call_func_async', 'toolslm/funccall.py'), 30 | 'toolslm.funccall.get_schema': ('funccall.html#get_schema', 'toolslm/funccall.py'), 31 | 'toolslm.funccall.mk_ns': ('funccall.html#mk_ns', 'toolslm/funccall.py'), 32 | 'toolslm.funccall.python': ('funccall.html#python', 'toolslm/funccall.py')}, 33 | 'toolslm.md_hier': {}, 34 | 'toolslm.shell': { 'toolslm.shell.TerminalInteractiveShell.run_cell': ( 'shell.html#terminalinteractiveshell.run_cell', 35 | 'toolslm/shell.py'), 36 | 'toolslm.shell.get_shell': ('shell.html#get_shell', 'toolslm/shell.py')}, 37 | 'toolslm.xml': { 'toolslm.xml._add_nls': ('xml.html#_add_nls', 'toolslm/xml.py'), 38 | 'toolslm.xml.docs_xml': ('xml.html#docs_xml', 'toolslm/xml.py'), 39 | 'toolslm.xml.files2ctx': ('xml.html#files2ctx', 'toolslm/xml.py'), 40 | 'toolslm.xml.folder2ctx': ('xml.html#folder2ctx', 'toolslm/xml.py'), 41 | 'toolslm.xml.folder2ctx_cli': ('xml.html#folder2ctx_cli', 'toolslm/xml.py'), 42 | 'toolslm.xml.json_to_xml': ('xml.html#json_to_xml', 'toolslm/xml.py'), 43 | 'toolslm.xml.mk_doc': ('xml.html#mk_doc', 'toolslm/xml.py'), 44 | 'toolslm.xml.mk_doctype': ('xml.html#mk_doctype', 'toolslm/xml.py')}}} 45 | -------------------------------------------------------------------------------- /toolslm/download.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../03_download.ipynb. 

# %% auto 0
__all__ = ['clean_md', 'read_md', 'html2md', 'read_html', 'get_llmstxt', 'split_url', 'find_docs', 'read_docs']

# %% ../03_download.ipynb 2
from fastcore.utils import *
from httpx import get
from fastcore.meta import delegates
from urllib.parse import urlparse, urljoin

# %% ../03_download.ipynb 4
def clean_md(text, rm_comments=True, rm_details=True):
    "Remove HTML comments and `<details>` sections from `text`"
    # NOTE(review): the regex literals were garbled in transit (HTML tags eaten);
    # reconstructed from the documented intent -- confirm against the source notebook.
    if rm_comments: text = re.sub(r'\n?<!--.*?-->\n?', '', text, flags=re.DOTALL)
    if rm_details: text = re.sub(r'\n?<details>.*?</details>\n?', '', text, flags=re.DOTALL)
    return text

# %% ../03_download.ipynb 5
@delegates(get)
def read_md(url, rm_comments=True, rm_details=True, **kwargs):
    "Read text from `url` and clean with `clean_md`"
    return clean_md(get(url, **kwargs).text, rm_comments=rm_comments, rm_details=rm_details)

# %% ../03_download.ipynb 7
def html2md(s:str, ignore_links=True):
    "Convert `s` from HTML to markdown"
    import html2text
    o = html2text.HTML2Text(bodywidth=5000)  # very wide body to avoid mid-sentence wrapping
    o.ignore_links = ignore_links
    o.mark_code = True
    o.ignore_images = True
    return o.handle(s)

# %% ../03_download.ipynb 8
def read_html(url, # URL to read
              sel=None, # Read only outerHTML of CSS selector `sel`
              rm_comments=True, # Removes HTML comments
              rm_details=True, # Removes `<details>` tags
              multi=False, # Get all matches to `sel` or first one
              wrap_tag=None, # If multi, each selection is wrapped with this tag
              ignore_links=True, # Drop hyperlinks from the markdown output?
             ): # Cleaned markdown
    "Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown"
    page = get(url).text
    if sel:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(page, 'html.parser')
        if multi:
            page = [str(el) for el in soup.select(sel)]
            # without a wrapper tag, matches are concatenated into one document
            if not wrap_tag: page = "\n".join(page)
        else: page = str(soup.select_one(sel))
    mds = map(lambda x: clean_md(html2md(x, ignore_links=ignore_links), rm_comments, rm_details=rm_details), tuplify(page))
    # NOTE(review): closing tag was eaten in transit; reconstructed -- confirm against notebook
    if wrap_tag: return '\n'.join([f"<{wrap_tag}>\n{o}\n</{wrap_tag}>" for o in mds])
    else: return '\n'.join(mds)

# %% ../03_download.ipynb 13
def get_llmstxt(url, optional=False, n_workers=None):
    "Get llms.txt file from `url` and expand it with `llms_txt.create_ctx()`"
    if not url.endswith('llms.txt'): return None
    import llms_txt
    resp = get(url)
    if resp.status_code!=200: return None
    return llms_txt.create_ctx(resp.text, optional=optional, n_workers=n_workers)

# %% ../03_download.ipynb 15
def split_url(url):
    "Split `url` into base, path, and file name, normalising path to '/' if empty"
    parsed = urlparse(url.strip('/'))
    base = f"{parsed.scheme}://{parsed.netloc}"
    path,spl,fname = parsed.path.rpartition('/')
    fname = spl+fname  # keep the leading '/' on the file-name part
    if not path and not fname: path='/'
    return base,path,fname

# %% ../03_download.ipynb 17
def _tryget(url):
    "Return `url` if GETting it does not 404, otherwise `None`"
    res = get(url)
    return None if res.status_code==404 else url

# %% ../03_download.ipynb 18
def find_docs(url):
    "If available, return LLM-friendly llms.txt context or markdown file location from `url`"
    base,path,fname = split_url(url)
    url = (base+path+fname).strip('/')
    if fname=='/llms.txt': return url
    if Path(fname).suffix in ('.md', '.txt', '.rst'): return _tryget(url)
    if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])
    # try the conventional doc locations in order of preference
    for suf in ('/llms.txt', '/index.md', '/index.html.md', '/index-commonmark.md'):
        res = _tryget(url+suf)
        if res: return res
    parsed_url = urlparse(url)
    # nothing found and we've reached the site root: give up
    if parsed_url.path == '/' or not parsed_url.path: return None
    return find_docs(urljoin(url, '..'))

# %% ../03_download.ipynb 23
def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
    "If available, return LLM-friendly llms.txt context or markdown file response for `url`"
    url = find_docs(url)
    # was: `None.endswith` AttributeError when no docs were found
    if url is None: return None
    if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)
    else: res = get(url).text
    return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)

# AUTOGENERATED! DO NOT EDIT! File to edit: ../01_funccall.ipynb.

# %% auto 0
__all__ = ['empty', 'custom_types', 'get_schema', 'PathArg', 'python', 'mk_ns', 'call_func', 'call_func_async']

# %% ../01_funccall.ipynb 2
import inspect
from collections import abc
from fastcore.utils import *
from fastcore.docments import docments
from typing import get_origin, get_args, Dict, List, Optional, Tuple, Union
from types import UnionType

# %% ../01_funccall.ipynb 4
empty = inspect.Parameter.empty

# %% ../01_funccall.ipynb 12
def _types(t:type)->tuple[str,Optional[str]]:
    "Tuple of json schema type name and (if appropriate) array item name."
20 | if t is empty: raise TypeError('Missing type') 21 | tmap = {int:"integer", float:"number", str:"string", bool:"boolean", list:"array", dict:"object"} 22 | tmap.update({k.__name__: v for k, v in tmap.items()}) 23 | if getattr(t, '__origin__', None) in (list,tuple): 24 | args = getattr(t, '__args__', None) 25 | item_type = "object" if not args else tmap.get(t.__args__[0].__name__, "object") 26 | return "array", item_type 27 | # if t is a string like 'int', directly use the string as the key 28 | elif isinstance(t, str): return tmap.get(t, "object"), None 29 | # if t is the type itself and a container 30 | elif get_origin(t): return tmap.get(get_origin(t).__name__, "object"), None 31 | # if t is the type itself like int, use the __name__ representation as the key 32 | else: return tmap.get(t.__name__, "object"), None 33 | 34 | # %% ../01_funccall.ipynb 19 35 | def _param(name, info): 36 | "json schema parameter given `name` and `info` from docments full dict." 37 | paramt,itemt = _types(info.anno) 38 | pschema = dict(type=paramt, description=info.docment or "") 39 | if itemt: pschema["items"] = {"type": itemt} 40 | if info.default is not empty: pschema["default"] = info.default 41 | return pschema 42 | 43 | # %% ../01_funccall.ipynb 22 44 | custom_types = {Path} 45 | 46 | def _handle_type(t, defs): 47 | "Handle a single type, creating nested schemas if necessary" 48 | if t is NoneType: return {'type': 'null'} 49 | if t in custom_types: return {'type':'string', 'format':t.__name__} 50 | if isinstance(t, type) and not issubclass(t, (int, float, str, bool)) or inspect.isfunction(t): 51 | defs[t.__name__] = _get_nested_schema(t) 52 | return {'$ref': f'#/$defs/{t.__name__}'} 53 | return {'type': _types(t)[0]} 54 | 55 | # %% ../01_funccall.ipynb 24 56 | def _is_container(t): 57 | "Check if type is a container (list, dict, tuple, set, Union)" 58 | origin = get_origin(t) 59 | return origin in (list, dict, tuple, set, Union) if origin else False 60 | 61 | def 
_is_parameterized(t): 62 | "Check if type has arguments (e.g. list[int] vs list, dict[str, int] vs dict)" 63 | return _is_container(t) and (get_args(t) != ()) 64 | 65 | # %% ../01_funccall.ipynb 30 66 | def _handle_container(origin, args, defs): 67 | "Handle container types like dict, list, tuple, set, and Union" 68 | if origin is Union or origin is UnionType: 69 | return {"anyOf": [_handle_type(arg, defs) for arg in args]} 70 | if origin is dict: 71 | value_type = args[1].__args__[0] if hasattr(args[1], '__args__') else args[1] 72 | return { 73 | 'type': 'object', 74 | 'additionalProperties': ( 75 | {'type': 'array', 'items': _handle_type(value_type, defs)} 76 | if hasattr(args[1], '__origin__') else _handle_type(args[1], defs) 77 | ) 78 | } 79 | elif origin in (list, tuple, set): 80 | schema = {'type': 'array', 'items': _handle_type(args[0], defs)} 81 | if origin is set: 82 | schema['uniqueItems'] = True 83 | return schema 84 | return None 85 | 86 | # %% ../01_funccall.ipynb 31 87 | def _process_property(name, obj, props, req, defs): 88 | "Process a single property of the schema" 89 | p = _param(name, obj) 90 | props[name] = p 91 | if obj.default is empty: req[name] = True 92 | 93 | if _is_container(obj.anno) and _is_parameterized(obj.anno): 94 | p.update(_handle_container(get_origin(obj.anno), get_args(obj.anno), defs)) 95 | else: 96 | # Non-container type or container without arguments 97 | p.update(_handle_type(obj.anno, defs)) 98 | 99 | # %% ../01_funccall.ipynb 32 100 | def _get_nested_schema(obj): 101 | "Generate nested JSON schema for a class or function" 102 | d = docments(obj, full=True) 103 | props, req, defs = {}, {}, {} 104 | 105 | for n, o in d.items(): 106 | if n != 'return' and n != 'self': 107 | _process_property(n, o, props, req, defs) 108 | 109 | schema = dict(type='object', properties=props, title=obj.__name__ if isinstance(obj, type) else None) 110 | if req: schema['required'] = list(req) 111 | if defs: schema['$defs'] = defs 112 | return 
schema 113 | 114 | # %% ../01_funccall.ipynb 36 115 | def get_schema(f:Union[callable,dict], pname='input_schema')->dict: 116 | "Generate JSON schema for a class, function, or method" 117 | if isinstance(f, dict): return f 118 | schema = _get_nested_schema(f) 119 | desc = f.__doc__ 120 | assert desc, "Docstring missing!" 121 | d = docments(f, full=True) 122 | ret = d.pop('return') 123 | if ret.anno is not empty: desc += f'\n\nReturns:\n- type: {_types(ret.anno)[0]}' 124 | return {"name": f.__name__, "description": desc, pname: schema} 125 | 126 | # %% ../01_funccall.ipynb 47 127 | def PathArg( 128 | path: str # A filesystem path 129 | ): return Path(path) 130 | 131 | # %% ../01_funccall.ipynb 67 132 | import ast, time, signal, traceback 133 | from fastcore.utils import * 134 | 135 | # %% ../01_funccall.ipynb 68 136 | def _copy_loc(new, orig): 137 | "Copy location information from original node to new node and all children." 138 | new = ast.copy_location(new, orig) 139 | for field, o in ast.iter_fields(new): 140 | if isinstance(o, ast.AST): setattr(new, field, _copy_loc(o, orig)) 141 | elif isinstance(o, list): setattr(new, field, [_copy_loc(value, orig) for value in o]) 142 | return new 143 | 144 | # %% ../01_funccall.ipynb 70 145 | def _run(code:str, glb:dict=None, loc:dict=None): 146 | "Run `code`, returning final expression (similar to IPython)" 147 | tree = ast.parse(code) 148 | last_node = tree.body[-1] if tree.body else None 149 | 150 | # If the last node is an expression, modify the AST to capture the result 151 | if isinstance(last_node, ast.Expr): 152 | tgt = [ast.Name(id='_result', ctx=ast.Store())] 153 | assign_node = ast.Assign(targets=tgt, value=last_node.value) 154 | tree.body[-1] = _copy_loc(assign_node, last_node) 155 | 156 | compiled_code = compile(tree, filename='', mode='exec') 157 | glb = glb or {} 158 | stdout_buffer = io.StringIO() 159 | saved_stdout = sys.stdout 160 | sys.stdout = stdout_buffer 161 | try: exec(compiled_code, glb, loc) 162 | 
finally: sys.stdout = saved_stdout 163 | _result = glb.get('_result', None) 164 | if _result is not None: return _result 165 | return stdout_buffer.getvalue().strip() 166 | 167 | # %% ../01_funccall.ipynb 75 168 | def python(code:str, # Code to execute 169 | glb:Optional[dict]=None, # Globals namespace 170 | loc:Optional[dict]=None, # Locals namespace 171 | timeout:int=3600 # Maximum run time in seconds before a `TimeoutError` is raised 172 | ): # Result of last node, if it's an expression, or `None` otherwise 173 | """Executes python `code` with `timeout` and returning final expression (similar to IPython). 174 | Raised exceptions are returned as a string, with a stack trace.""" 175 | def handler(*args): raise TimeoutError() 176 | if glb is None: glb = inspect.currentframe().f_back.f_globals 177 | if loc is None: loc=glb 178 | signal.signal(signal.SIGALRM, handler) 179 | signal.alarm(timeout) 180 | try: return _run(code, glb, loc) 181 | except Exception as e: return traceback.format_exc() 182 | finally: signal.alarm(0) 183 | 184 | # %% ../01_funccall.ipynb 86 185 | def mk_ns(*funcs_or_objs): 186 | merged = {} 187 | for o in funcs_or_objs: 188 | if isinstance(o, type): merged |= {n:getattr(o,n) for n,m in o.__dict__.items() if isinstance(m, (staticmethod, classmethod))} 189 | if isinstance(o, object): merged |= {n:getattr(o,n) for n, m in inspect.getmembers(o, inspect.ismethod)} | {n:m for n,m in o.__class__.__dict__.items() if isinstance(m, staticmethod)} 190 | if callable(o) and hasattr(o, '__name__'): merged |= {o.__name__: o} 191 | return merged 192 | 193 | # %% ../01_funccall.ipynb 95 194 | def call_func(fc_name, fc_inputs, ns, raise_on_err=True): 195 | "Call the function `fc_name` with the given `fc_inputs` using namespace `ns`." 
196 | if not isinstance(ns, abc.Mapping): ns = mk_ns(*ns) 197 | func = ns[fc_name] 198 | try: return func(**fc_inputs) 199 | except Exception as e: 200 | if raise_on_err: raise e 201 | else: return traceback.format_exc() 202 | 203 | # %% ../01_funccall.ipynb 106 204 | async def call_func_async(fc_name, fc_inputs, ns, raise_on_err=True): 205 | "Awaits the function `fc_name` with the given `fc_inputs` using namespace `ns`." 206 | res = call_func(fc_name, fc_inputs, ns, raise_on_err=raise_on_err) 207 | if inspect.iscoroutine(res): 208 | try: res = await res 209 | except Exception as e: 210 | if raise_on_err: raise e 211 | else: return traceback.format_exc() 212 | return res 213 | -------------------------------------------------------------------------------- /toolslm/md_hier.py: -------------------------------------------------------------------------------- 1 | import re 2 | from fastcore.utils import * 3 | __all__ = ['markdown_to_dict', 'create_heading_dict'] 4 | 5 | def markdown_to_dict(markdown_content): 6 | def clean_heading(text): return re.sub(r'[^A-Za-z0-9 ]+', '', text).strip() 7 | 8 | lines = markdown_content.splitlines() 9 | headings = [] 10 | in_code_block = False 11 | 12 | # Parse headings with their levels and line numbers 13 | for idx, line in enumerate(lines): 14 | # Toggle code block state when encountering fence 15 | if line.strip().startswith('```'): in_code_block = not in_code_block 16 | 17 | # Only detect headings when not in a code block 18 | if in_code_block: continue 19 | match = re.match(r'^(#{1,6})\s*(.*)', line) 20 | if match: 21 | level = len(match.group(1)) 22 | text = match.group(2).strip() 23 | headings.append({'level': level, 'text': text, 'line': idx}) 24 | 25 | # Assign content to each heading, including subheadings 26 | for i, h in enumerate(headings): 27 | start = h['line'] # Include the heading line itself 28 | # Find the end index: next heading of same or higher level 29 | for j in range(i + 1, len(headings)): 30 | if 
headings[j]['level'] <= h['level']: 31 | end = headings[j]['line'] 32 | break 33 | else: end = len(lines) 34 | h['content'] = '\n'.join(lines[start:end]).strip() 35 | 36 | # Build the dictionary with hierarchical keys 37 | result,stack = {},[] 38 | first_level = headings[0]['level'] 39 | for h in headings: 40 | stack = stack[:h['level'] - first_level] + [clean_heading(h['text'])] 41 | key = '.'.join(stack) 42 | result[key] = h['content'] 43 | return dict2obj(result) 44 | 45 | def create_heading_dict(text): 46 | text = re.sub(r'```[\s\S]*?```', '', text) 47 | headings = re.findall(r'^#+.*', text, flags=re.MULTILINE) 48 | result = {} 49 | stack = [result] 50 | prev_level = 0 51 | 52 | for heading in headings: 53 | level = heading.count('#') 54 | title = heading.strip('#').strip() 55 | while level <= prev_level: 56 | stack.pop() 57 | prev_level -= 1 58 | new_dict = {} 59 | stack[-1][title] = new_dict 60 | stack.append(new_dict) 61 | prev_level = level 62 | return dict2obj(result) 63 | 64 | 65 | if __name__=='__main__': 66 | md_content = """ 67 | # User 68 | 69 | This is the User section. 70 | 71 | ## Tokens 72 | 73 | Details about tokens. 74 | 75 | ### Value 76 | 77 | The value of tokens. 78 | 79 | Some more details. 80 | 81 | ## Settings 82 | 83 | User settings information. 84 | 85 | # Admin 86 | 87 | Admin section. 88 | 89 | ## Users 90 | 91 | Admin users management. 92 | """ 93 | 94 | result = markdown_to_dict(md_content) 95 | #for key, value in result.items(): print(f'Key: {key}\nValue:\n{value}\n{"-"*40}') 96 | 97 | def test_empty_content(): 98 | md_content = "# Empty Heading" 99 | result = markdown_to_dict(md_content) 100 | assert result['Empty Heading'] == '# Empty Heading' 101 | 102 | def test_special_characters(): 103 | md_content = "# Heading *With* Special _Characters_!\nContent under heading." 
104 | result = markdown_to_dict(md_content) 105 | assert 'Heading With Special Characters' in result 106 | assert result['Heading With Special Characters'] == '# Heading *With* Special _Characters_!\nContent under heading.' 107 | 108 | def test_duplicate_headings(): 109 | md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings." 110 | result = markdown_to_dict(md_content) 111 | assert 'Duplicate' in result 112 | assert 'Duplicate.Duplicate' in result 113 | assert 'Duplicate.Duplicate.Duplicate' in result 114 | assert result['Duplicate.Duplicate.Duplicate'] == '### Duplicate\nContent under duplicate headings.' 115 | 116 | def test_no_content(): 117 | md_content = "# No Content Heading\n## Subheading" 118 | result = markdown_to_dict(md_content) 119 | assert result['No Content Heading'] == '# No Content Heading\n## Subheading' 120 | assert result['No Content Heading.Subheading'] == '## Subheading' 121 | 122 | def test_different_levels(): 123 | md_content = "### Level 3 Heading\nContent at level 3.\n# Level 1 Heading\nContent at level 1." 124 | result = markdown_to_dict(md_content) 125 | assert 'Level 3 Heading' in result 126 | assert 'Level 1 Heading' in result 127 | assert result['Level 3 Heading'] == '### Level 3 Heading\nContent at level 3.' 128 | assert result['Level 1 Heading'] == '# Level 1 Heading\nContent at level 1.' 129 | 130 | def test_parent_includes_subheadings(): 131 | md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content." 132 | result = markdown_to_dict(md_content) 133 | assert result['Parent'] == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.' 134 | assert result['Parent.Child'] == '## Child\nChild content.\n### Grandchild\nGrandchild content.' 135 | assert result['Parent.Child.Grandchild'] == '### Grandchild\nGrandchild content.' 
136 | 137 | def test_multiple_level2_siblings(): 138 | md_content = "##Sib 1\n##Sib 2\n##Sib 3\n##Sib 4\n##Sib 5'" 139 | result = markdown_to_dict(md_content) 140 | assert 'Sib 1' in result 141 | assert 'Sib 2' in result 142 | assert 'Sib 3' in result 143 | assert 'Sib 4' in result 144 | assert 'Sib 5' in result 145 | 146 | def test_code_chunks_escaped(): 147 | md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```" 148 | result = markdown_to_dict(md_content) 149 | assert 'Code comment' not in result 150 | assert "# Code comment" in result['Parent.Child'] 151 | 152 | test_empty_content() 153 | test_special_characters() 154 | test_duplicate_headings() 155 | test_no_content() 156 | test_different_levels() 157 | test_parent_includes_subheadings() 158 | test_multiple_level2_siblings() 159 | test_code_chunks_escaped() 160 | print('tests passed') 161 | 162 | def test_nested_headings(): 163 | md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content." 164 | result = create_heading_dict(md_content) 165 | assert 'Child' in result['Parent'] 166 | assert 'Grandchild' in result['Parent']['Child'] 167 | 168 | def test_code_chunks_escaped(): 169 | md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```" 170 | result = create_heading_dict(md_content) 171 | assert 'Code comment' not in result 172 | 173 | test_nested_headings() 174 | test_code_chunks_escaped() 175 | print('tests passed') -------------------------------------------------------------------------------- /toolslm/shell.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../02_shell.ipynb. 

# %% auto 0
__all__ = ['get_shell']

# %% ../02_shell.ipynb 2
import ast, time, signal, traceback
from fastcore.utils import *

# %% ../02_shell.ipynb 4
from IPython.terminal.interactiveshell import TerminalInteractiveShell
from IPython.utils.capture import capture_output

# %% ../02_shell.ipynb 7
# keep a handle on the unwrapped run_cell so the patch below can delegate to it
TerminalInteractiveShell.orig_run = TerminalInteractiveShell.run_cell

# %% ../02_shell.ipynb 8
@patch
def run_cell(self:TerminalInteractiveShell, cell, timeout=None):
    "Wrapper for original `run_cell` which adds timeout and output capture"
    if timeout:
        def handler(*args): raise TimeoutError()
        signal.signal(signal.SIGALRM, handler)  # SIGALRM: Unix-only, main thread only
        signal.alarm(timeout)
    try:
        with capture_output() as io: result = self.orig_run(cell)
        result.stdout = io.stdout
        return result
    except TimeoutError as e:  # was `TimeoutException`, an undefined name (NameError on timeout)
        # NOTE(review): `ExecutionResult` lives on the interactiveshell *module*, not the
        # shell instance; `info=None` avoids depending on `ExecutionInfo`'s signature --
        # confirm against the installed IPython version.
        from IPython.core.interactiveshell import ExecutionResult
        result = ExecutionResult(None)
        result.error_in_exec = e
        return result  # previously fell through and returned None
    finally:
        if timeout: signal.alarm(0)  # always cancel the pending alarm

# %% ../02_shell.ipynb 9
def get_shell()->TerminalInteractiveShell:
    "Get a `TerminalInteractiveShell` with minimal functionality"
    sh = TerminalInteractiveShell()
    sh.logger.log_output = sh.history_manager.enabled = False
    dh = sh.displayhook
    # silence all display output: the shell is driven programmatically
    dh.finish_displayhook = dh.write_output_prompt = dh.start_displayhook = lambda: None
    dh.write_format_data = lambda format_dict, md_dict=None: None
    sh.logstart = sh.automagic = sh.autoindent = False
    sh.autocall = 0
    sh.system = lambda cmd: None  # disable `!cmd` shell escapes
    return sh
# AUTOGENERATED! DO NOT EDIT! File to edit: ../00_xml.ipynb.

# %% auto 0
__all__ = ['doctype', 'json_to_xml', 'mk_doctype', 'mk_doc', 'docs_xml', 'files2ctx', 'folder2ctx', 'folder2ctx_cli']

# %% ../00_xml.ipynb 3
import hashlib,xml.etree.ElementTree as ET
from collections import namedtuple

from fastcore.utils import *
from fastcore.meta import delegates
from fastcore.xtras import hl_md
from fastcore.xml import to_xml, Document, Documents, Document_content, Src
from fastcore.script import call_parse
# was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
try: from IPython import display
except ImportError: display=None

# %% ../00_xml.ipynb 4
def json_to_xml(d:dict, # JSON dictionary to convert
                rnm:str # Root name
               )->str:
    "Convert `d` to XML."
    root = ET.Element(rnm)
    def build_xml(data, parent):
        # dicts become child elements, lists become repeated <item> elements,
        # scalars become text content
        if isinstance(data, dict):
            for key, value in data.items(): build_xml(value, ET.SubElement(parent, key))
        elif isinstance(data, list):
            for item in data: build_xml(item, ET.SubElement(parent, 'item'))
        else: parent.text = str(data)
    build_xml(d, root)
    ET.indent(root)
    return ET.tostring(root, encoding='unicode')

# %% ../00_xml.ipynb 9
doctype = namedtuple('doctype', ['src', 'content'])

# %% ../00_xml.ipynb 11
def _add_nls(s):
    "Add newlines to start and end of `s` if missing"
    if not s: return s
    if s[ 0]!='\n': s = '\n'+s
    if s[-1]!='\n': s = s+'\n'
    return s

# %% ../00_xml.ipynb 16
def mk_doctype(content:str, # The document content
               src:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided
              ) -> namedtuple:
    "Create a `doctype` named tuple"
    if src is None: src = hashlib.md5(content.encode()).hexdigest()[:8]
    return doctype(_add_nls(str(src).strip()), _add_nls(content.strip()))

# %% ../00_xml.ipynb 19
def mk_doc(index:int, # The document index
           content:str, # The document content
           src:Optional[str]=None, # URL, filename, etc; defaults to `md5(content)` if not provided
           **kwargs
          ) -> tuple:
    "Create an `ft` format tuple for a single doc in Anthropic's recommended format"
    dt = mk_doctype(content, src)
    content = Document_content(NotStr(dt.content))
    src = Src(NotStr(dt.src))
    return Document(src, content, index=index, **kwargs)

# %% ../00_xml.ipynb 22
def docs_xml(docs:list[str], # The content of each document
             srcs:Optional[list]=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
             prefix:bool=True, # Include Anthropic's suggested prose intro?
             details:Optional[list]=None # Optional list of dicts with additional attrs for each doc
            )->str:
    "Create an XML string containing `docs` in Anthropic's recommended format"
    pre = 'Here are some documents for you to reference for your task:\n\n' if prefix else ''
    if srcs is None: srcs = [None]*len(docs)
    if details is None: details = [{}]*len(docs)
    # documents are 1-indexed in the generated XML
    docs = (mk_doc(i+1, d, s, **kw) for i,(d,s,kw) in enumerate(zip(docs,srcs,details)))
    return pre + to_xml(Documents(docs))

# %% ../00_xml.ipynb 29
def files2ctx(
    fnames:list[Union[str,Path]], # List of file names to add to context
    prefix:bool=True # Include Anthropic's suggested prose intro?
)->str: # XML for LM context
    "Read `fnames` and wrap their contents into a documents-XML context string"
    fnames = [Path(o) for o in fnames]
    # NOTE(review): `read_text()` uses the locale's default encoding -- consider an
    # explicit encoding='utf-8'; left unchanged here to preserve current behavior.
    contents = [o.read_text() for o in fnames]
    return docs_xml(contents, fnames, prefix=prefix)

# %% ../00_xml.ipynb 32
@delegates(globtastic)
def folder2ctx(
    folder:Union[str,Path], # Folder name containing files to add to context
    prefix:bool=True, # Include Anthropic's suggested prose intro?
    **kwargs # Passed to `globtastic`
)->str: # XML for Claude context
    "Glob `folder` (per `globtastic` args) and build an XML context from the matched files"
    fnames = globtastic(folder, **kwargs)
    return files2ctx(fnames, prefix=prefix)

# %% ../00_xml.ipynb 34
@call_parse
@delegates(folder2ctx)
def folder2ctx_cli(
    folder:str, # Folder name containing files to add to context
    **kwargs # Passed to `folder2ctx`
)->str: # XML for Claude context
    "CLI entry point: print `folder2ctx` output to stdout"
    print(folder2ctx(folder, **kwargs))