├── .github
└── workflows
│ ├── deploy.yaml
│ └── test.yaml.off
├── .gitignore
├── 00_xml.ipynb
├── 01_funccall.ipynb
├── 02_shell.ipynb
├── 03_download.ipynb
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── _quarto.yml
├── index.ipynb
├── nbdev.yml
├── pyproject.toml
├── samples
├── sample_core.py
└── sample_styles.css
├── settings.ini
├── setup.py
├── styles.css
└── toolslm
├── __init__.py
├── _modidx.py
├── download.py
├── funccall.py
├── md_hier.py
├── shell.py
└── xml.py
/.github/workflows/deploy.yaml:
--------------------------------------------------------------------------------
1 | name: Deploy to GitHub Pages
2 |
3 | permissions:
4 | contents: write
5 | pages: write
6 |
7 | on:
8 | push:
9 | branches: [ "main", "master" ]
10 | workflow_dispatch:
11 | jobs:
12 | deploy:
13 | runs-on: ubuntu-latest
14 | steps: [uses: fastai/workflows/quarto-ghp@master]
15 |
--------------------------------------------------------------------------------
/.github/workflows/test.yaml.off:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [workflow_dispatch, pull_request, push]
3 |
4 | jobs:
5 | test:
6 | runs-on: ubuntu-latest
7 | steps: [uses: fastai/workflows/nbdev-ci@master]
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .gitattributes
2 | _proc/
3 | index_files/
4 | sidebar.yml
5 | Gemfile.lock
6 | token
7 | _docs/
8 | conda/
9 | .last_checked
10 | .gitconfig
11 | *.bak
12 | *.log
13 | *~
14 | ~*
15 | _tmp*
16 | tmp*
17 | tags
18 |
19 | # Byte-compiled / optimized / DLL files
20 | __pycache__/
21 | *.py[cod]
22 | *$py.class
23 |
24 | # C extensions
25 | *.so
26 |
27 | # Distribution / packaging
28 | .Python
29 | env/
30 | build/
31 | develop-eggs/
32 | dist/
33 | downloads/
34 | eggs/
35 | .eggs/
36 | lib/
37 | lib64/
38 | parts/
39 | sdist/
40 | var/
41 | wheels/
42 | *.egg-info/
43 | .installed.cfg
44 | *.egg
45 |
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .coverage
60 | .coverage.*
61 | .cache
62 | nosetests.xml
63 | coverage.xml
64 | *.cover
65 | .hypothesis/
66 |
67 | # Translations
68 | *.mo
69 | *.pot
70 |
71 | # Django stuff:
72 | *.log
73 | local_settings.py
74 |
75 | # Flask stuff:
76 | instance/
77 | .webassets-cache
78 |
79 | # Scrapy stuff:
80 | .scrapy
81 |
82 | # Sphinx documentation
83 | docs/_build/
84 |
85 | # PyBuilder
86 | target/
87 |
88 | # Jupyter Notebook
89 | .ipynb_checkpoints
90 |
91 | # pyenv
92 | .python-version
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # dotenv
101 | .env
102 |
103 | # virtualenv
104 | .venv
105 | venv/
106 | ENV/
107 |
108 | # Spyder project settings
109 | .spyderproject
110 | .spyproject
111 |
112 | # Rope project settings
113 | .ropeproject
114 |
115 | # mkdocs documentation
116 | /site
117 |
118 | # mypy
119 | .mypy_cache/
120 |
121 | .vscode
122 | *.swp
123 |
124 | # osx generated files
125 | .DS_Store
126 | .DS_Store?
127 | .Trashes
128 | ehthumbs.db
129 | Thumbs.db
130 | .idea
131 |
132 | # pytest
133 | .pytest_cache
134 |
135 | # tools/trust-doc-nbs
136 | docs_src/.last_checked
137 |
138 | # symlinks to fastai
139 | docs_src/fastai
140 | tools/fastai
141 |
142 | # link checker
143 | checklink/cookies.txt
144 |
145 | # .gitconfig is now autogenerated
146 | .gitconfig
147 |
148 | _docs
149 |
150 | /.quarto/
151 |
--------------------------------------------------------------------------------
/00_xml.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "efe78920",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "#|default_exp xml"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "3d773712-12fe-440e-891f-36f59666dfde",
16 | "metadata": {},
17 | "source": [
18 | "# xml source"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "id": "ff6f6471-8061-4fdd-85a1-25fdc27c5cf3",
24 | "metadata": {},
25 | "source": [
26 | "## Setup"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "id": "033c76fd",
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "#| export\n",
37 | "import hashlib,xml.etree.ElementTree as ET\n",
38 | "from collections import namedtuple\n",
39 | "\n",
40 | "from fastcore.utils import *\n",
41 | "from fastcore.meta import delegates\n",
42 | "from fastcore.xtras import hl_md\n",
43 | "from fastcore.xml import to_xml, Document, Documents, Document_content, Src\n",
44 | "from fastcore.script import call_parse\n",
45 | "try: from IPython import display\n",
46 | "except: display=None"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "id": "2795f9fc",
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "#| exports\n",
57 | "def json_to_xml(d:dict, # JSON dictionary to convert\n",
58 | " rnm:str # Root name\n",
59 | " )->str:\n",
60 | " \"Convert `d` to XML.\"\n",
61 | " root = ET.Element(rnm)\n",
62 | " def build_xml(data, parent):\n",
63 | " if isinstance(data, dict):\n",
64 | " for key, value in data.items(): build_xml(value, ET.SubElement(parent, key))\n",
65 | " elif isinstance(data, list):\n",
66 | " for item in data: build_xml(item, ET.SubElement(parent, 'item'))\n",
67 | " else: parent.text = str(data)\n",
68 | " build_xml(d, root)\n",
69 | " ET.indent(root)\n",
70 | " return ET.tostring(root, encoding='unicode')"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "id": "140a35a2",
76 | "metadata": {},
77 | "source": [
78 | "JSON doesn't map as nicely to XML as the data structure used in `fastcore.xml`, but for simple XML trees it can be convenient -- for example:"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "id": "005a5be4",
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "data": {
89 | "text/markdown": [
90 | "```xml\n",
91 | "\n",
92 | " Howard\n",
93 | " \n",
94 | " - Jeremy
\n",
95 | " - Peter
\n",
96 | " \n",
97 | " \n",
98 | " Queensland\n",
99 | " Australia\n",
100 | " \n",
101 | "\n",
102 | "```"
103 | ],
104 | "text/plain": [
105 | ""
106 | ]
107 | },
108 | "execution_count": null,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "a = dict(surname='Howard', firstnames=['Jeremy','Peter'],\n",
115 | " address=dict(state='Queensland',country='Australia'))\n",
116 | "hl_md(json_to_xml(a, 'person'))"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "id": "7788c48c",
122 | "metadata": {},
123 | "source": [
124 | "## Including documents"
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "id": "479be4c9",
130 | "metadata": {},
131 | "source": [
132 | "According [to Anthropic](https://docs.anthropic.com/claude/docs/long-context-window-tips), \"*it's essential to structure your prompts in a way that clearly separates the input data from the instructions*\". They recommend using something like the following:\n",
133 | "\n",
134 | "```xml\n",
135 | "Here are some documents for you to reference for your task:\n",
136 | " \n",
137 | "\n",
138 | "\n",
139 | "\n",
140 | "(URL, file name, hash, etc)\n",
141 | "\n",
142 | "\n",
143 | "(the text content)\n",
144 | "\n",
145 | "\n",
146 | "\n",
147 | "```\n",
148 | "\n",
 149 | "We will create some small helper functions to make it easier to generate context in this format, although we're using `<src>` instead of `<source>` to avoid conflict with that HTML tag. Although it's based on Anthropic's recommendation, it's likely to work well with other models too."
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "id": "a01dc320",
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "#| exports\n",
160 | "doctype = namedtuple('doctype', ['src', 'content'])"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "id": "6620a123",
166 | "metadata": {},
167 | "source": [
168 | "We'll use `doctype` to store our pairs."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "ce853491",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "#| exports\n",
179 | "def _add_nls(s):\n",
180 | " \"Add newlines to start and end of `s` if missing\"\n",
181 | " if not s: return s\n",
182 | " if s[ 0]!='\\n': s = '\\n'+s\n",
183 | " if s[-1]!='\\n': s = s+'\\n'\n",
184 | " return s"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "id": "026d3b06",
190 | "metadata": {},
191 | "source": [
192 | "Since Anthropic's example shows newlines before and after each tag, we'll do the same."
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "id": "26fddbc3",
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "data": {
203 | "text/plain": [
204 | "'a'"
205 | ]
206 | },
207 | "execution_count": null,
208 | "metadata": {},
209 | "output_type": "execute_result"
210 | }
211 | ],
212 | "source": [
213 | "to_xml(Src('a'))"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "id": "1bac81ce",
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/plain": [
225 | "'a'"
226 | ]
227 | },
228 | "execution_count": null,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "to_xml(Document('a'))"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "id": "40a7e0ba",
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/plain": [
246 | "'a'"
247 | ]
248 | },
249 | "execution_count": null,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "to_xml(Documents('a'))"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "id": "932e8858",
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "#| exports\n",
266 | "def mk_doctype(content:str, # The document content\n",
267 | " src:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided\n",
268 | " ) -> namedtuple:\n",
269 | " \"Create a `doctype` named tuple\"\n",
270 | " if src is None: src = hashlib.md5(content.encode()).hexdigest()[:8]\n",
271 | " return doctype(_add_nls(str(src).strip()), _add_nls(content.strip()))"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "id": "8800921b",
277 | "metadata": {},
278 | "source": [
279 | "This is a convenience wrapper to ensure that a `doctype` has the needed information in the right format."
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": null,
285 | "id": "14f9e185",
286 | "metadata": {},
287 | "outputs": [
288 | {
289 | "data": {
290 | "text/plain": [
291 | "doctype(src='\\n47e19350\\n', content='\\nThis is a \"sample\"\\n')"
292 | ]
293 | },
294 | "execution_count": null,
295 | "metadata": {},
296 | "output_type": "execute_result"
297 | }
298 | ],
299 | "source": [
300 | "doc = 'This is a \"sample\"'\n",
301 | "mk_doctype(doc)"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": null,
307 | "id": "15e454db",
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "#| exports\n",
312 | "def mk_doc(index:int, # The document index\n",
313 | " content:str, # The document content\n",
314 | " src:Optional[str]=None, # URL, filename, etc; defaults to `md5(content)` if not provided\n",
315 | " **kwargs\n",
316 | " ) -> tuple:\n",
317 | " \"Create an `ft` format tuple for a single doc in Anthropic's recommended format\"\n",
318 | " dt = mk_doctype(content, src)\n",
319 | " content = Document_content(NotStr(dt.content))\n",
320 | " src = Src(NotStr(dt.src))\n",
321 | " return Document(src, content, index=index, **kwargs)"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "id": "a8b6ac26",
327 | "metadata": {},
328 | "source": [
329 | "We can now generate XML for one document in the suggested format:"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "id": "e7ed5a9a",
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "data": {
340 | "text/markdown": [
341 | "```html\n",
342 | "\n",
343 | "47e19350\n",
344 | "\n",
345 | "This is a \"sample\"\n",
346 | "\n",
347 | "```"
348 | ],
349 | "text/plain": [
350 | "document((src(('\\n47e19350\\n',),{}), document-content(('\\nThis is a \"sample\"\\n',),{})),{'index': 1, 'title': 'test'})"
351 | ]
352 | },
353 | "execution_count": null,
354 | "metadata": {},
355 | "output_type": "execute_result"
356 | }
357 | ],
358 | "source": [
359 | "mk_doc(1, doc, title=\"test\")"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "id": "ba5ebfab",
366 | "metadata": {},
367 | "outputs": [],
368 | "source": [
369 | "#| exports\n",
370 | "def docs_xml(docs:list[str], # The content of each document\n",
371 | " srcs:Optional[list]=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided\n",
372 | " prefix:bool=True, # Include Anthropic's suggested prose intro?\n",
373 | " details:Optional[list]=None # Optional list of dicts with additional attrs for each doc\n",
374 | " )->str:\n",
375 | " \"Create an XML string containing `docs` in Anthropic's recommended format\"\n",
376 | " pre = 'Here are some documents for you to reference for your task:\\n\\n' if prefix else ''\n",
377 | " if srcs is None: srcs = [None]*len(docs)\n",
378 | " if details is None: details = [{}]*len(docs)\n",
379 | " docs = (mk_doc(i+1, d, s, **kw) for i,(d,s,kw) in enumerate(zip(docs,srcs,details)))\n",
380 | " return pre + to_xml(Documents(docs))"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "id": "85004124",
386 | "metadata": {},
387 | "source": [
388 | "Putting it all together, we have our final XML format:"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": null,
394 | "id": "1dac60f6",
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stdout",
399 | "output_type": "stream",
400 | "text": [
401 | "Here are some documents for you to reference for your task:\n",
402 | "\n",
403 | "\n",
404 | "47e19350\n",
405 | "\n",
406 | "This is a \"sample\"\n",
407 | "\n",
408 | "doc.txt\n",
409 | "\n",
410 | "And another one\n",
411 | "\n"
412 | ]
413 | }
414 | ],
415 | "source": [
416 | "docs = [doc, 'And another one']\n",
417 | "srcs = [None, 'doc.txt']\n",
418 | "print(docs_xml(docs, srcs))"
419 | ]
420 | },
421 | {
422 | "cell_type": "markdown",
423 | "id": "2a8a7a9a",
424 | "metadata": {},
425 | "source": [
426 | "## Context creation"
427 | ]
428 | },
429 | {
430 | "cell_type": "markdown",
431 | "id": "cd06b2dc",
432 | "metadata": {},
433 | "source": [
434 | "Now that we can generate Anthropic's XML format, let's make it easy for a few common cases."
435 | ]
436 | },
437 | {
438 | "cell_type": "markdown",
439 | "id": "65317fc6",
440 | "metadata": {},
441 | "source": [
442 | "### File list to context"
443 | ]
444 | },
445 | {
446 | "cell_type": "markdown",
447 | "id": "3778e8ed",
448 | "metadata": {},
449 | "source": [
450 | "For generating XML context from files, we'll just read them as text and use the file names as `src`."
451 | ]
452 | },
453 | {
454 | "cell_type": "code",
455 | "execution_count": null,
456 | "id": "0a168636",
457 | "metadata": {},
458 | "outputs": [],
459 | "source": [
460 | "#| exports\n",
461 | "def files2ctx(\n",
462 | " fnames:list[Union[str,Path]], # List of file names to add to context\n",
463 | " prefix:bool=True # Include Anthropic's suggested prose intro?\n",
464 | ")->str: # XML for LM context\n",
465 | " fnames = [Path(o) for o in fnames]\n",
466 | " contents = [o.read_text() for o in fnames]\n",
467 | " return docs_xml(contents, fnames, prefix=prefix)"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "id": "1bf73d36",
474 | "metadata": {},
475 | "outputs": [
476 | {
477 | "data": {
478 | "text/markdown": [
479 | "```xml\n",
480 | "Here are some documents for you to reference for your task:\n",
481 | "\n",
482 | "\n",
483 | "samples/sample_core.py\n",
484 | "\n",
485 | "import inspect\n",
486 | "empty = inspect.Parameter.empty\n",
487 | "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n",
488 | "\n",
489 | "samples/sample_styles.css\n",
490 | "\n",
491 | ".cell { margin-bottom: 1rem; }\n",
492 | ".cell > .sourceCode { margin-bottom: 0; }\n",
493 | ".cell-output > pre { margin-bottom: 0; }\n",
494 | "\n",
495 | "```"
496 | ],
497 | "text/plain": [
498 | ""
499 | ]
500 | },
501 | "execution_count": null,
502 | "metadata": {},
503 | "output_type": "execute_result"
504 | }
505 | ],
506 | "source": [
507 | "fnames = ['samples/sample_core.py', 'samples/sample_styles.css']\n",
508 | "hl_md(files2ctx(fnames))"
509 | ]
510 | },
511 | {
512 | "cell_type": "markdown",
513 | "id": "191ddb2b",
514 | "metadata": {},
515 | "source": [
516 | "### Folder to context"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": null,
522 | "id": "a0452a21",
523 | "metadata": {},
524 | "outputs": [],
525 | "source": [
526 | "#| exports\n",
527 | "@delegates(globtastic)\n",
528 | "def folder2ctx(\n",
529 | " folder:Union[str,Path], # Folder name containing files to add to context\n",
530 | " prefix:bool=True, # Include Anthropic's suggested prose intro?\n",
531 | " **kwargs # Passed to `globtastic`\n",
532 | ")->str: # XML for Claude context\n",
533 | " fnames = globtastic(folder, **kwargs)\n",
534 | " return files2ctx(fnames, prefix=prefix)"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": null,
540 | "id": "efd52392",
541 | "metadata": {},
542 | "outputs": [
543 | {
544 | "name": "stdout",
545 | "output_type": "stream",
546 | "text": [
547 | "\n",
548 | "samples/sample_core.py\n",
549 | "\n",
550 | "import inspect\n",
551 | "empty = inspect.Parameter.empty\n",
552 | "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n",
553 | "\n"
554 | ]
555 | }
556 | ],
557 | "source": [
558 | "print(folder2ctx('samples', prefix=False, file_glob='*.py'))"
559 | ]
560 | },
561 | {
562 | "cell_type": "code",
563 | "execution_count": null,
564 | "id": "0cd4bbeb-b07f-447d-abe8-2b4190d4aa63",
565 | "metadata": {},
566 | "outputs": [],
567 | "source": [
568 | "#| exports\n",
569 | "#| hide\n",
570 | "@call_parse\n",
571 | "@delegates(folder2ctx)\n",
572 | "def folder2ctx_cli(\n",
573 | " folder:str, # Folder name containing files to add to context\n",
574 | " **kwargs # Passed to `folder2ctx`\n",
575 | ")->str: # XML for Claude context\n",
576 | " print(folder2ctx(folder, **kwargs))"
577 | ]
578 | },
579 | {
580 | "cell_type": "markdown",
581 | "id": "95bc490c-bf9d-4146-a729-97f7221559af",
582 | "metadata": {},
583 | "source": [
584 | ":::{.callout-tip}\n",
585 | "\n",
586 | "After you install `toolslm`, `folder2ctx` becomes available from the command line. You can see how to use it with the following command:\n",
587 | "\n",
588 | "```bash\n",
589 | "folder2ctx -h\n",
590 | "```\n",
591 | ":::"
592 | ]
593 | },
594 | {
595 | "cell_type": "markdown",
596 | "id": "94ec4289",
597 | "metadata": {},
598 | "source": [
599 | "## Export -"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": null,
605 | "id": "1e9ee5c1",
606 | "metadata": {},
607 | "outputs": [],
608 | "source": [
609 | "#|hide\n",
610 | "#|eval: false\n",
611 | "from nbdev.doclinks import nbdev_export\n",
612 | "nbdev_export()"
613 | ]
614 | },
615 | {
616 | "cell_type": "code",
617 | "execution_count": null,
618 | "id": "5d06a6ce",
619 | "metadata": {},
620 | "outputs": [],
621 | "source": []
622 | }
623 | ],
624 | "metadata": {
625 | "kernelspec": {
626 | "display_name": "python3",
627 | "language": "python",
628 | "name": "python3"
629 | }
630 | },
631 | "nbformat": 4,
632 | "nbformat_minor": 5
633 | }
634 |
--------------------------------------------------------------------------------
/01_funccall.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "efe78920",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "#|default_exp funccall"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "3d773712-12fe-440e-891f-36f59666dfde",
16 | "metadata": {},
17 | "source": [
18 | "# funccall source"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "e5ad6b86",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "#| exports\n",
29 | "import inspect\n",
30 | "from collections import abc\n",
31 | "from fastcore.utils import *\n",
32 | "from fastcore.docments import docments\n",
33 | "from typing import get_origin, get_args, Dict, List, Optional, Tuple, Union\n",
34 | "from types import UnionType"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "id": "aec123ab",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "#|hide\n",
45 | "from fastcore.test import *"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "id": "a9f43047",
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "#| export\n",
56 | "empty = inspect.Parameter.empty"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "id": "1a7cdbc6",
62 | "metadata": {},
63 | "source": [
64 | "## Function calling"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "id": "7ec35c95",
70 | "metadata": {},
71 | "source": [
72 | "Many LLMs do function calling (aka tool use) by taking advantage of JSON schema.\n",
73 | "\n",
74 | "We'll use [docments](https://fastcore.fast.ai/docments.html) to make getting JSON schema from Python functions as ergonomic as possible. Each parameter (and the return value) should have a type, and a docments comment with the description of what it is. Here's an example:"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "id": "4a017af1",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "def silly_sum(\n",
85 | " a:int, # First thing to sum\n",
86 | " b:int=1, # Second thing to sum\n",
87 | " c:list[int]=None, # A pointless argument\n",
88 | ") -> int: # The sum of the inputs\n",
89 | " \"Adds a + b.\"\n",
90 | " return a + b"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "id": "1a3ff443",
96 | "metadata": {},
97 | "source": [
98 | "This is what `docments` makes of that:"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": null,
104 | "id": "b3f2ebcf",
105 | "metadata": {},
106 | "outputs": [
107 | {
108 | "data": {
109 | "text/markdown": [
110 | "```json\n",
111 | "{ 'a': { 'anno': ,\n",
112 | " 'default': ,\n",
113 | " 'docment': 'First thing to sum'},\n",
114 | " 'b': {'anno': , 'default': 1, 'docment': 'Second thing to sum'},\n",
115 | " 'c': {'anno': list[int], 'default': None, 'docment': 'A pointless argument'},\n",
116 | " 'return': { 'anno': ,\n",
117 | " 'default': ,\n",
118 | " 'docment': 'The sum of the inputs'}}\n",
119 | "```"
120 | ],
121 | "text/plain": [
122 | "{'a': {'docment': 'First thing to sum',\n",
123 | " 'anno': int,\n",
124 | " 'default': inspect._empty},\n",
125 | " 'b': {'docment': 'Second thing to sum', 'anno': int, 'default': 1},\n",
126 | " 'c': {'docment': 'A pointless argument', 'anno': list[int], 'default': None},\n",
127 | " 'return': {'docment': 'The sum of the inputs',\n",
128 | " 'anno': int,\n",
129 | " 'default': inspect._empty}}"
130 | ]
131 | },
132 | "execution_count": null,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "d = docments(silly_sum, full=True)\n",
139 | "d"
140 | ]
141 | },
142 | {
143 | "cell_type": "markdown",
144 | "id": "745e44ea",
145 | "metadata": {},
146 | "source": [
147 | "Note that this is an [AttrDict](https://fastcore.fast.ai/basics.html#attrdict) so we can treat it like an object, *or* a dict:"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "id": "35cb279d",
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "('First thing to sum', int)"
160 | ]
161 | },
162 | "execution_count": null,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": [
168 | "d.a.docment, d['a']['anno']"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "id": "e7bf4025",
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "#| exports\n",
179 | "def _types(t:type)->tuple[str,Optional[str]]:\n",
180 | " \"Tuple of json schema type name and (if appropriate) array item name.\"\n",
181 | " if t is empty: raise TypeError('Missing type')\n",
182 | " tmap = {int:\"integer\", float:\"number\", str:\"string\", bool:\"boolean\", list:\"array\", dict:\"object\"}\n",
183 | " tmap.update({k.__name__: v for k, v in tmap.items()})\n",
184 | " if getattr(t, '__origin__', None) in (list,tuple):\n",
185 | " args = getattr(t, '__args__', None)\n",
186 | " item_type = \"object\" if not args else tmap.get(t.__args__[0].__name__, \"object\")\n",
187 | " return \"array\", item_type\n",
188 | " # if t is a string like 'int', directly use the string as the key\n",
189 | " elif isinstance(t, str): return tmap.get(t, \"object\"), None\n",
190 | " # if t is the type itself and a container\n",
191 | " elif get_origin(t): return tmap.get(get_origin(t).__name__, \"object\"), None\n",
192 | " # if t is the type itself like int, use the __name__ representation as the key\n",
193 | " else: return tmap.get(t.__name__, \"object\"), None"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "id": "edf73046",
199 | "metadata": {},
200 | "source": [
201 | "This internal function is needed to convert Python types into JSON schema types."
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "id": "ecb7bc52",
208 | "metadata": {},
209 | "outputs": [
210 | {
211 | "data": {
212 | "text/plain": [
213 | "(('array', 'integer'), ('integer', None), ('integer', None))"
214 | ]
215 | },
216 | "execution_count": null,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "_types(list[int]), _types(int), _types('int')"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": null,
228 | "id": "38b4650a",
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "(('array', 'integer'), ('object', None), ('object', None), ('array', 'string'))"
235 | ]
236 | },
237 | "execution_count": null,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "_types(List[int]), _types(Optional[str]), _types(str | None), _types(Tuple[str, int])"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "id": "f4d0ac1e",
249 | "metadata": {},
250 | "source": [
251 | "Note the current behavior:\n",
252 | "\n",
253 | "- ignores all but the first argument for tuples\n",
254 | "- union types map to object which is a stand-in for arbitrary types\n",
255 | "\n",
256 | "These and other approximations may require further refinement in the future."
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "id": "c0e3c940",
262 | "metadata": {},
263 | "source": [
 264 | "It will also convert custom types to the `object` type."
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": null,
270 | "id": "9969fd00",
271 | "metadata": {},
272 | "outputs": [
273 | {
274 | "data": {
275 | "text/plain": [
276 | "(('array', 'object'), ('object', None))"
277 | ]
278 | },
279 | "execution_count": null,
280 | "metadata": {},
281 | "output_type": "execute_result"
282 | }
283 | ],
284 | "source": [
285 | "class Custom: a: int\n",
286 | "_types(list[Custom]), _types(Custom)"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "id": "4d5dc245",
293 | "metadata": {},
294 | "outputs": [],
295 | "source": [
296 | "#| exports\n",
297 | "def _param(name, info):\n",
298 | " \"json schema parameter given `name` and `info` from docments full dict.\"\n",
299 | " paramt,itemt = _types(info.anno)\n",
300 | " pschema = dict(type=paramt, description=info.docment or \"\")\n",
301 | " if itemt: pschema[\"items\"] = {\"type\": itemt}\n",
302 | " if info.default is not empty: pschema[\"default\"] = info.default\n",
303 | " return pschema"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "id": "5337d6bd",
309 | "metadata": {},
310 | "source": [
311 | "This private function converts a key/value pair from the `docments` structure into the `dict` that will be needed for the schema."
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "id": "2450ace6",
318 | "metadata": {},
319 | "outputs": [
320 | {
321 | "name": "stdout",
322 | "output_type": "stream",
323 | "text": [
324 | "a // {'docment': 'First thing to sum', 'anno': , 'default': }\n"
325 | ]
326 | },
327 | {
328 | "data": {
329 | "text/plain": [
330 | "{'type': 'integer', 'description': 'First thing to sum'}"
331 | ]
332 | },
333 | "execution_count": null,
334 | "metadata": {},
335 | "output_type": "execute_result"
336 | }
337 | ],
338 | "source": [
339 | "n,o = first(d.items())\n",
340 | "print(n,'//', o)\n",
341 | "_param(n, o)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": null,
347 | "id": "ba6bcac4",
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "#| export\n",
352 | "custom_types = {Path}\n",
353 | "\n",
354 | "def _handle_type(t, defs):\n",
355 | " \"Handle a single type, creating nested schemas if necessary\"\n",
356 | " if t is NoneType: return {'type': 'null'}\n",
357 | " if t in custom_types: return {'type':'string', 'format':t.__name__}\n",
358 | " if isinstance(t, type) and not issubclass(t, (int, float, str, bool)) or inspect.isfunction(t):\n",
359 | " defs[t.__name__] = _get_nested_schema(t)\n",
360 | " return {'$ref': f'#/$defs/{t.__name__}'}\n",
361 | " return {'type': _types(t)[0]}"
362 | ]
363 | },
364 | {
365 | "cell_type": "code",
366 | "execution_count": null,
367 | "id": "16dbf080",
368 | "metadata": {},
369 | "outputs": [
370 | {
371 | "data": {
372 | "text/plain": [
373 | "({'type': 'integer'}, {'type': 'string', 'format': 'Path'})"
374 | ]
375 | },
376 | "execution_count": null,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "_handle_type(int, None), _handle_type(Path, None)"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "id": "7fd6cd29",
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "#| export\n",
393 | "def _is_container(t):\n",
394 | " \"Check if type is a container (list, dict, tuple, set, Union)\"\n",
395 | " origin = get_origin(t)\n",
396 | " return origin in (list, dict, tuple, set, Union) if origin else False\n",
397 | "\n",
398 | "def _is_parameterized(t):\n",
399 | " \"Check if type has arguments (e.g. list[int] vs list, dict[str, int] vs dict)\"\n",
400 | " return _is_container(t) and (get_args(t) != ())"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "id": "783747af",
407 | "metadata": {},
408 | "outputs": [],
409 | "source": [
410 | "assert _is_parameterized(list[int]) == True\n",
411 | "assert _is_parameterized(int) == False\n",
412 | "assert _is_container(list[int]) == True\n",
413 | "assert _is_container(dict[str, int]) == True\n",
414 | "assert _is_container(int) == False"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "id": "d42c88dd",
420 | "metadata": {},
421 | "source": [
422 | "For union and optional types, `Union` covers older `Union[str]` syntax while `UnionType` covers 3.10+ `str | None` syntax."
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "id": "7815799b",
429 | "metadata": {},
430 | "outputs": [
431 | {
432 | "data": {
433 | "text/plain": [
434 | "(str | None, types.UnionType, (str, NoneType))"
435 | ]
436 | },
437 | "execution_count": null,
438 | "metadata": {},
439 | "output_type": "execute_result"
440 | }
441 | ],
442 | "source": [
443 | "def _example_new_unioin(opt_tup: str | None):\n",
444 | " pass\n",
445 | "\n",
446 | "d = docments(_example_new_unioin, full=True)\n",
447 | "anno1 = first(d.items())[1].anno\n",
448 | "(anno1, get_origin(anno1), get_args(anno1))"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": null,
454 | "id": "d745c902",
455 | "metadata": {},
456 | "outputs": [
457 | {
458 | "data": {
459 | "text/plain": [
460 | "(typing.Optional[str], typing.Union, (str, NoneType))"
461 | ]
462 | },
463 | "execution_count": null,
464 | "metadata": {},
465 | "output_type": "execute_result"
466 | }
467 | ],
468 | "source": [
469 | "def _example_old_union(opt_tup: Union[str, type(None)] =None):\n",
470 | " pass\n",
471 | "\n",
472 | "d = docments(_example_old_union, full=True)\n",
473 | "anno2 = first(d.items())[1].anno\n",
474 | "(anno2, get_origin(anno2), get_args(anno2))"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "id": "3c5701c7",
480 | "metadata": {},
481 | "source": [
482 | "Support for both union types is part of the broader container handling:"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": null,
488 | "id": "c1153f02",
489 | "metadata": {},
490 | "outputs": [],
491 | "source": [
492 | "#| export\n",
493 | "def _handle_container(origin, args, defs):\n",
494 | " \"Handle container types like dict, list, tuple, set, and Union\"\n",
495 | " if origin is Union or origin is UnionType:\n",
496 | " return {\"anyOf\": [_handle_type(arg, defs) for arg in args]}\n",
497 | " if origin is dict:\n",
498 | " value_type = args[1].__args__[0] if hasattr(args[1], '__args__') else args[1]\n",
499 | " return {\n",
500 | " 'type': 'object',\n",
501 | " 'additionalProperties': (\n",
502 | " {'type': 'array', 'items': _handle_type(value_type, defs)}\n",
503 | " if hasattr(args[1], '__origin__') else _handle_type(args[1], defs)\n",
504 | " )\n",
505 | " }\n",
506 | " elif origin in (list, tuple, set):\n",
507 | " schema = {'type': 'array', 'items': _handle_type(args[0], defs)}\n",
508 | " if origin is set:\n",
509 | " schema['uniqueItems'] = True\n",
510 | " return schema\n",
511 | " return None"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": null,
517 | "id": "5ee1c529",
518 | "metadata": {},
519 | "outputs": [],
520 | "source": [
521 | "#| export\n",
522 | "def _process_property(name, obj, props, req, defs):\n",
523 | " \"Process a single property of the schema\"\n",
524 | " p = _param(name, obj)\n",
525 | " props[name] = p\n",
526 | " if obj.default is empty: req[name] = True\n",
527 | "\n",
528 | " if _is_container(obj.anno) and _is_parameterized(obj.anno):\n",
529 | " p.update(_handle_container(get_origin(obj.anno), get_args(obj.anno), defs)) \n",
530 | " else:\n",
531 | " # Non-container type or container without arguments\n",
532 | " p.update(_handle_type(obj.anno, defs))"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": null,
538 | "id": "38b0f97e",
539 | "metadata": {},
540 | "outputs": [],
541 | "source": [
542 | "#| export\n",
543 | "def _get_nested_schema(obj):\n",
544 | " \"Generate nested JSON schema for a class or function\"\n",
545 | " d = docments(obj, full=True)\n",
546 | " props, req, defs = {}, {}, {}\n",
547 | "\n",
548 | " for n, o in d.items():\n",
549 | " if n != 'return' and n != 'self':\n",
550 | " _process_property(n, o, props, req, defs)\n",
551 | "\n",
552 | " schema = dict(type='object', properties=props, title=obj.__name__ if isinstance(obj, type) else None)\n",
553 | " if req: schema['required'] = list(req)\n",
554 | " if defs: schema['$defs'] = defs\n",
555 | " return schema"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": null,
561 | "id": "1bb9df6c",
562 | "metadata": {},
563 | "outputs": [],
564 | "source": [
565 | "# Test primitive types\n",
566 | "defs = {}\n",
567 | "assert _handle_type(int, defs) == {'type': 'integer'}\n",
568 | "assert _handle_type(str, defs) == {'type': 'string'}\n",
569 | "assert _handle_type(bool, defs) == {'type': 'boolean'}\n",
570 | "assert _handle_type(float, defs) == {'type': 'number'}\n",
571 | "\n",
572 | "# Test custom class\n",
573 | "class TestClass:\n",
574 | " def __init__(self, x: int, y: int): store_attr()\n",
575 | "\n",
576 | "result = _handle_type(TestClass, defs)\n",
577 | "assert result == {'$ref': '#/$defs/TestClass'}\n",
578 | "assert 'TestClass' in defs\n",
579 | "assert defs['TestClass']['type'] == 'object'\n",
580 | "assert 'properties' in defs['TestClass']"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": null,
586 | "id": "b1d09435",
587 | "metadata": {},
588 | "outputs": [],
589 | "source": [
590 | "# Test primitive types in containers\n",
591 | "assert _handle_container(list, (int,), defs) == {'type': 'array', 'items': {'type': 'integer'}}\n",
592 | "assert _handle_container(tuple, (str,), defs) == {'type': 'array', 'items': {'type': 'string'}}\n",
593 | "assert _handle_container(set, (str,), defs) == {'type': 'array', 'items': {'type': 'string'}, 'uniqueItems': True}\n",
594 | "assert _handle_container(dict, (str,bool), defs) == {'type': 'object', 'additionalProperties': {'type': 'boolean'}}\n",
595 | "\n",
596 | "result = _handle_container(list, (TestClass,), defs)\n",
597 | "assert result == {'type': 'array', 'items': {'$ref': '#/$defs/TestClass'}}\n",
598 | "assert 'TestClass' in defs\n",
599 | "\n",
600 | "# Test complex nested structure\n",
601 | "ComplexType = dict[str, list[TestClass]]\n",
602 | "result = _handle_container(dict, (str, list[TestClass]), defs)\n",
603 | "assert result == {\n",
604 | " 'type': 'object',\n",
605 | " 'additionalProperties': {\n",
606 | " 'type': 'array',\n",
607 | " 'items': {'$ref': '#/$defs/TestClass'}\n",
608 | " }\n",
609 | "}"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": null,
615 | "id": "a5fd37d5",
616 | "metadata": {},
617 | "outputs": [],
618 | "source": [
619 | "# Test processing of a required integer property\n",
620 | "props, req = {}, {}\n",
621 | "class TestClass:\n",
622 | " \"Test class\"\n",
623 | " def __init__(\n",
624 | " self,\n",
625 | " x: int, # First thing\n",
626 | " y: list[float], # Second thing\n",
627 | " z: str = \"default\", # Third thing\n",
628 | " ): store_attr()\n",
629 | "\n",
630 | "d = docments(TestClass, full=True)\n",
631 | "_process_property('x', d.x, props, req, defs)\n",
632 | "assert 'x' in props\n",
633 | "assert props['x']['type'] == 'integer'\n",
634 | "assert 'x' in req\n",
635 | "\n",
636 | "# Test processing of a required list property\n",
637 | "_process_property('y', d.y, props, req, defs)\n",
638 | "assert 'y' in props\n",
639 | "assert props['y']['type'] == 'array'\n",
640 | "assert props['y']['items']['type'] == 'number'\n",
641 | "assert 'y' in req\n",
642 | "\n",
643 | "# Test processing of an optional string property with default\n",
644 | "_process_property('z', d.z, props, req, defs)\n",
645 | "assert 'z' in props\n",
646 | "assert props['z']['type'] == 'string'\n",
647 | "assert props['z']['default'] == \"default\"\n",
648 | "assert 'z' not in req"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": null,
654 | "id": "23f54386",
655 | "metadata": {},
656 | "outputs": [],
657 | "source": [
658 | "#| exports\n",
659 | "def get_schema(f:Union[callable,dict], pname='input_schema')->dict:\n",
660 | " \"Generate JSON schema for a class, function, or method\"\n",
661 | " if isinstance(f, dict): return f\n",
662 | " schema = _get_nested_schema(f)\n",
663 | " desc = f.__doc__\n",
664 | " assert desc, \"Docstring missing!\"\n",
665 | " d = docments(f, full=True)\n",
666 | " ret = d.pop('return')\n",
667 | " if ret.anno is not empty: desc += f'\\n\\nReturns:\\n- type: {_types(ret.anno)[0]}'\n",
668 | " return {\"name\": f.__name__, \"description\": desc, pname: schema}"
669 | ]
670 | },
671 | {
672 | "cell_type": "markdown",
673 | "id": "a59df671",
674 | "metadata": {},
675 | "source": [
676 | "Putting this all together, we can now test getting a schema from `silly_sum`. The tool use spec doesn't support return annotations directly, so we put that in the description instead."
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": null,
682 | "id": "e7311af9",
683 | "metadata": {},
684 | "outputs": [
685 | {
686 | "name": "stdout",
687 | "output_type": "stream",
688 | "text": [
689 | "Adds a + b.\n",
690 | "\n",
691 | "Returns:\n",
692 | "- type: integer\n"
693 | ]
694 | },
695 | {
696 | "data": {
697 | "text/plain": [
698 | "{'name': 'silly_sum',\n",
699 | " 'input_schema': {'type': 'object',\n",
700 | " 'properties': {'a': {'type': 'integer', 'description': 'First thing to sum'},\n",
701 | " 'b': {'type': 'integer',\n",
702 | " 'description': 'Second thing to sum',\n",
703 | " 'default': 1},\n",
704 | " 'c': {'type': 'array',\n",
705 | " 'description': 'A pointless argument',\n",
706 | " 'items': {'type': 'integer'},\n",
707 | " 'default': None}},\n",
708 | " 'title': None,\n",
709 | " 'required': ['a']}}"
710 | ]
711 | },
712 | "execution_count": null,
713 | "metadata": {},
714 | "output_type": "execute_result"
715 | }
716 | ],
717 | "source": [
718 | "s = get_schema(silly_sum)\n",
719 | "desc = s.pop('description')\n",
720 | "print(desc)\n",
721 | "s"
722 | ]
723 | },
724 | {
725 | "cell_type": "markdown",
726 | "id": "d478ba6b",
727 | "metadata": {},
728 | "source": [
729 | "This also works with string annotations, e.g:"
730 | ]
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": null,
735 | "id": "80203962",
736 | "metadata": {},
737 | "outputs": [
738 | {
739 | "data": {
740 | "text/plain": [
741 | "{'name': 'silly_test',\n",
742 | " 'description': 'Mandatory docstring',\n",
743 | " 'input_schema': {'type': 'object',\n",
744 | " 'properties': {'a': {'type': 'integer', 'description': 'quoted type hint'}},\n",
745 | " 'title': None,\n",
746 | " 'required': ['a']}}"
747 | ]
748 | },
749 | "execution_count": null,
750 | "metadata": {},
751 | "output_type": "execute_result"
752 | }
753 | ],
754 | "source": [
755 | "def silly_test(\n",
756 | " a: 'int', # quoted type hint\n",
757 | "):\n",
758 | " \"Mandatory docstring\"\n",
759 | " return a\n",
760 | "\n",
761 | "get_schema(silly_test)"
762 | ]
763 | },
764 | {
765 | "cell_type": "markdown",
766 | "id": "e3f36f8a",
767 | "metadata": {},
768 | "source": [
769 | "This also works with instance methods:"
770 | ]
771 | },
772 | {
773 | "cell_type": "code",
774 | "execution_count": null,
775 | "id": "05d33447",
776 | "metadata": {},
777 | "outputs": [
778 | {
779 | "data": {
780 | "text/plain": [
781 | "{'name': 'sums',\n",
782 | " 'description': 'Adds a + b.\\n\\nReturns:\\n- type: integer',\n",
783 | " 'input_schema': {'type': 'object',\n",
784 | " 'properties': {'a': {'type': 'integer', 'description': 'First thing to sum'},\n",
785 | " 'b': {'type': 'integer',\n",
786 | " 'description': 'Second thing to sum',\n",
787 | " 'default': 1}},\n",
788 | " 'title': None,\n",
789 | " 'required': ['a']}}"
790 | ]
791 | },
792 | "execution_count": null,
793 | "metadata": {},
794 | "output_type": "execute_result"
795 | }
796 | ],
797 | "source": [
798 | "class Dummy:\n",
799 | " def sums(\n",
800 | " self,\n",
801 | " a:int, # First thing to sum\n",
802 | " b:int=1 # Second thing to sum\n",
803 | " ) -> int: # The sum of the inputs\n",
804 | " \"Adds a + b.\"\n",
805 | " print(f\"Finding the sum of {a} and {b}\")\n",
806 | " return a + b\n",
807 | "\n",
808 | "get_schema(Dummy.sums)"
809 | ]
810 | },
811 | {
812 | "cell_type": "markdown",
813 | "id": "ae3fdfa4",
814 | "metadata": {},
815 | "source": [
816 | "`get_schema` also handles more complicated structures such as nested classes. This is useful for things like structured outputs."
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": null,
822 | "id": "ce3be915",
823 | "metadata": {},
824 | "outputs": [
825 | {
826 | "data": {
827 | "text/plain": [
828 | "{'name': 'Conversation',\n",
829 | " 'description': 'A conversation between two speakers',\n",
830 | " 'input_schema': {'type': 'object',\n",
831 | " 'properties': {'turns': {'type': 'array',\n",
832 | " 'description': 'Turns of the conversation',\n",
833 | " 'items': {'$ref': '#/$defs/Turn'}}},\n",
834 | " 'title': 'Conversation',\n",
835 | " 'required': ['turns'],\n",
836 | " '$defs': {'Turn': {'type': 'object',\n",
837 | " 'properties': {'speaker_a': {'type': 'string',\n",
838 | " 'description': \"First speaker's message\"},\n",
839 | " 'speaker_b': {'type': 'string',\n",
840 | " 'description': \"Second speaker's message\"}},\n",
841 | " 'title': 'Turn',\n",
842 | " 'required': ['speaker_a', 'speaker_b']}}}}"
843 | ]
844 | },
845 | "execution_count": null,
846 | "metadata": {},
847 | "output_type": "execute_result"
848 | }
849 | ],
850 | "source": [
851 | "class Turn:\n",
852 | " \"Turn between two speakers\"\n",
853 | " def __init__(\n",
854 | " self,\n",
855 | " speaker_a:str, # First speaker's message\n",
856 | " speaker_b:str, # Second speaker's message\n",
857 | " ): store_attr()\n",
858 | "\n",
859 | "class Conversation:\n",
860 | " \"A conversation between two speakers\"\n",
861 | " def __init__(\n",
862 | " self,\n",
863 | " turns:list[Turn], # Turns of the conversation\n",
864 | " ): store_attr()\n",
865 | "\n",
866 | "get_schema(Conversation)"
867 | ]
868 | },
869 | {
870 | "cell_type": "code",
871 | "execution_count": null,
872 | "id": "386e514d",
873 | "metadata": {},
874 | "outputs": [
875 | {
876 | "data": {
877 | "text/plain": [
878 | "{'name': 'DictConversation',\n",
879 | " 'description': 'A conversation between two speakers',\n",
880 | " 'input_schema': {'type': 'object',\n",
881 | " 'properties': {'turns': {'type': 'object',\n",
882 | " 'description': 'dictionary of topics and the Turns of the conversation',\n",
883 | " 'additionalProperties': {'type': 'array',\n",
884 | " 'items': {'$ref': '#/$defs/Turn'}}}},\n",
885 | " 'title': 'DictConversation',\n",
886 | " 'required': ['turns'],\n",
887 | " '$defs': {'Turn': {'type': 'object',\n",
888 | " 'properties': {'speaker_a': {'type': 'string',\n",
889 | " 'description': \"First speaker's message\"},\n",
890 | " 'speaker_b': {'type': 'string',\n",
891 | " 'description': \"Second speaker's message\"}},\n",
892 | " 'title': 'Turn',\n",
893 | " 'required': ['speaker_a', 'speaker_b']}}}}"
894 | ]
895 | },
896 | "execution_count": null,
897 | "metadata": {},
898 | "output_type": "execute_result"
899 | }
900 | ],
901 | "source": [
902 | "class DictConversation:\n",
903 | " \"A conversation between two speakers\"\n",
904 | " def __init__(\n",
905 | " self,\n",
906 | " turns:dict[str,list[Turn]], # dictionary of topics and the Turns of the conversation\n",
907 | " ): store_attr()\n",
908 | "\n",
909 | "get_schema(DictConversation)"
910 | ]
911 | },
912 | {
913 | "cell_type": "code",
914 | "execution_count": null,
915 | "id": "2c08ac6b",
916 | "metadata": {},
917 | "outputs": [
918 | {
919 | "data": {
920 | "text/plain": [
921 | "{'name': 'SetConversation',\n",
922 | " 'description': 'A conversation between two speakers',\n",
923 | " 'input_schema': {'type': 'object',\n",
924 | " 'properties': {'turns': {'type': 'array',\n",
925 | " 'description': 'the unique Turns of the conversation',\n",
926 | " 'items': {'$ref': '#/$defs/Turn'},\n",
927 | " 'uniqueItems': True}},\n",
928 | " 'title': 'SetConversation',\n",
929 | " 'required': ['turns'],\n",
930 | " '$defs': {'Turn': {'type': 'object',\n",
931 | " 'properties': {'speaker_a': {'type': 'string',\n",
932 | " 'description': \"First speaker's message\"},\n",
933 | " 'speaker_b': {'type': 'string',\n",
934 | " 'description': \"Second speaker's message\"}},\n",
935 | " 'title': 'Turn',\n",
936 | " 'required': ['speaker_a', 'speaker_b']}}}}"
937 | ]
938 | },
939 | "execution_count": null,
940 | "metadata": {},
941 | "output_type": "execute_result"
942 | }
943 | ],
944 | "source": [
945 | "class SetConversation:\n",
946 | " \"A conversation between two speakers\"\n",
947 | " def __init__(\n",
948 | " self,\n",
949 | " turns:set[Turn], # the unique Turns of the conversation\n",
950 | " ): store_attr()\n",
951 | "\n",
952 | "get_schema(SetConversation)"
953 | ]
954 | },
955 | {
956 | "cell_type": "code",
957 | "execution_count": null,
958 | "id": "8cf3f35c",
959 | "metadata": {},
960 | "outputs": [],
961 | "source": [
962 | "#| exports\n",
963 | "def PathArg(\n",
964 | " path: str # A filesystem path\n",
965 | "): return Path(path)"
966 | ]
967 | },
968 | {
969 | "cell_type": "markdown",
970 | "id": "169212a6",
971 | "metadata": {},
972 | "source": [
973 | "Paths are a special case, since they only take `*args` and `**kwargs` as params, but normally we'd use them in a schema by just passing a str. So we create a custom param type for that."
974 | ]
975 | },
976 | {
977 | "cell_type": "code",
978 | "execution_count": null,
979 | "id": "e9135dfa",
980 | "metadata": {},
981 | "outputs": [
982 | {
983 | "data": {
984 | "text/plain": [
985 | "{'name': 'path_test',\n",
986 | " 'description': 'Mandatory docstring',\n",
987 | " 'input_schema': {'type': 'object',\n",
988 | " 'properties': {'a': {'type': 'object',\n",
989 | " 'description': 'a type hint',\n",
990 | " '$ref': '#/$defs/PathArg'},\n",
991 | " 'b': {'type': 'object',\n",
992 | " 'description': 'b type hint',\n",
993 | " '$ref': '#/$defs/PathArg'}},\n",
994 | " 'title': None,\n",
995 | " 'required': ['a', 'b'],\n",
996 | " '$defs': {'PathArg': {'type': 'object',\n",
997 | " 'properties': {'path': {'type': 'string',\n",
998 | " 'description': 'A filesystem path'}},\n",
999 | " 'title': None,\n",
1000 | " 'required': ['path']}}}}"
1001 | ]
1002 | },
1003 | "execution_count": null,
1004 | "metadata": {},
1005 | "output_type": "execute_result"
1006 | }
1007 | ],
1008 | "source": [
1009 | "def path_test(\n",
1010 | " a: PathArg, # a type hint\n",
1011 | " b: PathArg # b type hint\n",
1012 | "):\n",
1013 | " \"Mandatory docstring\"\n",
1014 | " return a/b\n",
1015 | "\n",
1016 | "get_schema(path_test)"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "markdown",
1021 | "id": "c6d1d0c8",
1022 | "metadata": {},
1023 | "source": [
1024 | "Alternatively, use `Path` as usual, and handle the `format` key in the json to use that as a callable:"
1025 | ]
1026 | },
1027 | {
1028 | "cell_type": "code",
1029 | "execution_count": null,
1030 | "id": "bdb69462",
1031 | "metadata": {},
1032 | "outputs": [
1033 | {
1034 | "data": {
1035 | "text/plain": [
1036 | "{'name': 'path_test2',\n",
1037 | " 'description': 'Mandatory docstring',\n",
1038 | " 'input_schema': {'type': 'object',\n",
1039 | " 'properties': {'a': {'type': 'string',\n",
1040 | " 'description': 'a type hint',\n",
1041 | " 'format': 'Path'},\n",
1042 | " 'b': {'type': 'string', 'description': 'b type hint', 'format': 'Path'}},\n",
1043 | " 'title': None,\n",
1044 | " 'required': ['a', 'b']}}"
1045 | ]
1046 | },
1047 | "execution_count": null,
1048 | "metadata": {},
1049 | "output_type": "execute_result"
1050 | }
1051 | ],
1052 | "source": [
1053 | "def path_test2(\n",
1054 | " a: Path, # a type hint\n",
1055 | " b: Path # b type hint\n",
1056 | "):\n",
1057 | " \"Mandatory docstring\"\n",
1058 | " return a/b\n",
1059 | "\n",
1060 | "get_schema(path_test2)"
1061 | ]
1062 | },
1063 | {
1064 | "cell_type": "markdown",
1065 | "id": "369320d4",
1066 | "metadata": {},
1067 | "source": [
1068 | "### Additional `get_schema()` Test Cases"
1069 | ]
1070 | },
1071 | {
1072 | "cell_type": "markdown",
1073 | "id": "a8052380",
1074 | "metadata": {},
1075 | "source": [
1076 | "Union types are approximately mapped to JSON schema 'anyOf' with two or more value types."
1077 | ]
1078 | },
1079 | {
1080 | "cell_type": "code",
1081 | "execution_count": null,
1082 | "id": "6fc1d6f9",
1083 | "metadata": {},
1084 | "outputs": [
1085 | {
1086 | "data": {
1087 | "text/plain": [
1088 | "{'name': '_union_test',\n",
1089 | " 'description': 'Mandatory docstring',\n",
1090 | " 'input_schema': {'type': 'object',\n",
1091 | " 'properties': {'opt_tup': {'type': 'object',\n",
1092 | " 'description': '',\n",
1093 | " 'default': None,\n",
1094 | " 'anyOf': [{'type': 'array'}, {'type': 'string'}, {'type': 'integer'}]}},\n",
1095 | " 'title': None}}"
1096 | ]
1097 | },
1098 | "execution_count": null,
1099 | "metadata": {},
1100 | "output_type": "execute_result"
1101 | }
1102 | ],
1103 | "source": [
1104 | "def _union_test(opt_tup: Union[Tuple[int, int], str, int]=None):\n",
1105 | " \"Mandatory docstring\"\n",
1106 | " return \"\"\n",
1107 | "get_schema(_union_test)"
1108 | ]
1109 | },
1110 | {
1111 | "cell_type": "markdown",
1112 | "id": "7641aca8",
1113 | "metadata": {},
1114 | "source": [
1115 | "The new (Python 3.10+) union syntax can also be used, producing an equivalent schema."
1116 | ]
1117 | },
1118 | {
1119 | "cell_type": "code",
1120 | "execution_count": null,
1121 | "id": "a1a11b3b",
1122 | "metadata": {},
1123 | "outputs": [
1124 | {
1125 | "data": {
1126 | "text/plain": [
1127 | "{'name': '_new_union_test',\n",
1128 | " 'description': 'Mandatory docstring',\n",
1129 | " 'input_schema': {'type': 'object',\n",
1130 | " 'properties': {'opt_tup': {'type': 'object',\n",
1131 | " 'description': '',\n",
1132 | " 'default': None,\n",
1133 | " 'anyOf': [{'type': 'array'}, {'type': 'string'}, {'type': 'integer'}]}},\n",
1134 | " 'title': None}}"
1135 | ]
1136 | },
1137 | "execution_count": null,
1138 | "metadata": {},
1139 | "output_type": "execute_result"
1140 | }
1141 | ],
1142 | "source": [
1143 | "def _new_union_test(opt_tup: Tuple[int, int] | str | int =None):\n",
1144 | " \"Mandatory docstring\"\n",
1145 | " pass\n",
1146 | "get_schema(_new_union_test)"
1147 | ]
1148 | },
1149 | {
1150 | "cell_type": "markdown",
1151 | "id": "8d24cc0a",
1152 | "metadata": {},
1153 | "source": [
1154 | "Optional is a special case of union types, limited to two types, one of which is None (mapped to null in JSON schema):"
1155 | ]
1156 | },
1157 | {
1158 | "cell_type": "code",
1159 | "execution_count": null,
1160 | "id": "ac8f3d19",
1161 | "metadata": {},
1162 | "outputs": [
1163 | {
1164 | "data": {
1165 | "text/plain": [
1166 | "{'name': '_optional_test',\n",
1167 | " 'description': 'Mandatory docstring',\n",
1168 | " 'input_schema': {'type': 'object',\n",
1169 | " 'properties': {'opt_tup': {'type': 'object',\n",
1170 | " 'description': '',\n",
1171 | " 'default': None,\n",
1172 | " 'anyOf': [{'type': 'array'}, {'type': 'null'}]}},\n",
1173 | " 'title': None}}"
1174 | ]
1175 | },
1176 | "execution_count": null,
1177 | "metadata": {},
1178 | "output_type": "execute_result"
1179 | }
1180 | ],
1181 | "source": [
1182 | "def _optional_test(opt_tup: Optional[Tuple[int, int]]=None):\n",
1183 | " \"Mandatory docstring\"\n",
1184 | " pass\n",
1185 | "get_schema(_optional_test)"
1186 | ]
1187 | },
1188 | {
1189 | "cell_type": "markdown",
1190 | "id": "c969721b",
1191 | "metadata": {},
1192 | "source": [
1193 | "Containers can also be used, both in their parameterized form (`List[int]`) or as their unparameterized raw type (`List`). In the latter case, the item type is mapped to `object` in JSON schema."
1194 | ]
1195 | },
1196 | {
1197 | "cell_type": "code",
1198 | "execution_count": null,
1199 | "id": "b2959197",
1200 | "metadata": {},
1201 | "outputs": [
1202 | {
1203 | "data": {
1204 | "text/plain": [
1205 | "{'name': '_list_test',\n",
1206 | " 'description': 'Mandatory docstring',\n",
1207 | " 'input_schema': {'type': 'object',\n",
1208 | " 'properties': {'l': {'type': 'array',\n",
1209 | " 'description': '',\n",
1210 | " 'items': {'type': 'integer'}}},\n",
1211 | " 'title': None,\n",
1212 | " 'required': ['l']}}"
1213 | ]
1214 | },
1215 | "execution_count": null,
1216 | "metadata": {},
1217 | "output_type": "execute_result"
1218 | }
1219 | ],
1220 | "source": [
1221 | "def _list_test(l: List[int]):\n",
1222 | " \"Mandatory docstring\"\n",
1223 | " pass\n",
1224 | "get_schema(_list_test)"
1225 | ]
1226 | },
1227 | {
1228 | "cell_type": "code",
1229 | "execution_count": null,
1230 | "id": "c8fbfea7",
1231 | "metadata": {},
1232 | "outputs": [
1233 | {
1234 | "data": {
1235 | "text/plain": [
1236 | "{'name': '_raw_list_test',\n",
1237 | " 'description': 'Mandatory docstring',\n",
1238 | " 'input_schema': {'type': 'object',\n",
1239 | " 'properties': {'l': {'type': 'array',\n",
1240 | " 'description': '',\n",
1241 | " 'items': {'type': 'object'}}},\n",
1242 | " 'title': None,\n",
1243 | " 'required': ['l']}}"
1244 | ]
1245 | },
1246 | "execution_count": null,
1247 | "metadata": {},
1248 | "output_type": "execute_result"
1249 | }
1250 | ],
1251 | "source": [
1252 | "def _raw_list_test(l: List):\n",
1253 | " \"Mandatory docstring\"\n",
1254 | " pass\n",
1255 | "get_schema(_raw_list_test)"
1256 | ]
1257 | },
1258 | {
1259 | "cell_type": "markdown",
1260 | "id": "5704c197",
1261 | "metadata": {},
1262 | "source": [
1263 | "The same applies to dictionary, which can similarly be parameterized with key/value types or specified as a raw type."
1264 | ]
1265 | },
1266 | {
1267 | "cell_type": "code",
1268 | "execution_count": null,
1269 | "id": "b2e8c567",
1270 | "metadata": {},
1271 | "outputs": [
1272 | {
1273 | "data": {
1274 | "text/plain": [
1275 | "{'name': '_dict_test',\n",
1276 | " 'description': 'Mandatory docstring',\n",
1277 | " 'input_schema': {'type': 'object',\n",
1278 | " 'properties': {'d': {'type': 'object',\n",
1279 | " 'description': '',\n",
1280 | " 'additionalProperties': {'type': 'integer'}}},\n",
1281 | " 'title': None,\n",
1282 | " 'required': ['d']}}"
1283 | ]
1284 | },
1285 | "execution_count": null,
1286 | "metadata": {},
1287 | "output_type": "execute_result"
1288 | }
1289 | ],
1290 | "source": [
1291 | "def _dict_test(d: Dict[str, int]):\n",
1292 | " \"Mandatory docstring\"\n",
1293 | " pass\n",
1294 | "get_schema(_dict_test)"
1295 | ]
1296 | },
1297 | {
1298 | "cell_type": "code",
1299 | "execution_count": null,
1300 | "id": "b3138ac4",
1301 | "metadata": {},
1302 | "outputs": [
1303 | {
1304 | "data": {
1305 | "text/plain": [
1306 | "{'name': '_raw_dict_test',\n",
1307 | " 'description': 'Mandatory docstring',\n",
1308 | " 'input_schema': {'type': 'object',\n",
1309 | " 'properties': {'d': {'type': 'object', 'description': ''}},\n",
1310 | " 'title': None,\n",
1311 | " 'required': ['d']}}"
1312 | ]
1313 | },
1314 | "execution_count": null,
1315 | "metadata": {},
1316 | "output_type": "execute_result"
1317 | }
1318 | ],
1319 | "source": [
1320 | "def _raw_dict_test(d: Dict):\n",
1321 | " \"Mandatory docstring\"\n",
1322 | "get_schema(_raw_dict_test)"
1323 | ]
1324 | },
1325 | {
1326 | "cell_type": "markdown",
1327 | "id": "9529d39a",
1328 | "metadata": {},
1329 | "source": [
1330 | "### Python tool"
1331 | ]
1332 | },
1333 | {
1334 | "cell_type": "markdown",
1335 | "id": "7a69cad9",
1336 | "metadata": {},
1337 | "source": [
1338 | "In language model clients it's often useful to have a 'code interpreter' -- this is something that runs code, and generally outputs the result of the last expression (i.e like IPython or Jupyter). \n",
1339 | "\n",
1340 | "In this section we'll create the `python` function, which executes a string as Python code, with an optional timeout. If the last line is an expression, we'll return that -- just like in IPython or Jupyter, but without needing them installed."
1341 | ]
1342 | },
1343 | {
1344 | "cell_type": "code",
1345 | "execution_count": null,
1346 | "id": "873000d7",
1347 | "metadata": {},
1348 | "outputs": [],
1349 | "source": [
1350 | "#| exports\n",
1351 | "import ast, time, signal, traceback\n",
1352 | "from fastcore.utils import *"
1353 | ]
1354 | },
1355 | {
1356 | "cell_type": "code",
1357 | "execution_count": null,
1358 | "id": "4703296a",
1359 | "metadata": {},
1360 | "outputs": [],
1361 | "source": [
1362 | "#| exports\n",
1363 | "def _copy_loc(new, orig):\n",
1364 | " \"Copy location information from original node to new node and all children.\"\n",
1365 | " new = ast.copy_location(new, orig)\n",
1366 | " for field, o in ast.iter_fields(new):\n",
1367 | " if isinstance(o, ast.AST): setattr(new, field, _copy_loc(o, orig))\n",
1368 | " elif isinstance(o, list): setattr(new, field, [_copy_loc(value, orig) for value in o])\n",
1369 | " return new"
1370 | ]
1371 | },
1372 | {
1373 | "cell_type": "markdown",
1374 | "id": "6c0d4922",
1375 | "metadata": {},
1376 | "source": [
1377 | "This is an internal function that's needed for `_run` to ensure that location information is available in the abstract syntax tree (AST), since otherwise python complains."
1378 | ]
1379 | },
1380 | {
1381 | "cell_type": "code",
1382 | "execution_count": null,
1383 | "id": "1574585f",
1384 | "metadata": {},
1385 | "outputs": [],
1386 | "source": [
1387 | "#| exports\n",
1388 | "def _run(code:str, glb:dict=None, loc:dict=None):\n",
1389 | " \"Run `code`, returning final expression (similar to IPython)\"\n",
1390 | " tree = ast.parse(code)\n",
1391 | " last_node = tree.body[-1] if tree.body else None\n",
1392 | " \n",
1393 | " # If the last node is an expression, modify the AST to capture the result\n",
1394 | " if isinstance(last_node, ast.Expr):\n",
1395 | " tgt = [ast.Name(id='_result', ctx=ast.Store())]\n",
1396 | " assign_node = ast.Assign(targets=tgt, value=last_node.value)\n",
1397 | " tree.body[-1] = _copy_loc(assign_node, last_node)\n",
1398 | "\n",
1399 | " compiled_code = compile(tree, filename='', mode='exec')\n",
1400 | " glb = glb or {}\n",
1401 | " stdout_buffer = io.StringIO()\n",
1402 | " saved_stdout = sys.stdout\n",
1403 | " sys.stdout = stdout_buffer\n",
1404 | " try: exec(compiled_code, glb, loc)\n",
1405 | " finally: sys.stdout = saved_stdout\n",
1406 | " _result = glb.get('_result', None)\n",
1407 | " if _result is not None: return _result\n",
1408 | " return stdout_buffer.getvalue().strip()"
1409 | ]
1410 | },
1411 | {
1412 | "cell_type": "markdown",
1413 | "id": "92ca7f47",
1414 | "metadata": {},
1415 | "source": [
1416 | "This is the internal function used to actually run the code -- we pull off the last AST to see if it's an expression (i.e something that returns a value), and if so, we store it to a special `_result` variable so we can return it."
1417 | ]
1418 | },
1419 | {
1420 | "cell_type": "code",
1421 | "execution_count": null,
1422 | "id": "15b72cb2",
1423 | "metadata": {},
1424 | "outputs": [
1425 | {
1426 | "data": {
1427 | "text/plain": [
1428 | "479001600"
1429 | ]
1430 | },
1431 | "execution_count": null,
1432 | "metadata": {},
1433 | "output_type": "execute_result"
1434 | }
1435 | ],
1436 | "source": [
1437 | "_run('import math;math.factorial(12)')"
1438 | ]
1439 | },
1440 | {
1441 | "cell_type": "code",
1442 | "execution_count": null,
1443 | "id": "632a7ac1",
1444 | "metadata": {},
1445 | "outputs": [
1446 | {
1447 | "data": {
1448 | "text/plain": [
1449 | "'2'"
1450 | ]
1451 | },
1452 | "execution_count": null,
1453 | "metadata": {},
1454 | "output_type": "execute_result"
1455 | }
1456 | ],
1457 | "source": [
1458 | "_run('print(1+1)')"
1459 | ]
1460 | },
1461 | {
1462 | "cell_type": "markdown",
1463 | "id": "34f2e5c2",
1464 | "metadata": {},
1465 | "source": [
1466 | "We now have the machinery needed to create our `python` function."
1467 | ]
1468 | },
1469 | {
1470 | "cell_type": "code",
1471 | "execution_count": null,
1472 | "id": "81857615",
1473 | "metadata": {},
1474 | "outputs": [],
1475 | "source": [
1476 | "#| exports\n",
1477 | "def python(code:str, # Code to execute\n",
1478 | " glb:Optional[dict]=None, # Globals namespace\n",
1479 | " loc:Optional[dict]=None, # Locals namespace\n",
1480 | " timeout:int=3600 # Maximum run time in seconds before a `TimeoutError` is raised\n",
1481 | " ): # Result of last node, if it's an expression, or `None` otherwise\n",
1482 | " \"\"\"Executes python `code` with `timeout` and returning final expression (similar to IPython).\n",
1483 | " Raised exceptions are returned as a string, with a stack trace.\"\"\"\n",
1484 | " def handler(*args): raise TimeoutError()\n",
1485 | " if glb is None: glb = inspect.currentframe().f_back.f_globals\n",
1486 | " if loc is None: loc=glb\n",
1487 | " signal.signal(signal.SIGALRM, handler)\n",
1488 | " signal.alarm(timeout)\n",
1489 | " try: return _run(code, glb, loc)\n",
1490 | " except Exception as e: return traceback.format_exc()\n",
1491 | " finally: signal.alarm(0)"
1492 | ]
1493 | },
1494 | {
1495 | "cell_type": "markdown",
1496 | "id": "b6b9324f",
1497 | "metadata": {},
1498 | "source": [
1499 | "There's no builtin security here -- you should generally use this in a sandbox, or alternatively prompt before running code. It can handle multiline function definitions, and pretty much any other normal Python syntax."
1500 | ]
1501 | },
1502 | {
1503 | "cell_type": "code",
1504 | "execution_count": null,
1505 | "id": "69d74f4d",
1506 | "metadata": {},
1507 | "outputs": [
1508 | {
1509 | "data": {
1510 | "text/plain": [
1511 | "120"
1512 | ]
1513 | },
1514 | "execution_count": null,
1515 | "metadata": {},
1516 | "output_type": "execute_result"
1517 | }
1518 | ],
1519 | "source": [
1520 | "python(\"\"\"def factorial(n):\n",
1521 | " if n == 0 or n == 1: return 1\n",
1522 | " else: return n * factorial(n-1)\n",
1523 | "factorial(5)\"\"\")"
1524 | ]
1525 | },
1526 | {
1527 | "cell_type": "markdown",
1528 | "id": "6c629442",
1529 | "metadata": {},
1530 | "source": [
1531 | "If the code takes longer than `timeout` then it returns an error string."
1532 | ]
1533 | },
1534 | {
1535 | "cell_type": "code",
1536 | "execution_count": null,
1537 | "id": "fcb472b3",
1538 | "metadata": {},
1539 | "outputs": [
1540 | {
1541 | "name": "stdout",
1542 | "output_type": "stream",
1543 | "text": [
1544 | "Traceback (most recent call last):\n",
1545 | " File \"/var/folders/5c/jls7k26j1tq6l03cl_kvpnwh0000gn/T/ipykernel_97636/2963369439.py\", line 14, in python\n",
1546 | " try: return _run(code, glb, loc)\n",
1547 | " ^^^^^^^^^^^^^^^^^^^^\n",
1548 | " File \"/var/folders/5c/jls7k26j1tq6l03cl_kvpnwh0000gn/T/ipykernel_97636/1858893181.py\", line 18, in _run\n",
1549 | " try: exec(compiled_code, glb, loc)\n",
1550 | " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
1551 | " File \"\", line 1, in \n",
1552 | " File \"/var/folders/5c/jls7k26j1tq6l03cl_kvpnwh0000gn/T/ipykernel_97636/2963369439.py\", line 9, in handler\n",
1553 | " def handler(*args): raise TimeoutError()\n",
1554 | " ^^^^^^^^^^^^^^^^^^^^\n",
1555 | "TimeoutError\n",
1556 | "\n"
1557 | ]
1558 | }
1559 | ],
1560 | "source": [
1561 | "print(python('import time; time.sleep(10)', timeout=1))"
1562 | ]
1563 | },
1564 | {
1565 | "cell_type": "markdown",
1566 | "id": "d45684c1",
1567 | "metadata": {},
1568 | "source": [
1569 | "By default the caller's global namespace is used."
1570 | ]
1571 | },
1572 | {
1573 | "cell_type": "code",
1574 | "execution_count": null,
1575 | "id": "72dfe290",
1576 | "metadata": {},
1577 | "outputs": [
1578 | {
1579 | "data": {
1580 | "text/plain": [
1581 | "1"
1582 | ]
1583 | },
1584 | "execution_count": null,
1585 | "metadata": {},
1586 | "output_type": "execute_result"
1587 | }
1588 | ],
1589 | "source": [
1590 | "python(\"a=1\")\n",
1591 | "a"
1592 | ]
1593 | },
1594 | {
1595 | "cell_type": "markdown",
1596 | "id": "bf48557c",
1597 | "metadata": {},
1598 | "source": [
1599 | "Pass a different `glb` if needed."
1600 | ]
1601 | },
1602 | {
1603 | "cell_type": "code",
1604 | "execution_count": null,
1605 | "id": "55fb5613",
1606 | "metadata": {},
1607 | "outputs": [
1608 | {
1609 | "data": {
1610 | "text/plain": [
1611 | "(1, 3)"
1612 | ]
1613 | },
1614 | "execution_count": null,
1615 | "metadata": {},
1616 | "output_type": "execute_result"
1617 | }
1618 | ],
1619 | "source": [
1620 | "glb = {}\n",
1621 | "python(\"a=3\", glb)\n",
1622 | "a, glb['a']"
1623 | ]
1624 | },
1625 | {
1626 | "cell_type": "markdown",
1627 | "id": "244c502e",
1628 | "metadata": {},
1629 | "source": [
1630 | "### Tool Calling"
1631 | ]
1632 | },
1633 | {
1634 | "cell_type": "markdown",
1635 | "id": "186408f8",
1636 | "metadata": {},
1637 | "source": [
1638 | "Many LLM API providers offer tool calling where an LLM can choose to call a given tool. This is also helpful for structured outputs since the response from the LLM is constrained to the required arguments of the tool.\n",
1639 | "\n",
1640 | "This section will be dedicated to helper functions for calling tools. We don't want to allow LLMs to call just any possible function (that would be a security disaster!) so we create a namespace -- that is, a dictionary of allowable function names to call."
1641 | ]
1642 | },
1643 | {
1644 | "cell_type": "code",
1645 | "execution_count": null,
1646 | "id": "782c4415",
1647 | "metadata": {},
1648 | "outputs": [],
1649 | "source": [
1650 | "#| export\n",
1651 | "def mk_ns(*funcs_or_objs):\n",
1652 | " merged = {}\n",
1653 | " for o in funcs_or_objs:\n",
1654 | " if isinstance(o, type): merged |= {n:getattr(o,n) for n,m in o.__dict__.items() if isinstance(m, (staticmethod, classmethod))}\n",
1655 | " if isinstance(o, object): merged |= {n:getattr(o,n) for n, m in inspect.getmembers(o, inspect.ismethod)} | {n:m for n,m in o.__class__.__dict__.items() if isinstance(m, staticmethod)}\n",
1656 | " if callable(o) and hasattr(o, '__name__'): merged |= {o.__name__: o}\n",
1657 | " return merged"
1658 | ]
1659 | },
1660 | {
1661 | "cell_type": "code",
1662 | "execution_count": null,
1663 | "id": "5947aac4",
1664 | "metadata": {},
1665 | "outputs": [
1666 | {
1667 | "data": {
1668 | "text/plain": [
1669 | "{'sums': }"
1670 | ]
1671 | },
1672 | "execution_count": null,
1673 | "metadata": {},
1674 | "output_type": "execute_result"
1675 | }
1676 | ],
1677 | "source": [
1678 | "def sums(a, b): return a + b\n",
1679 | "ns = mk_ns(sums); ns"
1680 | ]
1681 | },
1682 | {
1683 | "cell_type": "code",
1684 | "execution_count": null,
1685 | "id": "86ce0458",
1686 | "metadata": {},
1687 | "outputs": [
1688 | {
1689 | "data": {
1690 | "text/plain": [
1691 | "3"
1692 | ]
1693 | },
1694 | "execution_count": null,
1695 | "metadata": {},
1696 | "output_type": "execute_result"
1697 | }
1698 | ],
1699 | "source": [
1700 | "ns['sums'](1, 2)"
1701 | ]
1702 | },
1703 | {
1704 | "cell_type": "code",
1705 | "execution_count": null,
1706 | "id": "29d22f82",
1707 | "metadata": {},
1708 | "outputs": [],
1709 | "source": [
1710 | "class Dummy:\n",
1711 | " def __init__(self,a): self.a = a\n",
1712 | " def __call__(self): return self.a\n",
1713 | " def sums(self, a, b): return a + b\n",
1714 | " @staticmethod\n",
1715 | " def subs(a, b): return a - b\n",
1716 | " @classmethod\n",
1717 | " def mults(cls, a, b): return a * b"
1718 | ]
1719 | },
1720 | {
1721 | "cell_type": "code",
1722 | "execution_count": null,
1723 | "id": "ca50b957",
1724 | "metadata": {},
1725 | "outputs": [
1726 | {
1727 | "data": {
1728 | "text/plain": [
1729 | "{'subs': ,\n",
1730 | " 'mults': >,\n",
1731 | " 'Dummy': __main__.Dummy}"
1732 | ]
1733 | },
1734 | "execution_count": null,
1735 | "metadata": {},
1736 | "output_type": "execute_result"
1737 | }
1738 | ],
1739 | "source": [
1740 | "ns = mk_ns(Dummy); ns"
1741 | ]
1742 | },
1743 | {
1744 | "cell_type": "code",
1745 | "execution_count": null,
1746 | "id": "59ef734f",
1747 | "metadata": {},
1748 | "outputs": [
1749 | {
1750 | "data": {
1751 | "text/plain": [
1752 | "(-1, 6)"
1753 | ]
1754 | },
1755 | "execution_count": null,
1756 | "metadata": {},
1757 | "output_type": "execute_result"
1758 | }
1759 | ],
1760 | "source": [
1761 | "ns['subs'](1, 2), ns['mults'](3, 2)"
1762 | ]
1763 | },
1764 | {
1765 | "cell_type": "code",
1766 | "execution_count": null,
1767 | "id": "15871e6d",
1768 | "metadata": {},
1769 | "outputs": [
1770 | {
1771 | "data": {
1772 | "text/plain": [
1773 | "{'__call__': >,\n",
1774 | " '__init__': >,\n",
1775 | " 'mults': >,\n",
1776 | " 'sums': >,\n",
1777 | " 'subs': )>}"
1778 | ]
1779 | },
1780 | "execution_count": null,
1781 | "metadata": {},
1782 | "output_type": "execute_result"
1783 | }
1784 | ],
1785 | "source": [
1786 | "d = Dummy(10)\n",
1787 | "ns = mk_ns(d); ns"
1788 | ]
1789 | },
1790 | {
1791 | "cell_type": "code",
1792 | "execution_count": null,
1793 | "id": "13cb7685",
1794 | "metadata": {},
1795 | "outputs": [
1796 | {
1797 | "data": {
1798 | "text/plain": [
1799 | "(-1, 6, 5, 10)"
1800 | ]
1801 | },
1802 | "execution_count": null,
1803 | "metadata": {},
1804 | "output_type": "execute_result"
1805 | }
1806 | ],
1807 | "source": [
1808 | "ns['subs'](1, 2), ns['mults'](3, 2), ns['sums'](3, 2), ns['__call__']()"
1809 | ]
1810 | },
1811 | {
1812 | "cell_type": "code",
1813 | "execution_count": null,
1814 | "id": "2dfe13ae",
1815 | "metadata": {},
1816 | "outputs": [
1817 | {
1818 | "data": {
1819 | "text/plain": [
1820 | "(None, -99)"
1821 | ]
1822 | },
1823 | "execution_count": null,
1824 | "metadata": {},
1825 | "output_type": "execute_result"
1826 | }
1827 | ],
1828 | "source": [
1829 | "ns['__init__'](-99), ns['__call__']()"
1830 | ]
1831 | },
1832 | {
1833 | "cell_type": "code",
1834 | "execution_count": null,
1835 | "id": "85b4734f",
1836 | "metadata": {},
1837 | "outputs": [],
1838 | "source": [
1839 | "#| exports\n",
1840 | "def call_func(fc_name, fc_inputs, ns, raise_on_err=True):\n",
1841 | " \"Call the function `fc_name` with the given `fc_inputs` using namespace `ns`.\"\n",
1842 | " if not isinstance(ns, abc.Mapping): ns = mk_ns(*ns)\n",
1843 | " func = ns[fc_name]\n",
1844 | " try: return func(**fc_inputs)\n",
1845 | " except Exception as e:\n",
1846 | " if raise_on_err: raise e\n",
1847 | " else: return traceback.format_exc()"
1848 | ]
1849 | },
1850 | {
1851 | "cell_type": "markdown",
1852 | "id": "ce9cce60",
1853 | "metadata": {},
1854 | "source": [
1855 | "Now when an LLM responds with the tool to use and its inputs, we can simply use the same namespace it was given to look up the tool and call it."
1856 | ]
1857 | },
1858 | {
1859 | "cell_type": "code",
1860 | "execution_count": null,
1861 | "id": "f2ade8a8",
1862 | "metadata": {},
1863 | "outputs": [
1864 | {
1865 | "data": {
1866 | "text/plain": [
1867 | "3"
1868 | ]
1869 | },
1870 | "execution_count": null,
1871 | "metadata": {},
1872 | "output_type": "execute_result"
1873 | }
1874 | ],
1875 | "source": [
1876 | "call_func('sums', {'a': 1, 'b': 2}, ns=[sums])"
1877 | ]
1878 | },
1879 | {
1880 | "cell_type": "code",
1881 | "execution_count": null,
1882 | "id": "9aace64a",
1883 | "metadata": {},
1884 | "outputs": [
1885 | {
1886 | "data": {
1887 | "text/plain": [
1888 | "-1"
1889 | ]
1890 | },
1891 | "execution_count": null,
1892 | "metadata": {},
1893 | "output_type": "execute_result"
1894 | }
1895 | ],
1896 | "source": [
1897 | "call_func('subs', {'a': 1, 'b': 2}, ns=mk_ns(d))"
1898 | ]
1899 | },
1900 | {
1901 | "cell_type": "code",
1902 | "execution_count": null,
1903 | "id": "6c93c0ef",
1904 | "metadata": {},
1905 | "outputs": [],
1906 | "source": [
1907 | "assert \"unsupported operand type(s) for -: 'int' and 'str'\" in call_func('subs', {'a': 1, 'b': '3'}, ns=mk_ns(d), raise_on_err=False)"
1908 | ]
1909 | },
1910 | {
1911 | "cell_type": "code",
1912 | "execution_count": null,
1913 | "id": "85489c3d",
1914 | "metadata": {},
1915 | "outputs": [],
1916 | "source": [
1917 | "test_fail(call_func, args=['subs', {'a': 1, 'b': '3'}], kwargs={'ns': mk_ns(d)})"
1918 | ]
1919 | },
1920 | {
1921 | "cell_type": "code",
1922 | "execution_count": null,
1923 | "id": "b19298ac",
1924 | "metadata": {},
1925 | "outputs": [],
1926 | "source": [
1927 | "%%ai\n",
1928 | "How do I get the whole traceback of an error instead of just str(e) like above?"
1929 | ]
1930 | },
1931 | {
1932 | "cell_type": "markdown",
1933 | "id": "6ec89b42",
1934 | "metadata": {},
1935 | "source": [
1936 | "To get the whole traceback of an error instead of just `str(e)`, you can use the `traceback` module, which you've already imported in your code. Modify the `call_func` function to capture and return the full traceback when an error occurs:\n",
1937 | "\n",
1938 | "```python\n",
1939 | "#| exports\n",
1940 | "def call_func(fc_name, fc_inputs, ns, raise_on_err=True):\n",
1941 | " \"Call the function `fc_name` with the given `fc_inputs` using namespace `ns`.\"\n",
1942 | " if not isinstance(ns, abc.Mapping): ns = mk_ns(*ns)\n",
1943 | " func = ns[fc_name]\n",
1944 | " try: return func(**fc_inputs)\n",
1945 | " except Exception as e:\n",
1946 | " if raise_on_err: raise e\n",
1947 | " else: return traceback.format_exc()\n",
1948 | "```\n",
1949 | "\n",
1950 | "This replaces `str(e)` with `traceback.format_exc()`, which returns the full traceback as a string, including the error type, message, and the call stack that led to the error. This gives you much more context about where and why the error occurred."
1951 | ]
1952 | },
1953 | {
1954 | "cell_type": "markdown",
1955 | "id": "591574b8-6b53-4908-8159-b87be42133f7",
1956 | "metadata": {},
1957 | "source": [
1958 | "### Async function calling"
1959 | ]
1960 | },
1961 | {
1962 | "cell_type": "markdown",
1963 | "id": "96a3a7d3-31ef-4cc6-b47c-35eaa8bbff8b",
1964 | "metadata": {},
1965 | "source": [
1966 | "Since tools defined by MCP servers are async functions, it is probably a good idea to have an async version of `call_func`."
1967 | ]
1968 | },
1969 | {
1970 | "cell_type": "code",
1971 | "execution_count": null,
1972 | "id": "e273507b-6e4b-40bb-ae23-6397e89a4d51",
1973 | "metadata": {},
1974 | "outputs": [
1975 | {
1976 | "data": {
1977 | "text/plain": [
1978 | "{'asums': }"
1979 | ]
1980 | },
1981 | "execution_count": null,
1982 | "metadata": {},
1983 | "output_type": "execute_result"
1984 | }
1985 | ],
1986 | "source": [
1987 | "async def asums(a, b): return a + b\n",
1988 | "ns = mk_ns(asums); ns"
1989 | ]
1990 | },
1991 | {
1992 | "cell_type": "code",
1993 | "execution_count": null,
1994 | "id": "7ac04e80-7bb9-4b52-8285-454684605d47",
1995 | "metadata": {},
1996 | "outputs": [],
1997 | "source": [
1998 | "#| exports\n",
1999 | "async def call_func_async(fc_name, fc_inputs, ns, raise_on_err=True):\n",
2000 | " \"Awaits the function `fc_name` with the given `fc_inputs` using namespace `ns`.\"\n",
2001 | " res = call_func(fc_name, fc_inputs, ns, raise_on_err=raise_on_err)\n",
2002 | " if inspect.iscoroutine(res):\n",
2003 | " try: res = await res\n",
2004 | " except Exception as e:\n",
2005 | " if raise_on_err: raise e\n",
2006 | " else: return traceback.format_exc()\n",
2007 | " return res"
2008 | ]
2009 | },
2010 | {
2011 | "cell_type": "code",
2012 | "execution_count": null,
2013 | "id": "b83998ac-68e2-4dbe-b594-65fb4fdf59b8",
2014 | "metadata": {},
2015 | "outputs": [
2016 | {
2017 | "data": {
2018 | "text/plain": [
2019 | "3"
2020 | ]
2021 | },
2022 | "execution_count": null,
2023 | "metadata": {},
2024 | "output_type": "execute_result"
2025 | }
2026 | ],
2027 | "source": [
2028 | "await call_func_async('asums', {'a': 1, 'b': 2}, ns=[asums])"
2029 | ]
2030 | },
2031 | {
2032 | "cell_type": "code",
2033 | "execution_count": null,
2034 | "id": "91092ee9",
2035 | "metadata": {},
2036 | "outputs": [],
2037 | "source": [
2038 | "test_eq(await call_func_async('asums', {'a': 1, 'b': '2'}, ns=[asums], raise_on_err=False), \"unsupported operand type(s) for +: 'int' and 'str'\")"
2039 | ]
2040 | },
2041 | {
2042 | "cell_type": "code",
2043 | "execution_count": null,
2044 | "id": "a06776cf",
2045 | "metadata": {},
2046 | "outputs": [],
2047 | "source": [
2048 | "ex = False\n",
2049 | "try: await call_func_async('asums', {'a': 1, 'b': '2'}, ns=[asums], raise_on_err=True)\n",
2050 | "except: ex = True\n",
2051 | "assert ex"
2052 | ]
2053 | },
2054 | {
2055 | "cell_type": "markdown",
2056 | "id": "94ec4289",
2057 | "metadata": {},
2058 | "source": [
2059 | "## Export -"
2060 | ]
2061 | },
2062 | {
2063 | "cell_type": "code",
2064 | "execution_count": null,
2065 | "id": "1e9ee5c1",
2066 | "metadata": {},
2067 | "outputs": [],
2068 | "source": [
2069 | "#|hide\n",
2070 | "#|eval: false\n",
2071 | "from nbdev.doclinks import nbdev_export\n",
2072 | "nbdev_export()"
2073 | ]
2074 | },
2075 | {
2076 | "cell_type": "code",
2077 | "execution_count": null,
2078 | "id": "9cf037e0",
2079 | "metadata": {},
2080 | "outputs": [],
2081 | "source": []
2082 | }
2083 | ],
2084 | "metadata": {
2085 | "kernelspec": {
2086 | "display_name": "python3",
2087 | "language": "python",
2088 | "name": "python3"
2089 | }
2090 | },
2091 | "nbformat": 4,
2092 | "nbformat_minor": 5
2093 | }
2094 |
--------------------------------------------------------------------------------
/02_shell.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "efe78920",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "#|default_exp shell"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "3d773712-12fe-440e-891f-36f59666dfde",
16 | "metadata": {},
17 | "source": [
18 | "# shell source"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "1328ef69",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "#| exports\n",
29 | "import ast, time, signal, traceback\n",
30 | "from fastcore.utils import *"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "id": "481b4368",
36 | "metadata": {},
37 | "source": [
38 | "`get_shell` is like `python`, except it also maintains a stateful interpreter, rather than just running a single line of code. This is implemented using IPython, so that must be installed."
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "id": "6bbf062d",
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "#| exports\n",
49 | "from IPython.terminal.interactiveshell import TerminalInteractiveShell\n",
50 | "from IPython.utils.capture import capture_output"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "id": "d3d04ec5",
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "def exception2str(ex:Exception)->str:\n",
61 | " \"Convert exception `ex` into a string\"\n",
62 | " return ''.join(traceback.format_exception(type(ex), ex, ex.__traceback__))"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "id": "d6ba32b4",
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "Traceback (most recent call last):\n",
76 | " File \"/var/folders/ss/34z569j921v58v8n1n_8z7h40000gn/T/ipykernel_37260/4058275565.py\", line 1, in \n",
77 | " try: print(1/0)\n",
78 | " ~^~\n",
79 | "ZeroDivisionError: division by zero\n",
80 | "\n"
81 | ]
82 | }
83 | ],
84 | "source": [
85 | "try: print(1/0)\n",
86 | "except Exception as e: print(exception2str(e))"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "id": "34099c2f",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "#| exports\n",
97 | "TerminalInteractiveShell.orig_run = TerminalInteractiveShell.run_cell"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "id": "d6aa8e7b",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "#| exports\n",
108 | "@patch\n",
109 | "def run_cell(self:TerminalInteractiveShell, cell, timeout=None):\n",
110 | " \"Wrapper for original `run_cell` which adds timeout and output capture\"\n",
111 | " if timeout:\n",
112 | " def handler(*args): raise TimeoutError()\n",
113 | " signal.signal(signal.SIGALRM, handler)\n",
114 | " signal.alarm(timeout)\n",
115 | " try:\n",
116 | " with capture_output() as io: result = self.orig_run(cell)\n",
117 | " result.stdout = io.stdout\n",
118 | " return result\n",
119 | " except TimeoutException as e:\n",
120 | " result = self.ExecutionResult(error_before_exec=None, error_in_exec=e)\n",
121 | " finally:\n",
122 | " if timeout: signal.alarm(0)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": null,
128 | "id": "cdadbb12",
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "#| exports\n",
133 | "def get_shell()->TerminalInteractiveShell:\n",
134 | " \"Get a `TerminalInteractiveShell` with minimal functionality\"\n",
135 | " sh = TerminalInteractiveShell()\n",
136 | " sh.logger.log_output = sh.history_manager.enabled = False\n",
137 | " dh = sh.displayhook\n",
138 | " dh.finish_displayhook = dh.write_output_prompt = dh.start_displayhook = lambda: None\n",
139 | " dh.write_format_data = lambda format_dict, md_dict=None: None\n",
140 | " sh.logstart = sh.automagic = sh.autoindent = False\n",
141 | " sh.autocall = 0\n",
142 | " sh.system = lambda cmd: None\n",
143 | " return sh"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "id": "5ffbe57e",
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "shell = get_shell()"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "id": "b03b78b3",
160 | "metadata": {},
161 | "outputs": [
162 | {
163 | "data": {
164 | "text/plain": [
165 | "(2, '3\\n')"
166 | ]
167 | },
168 | "execution_count": null,
169 | "metadata": {},
170 | "output_type": "execute_result"
171 | }
172 | ],
173 | "source": [
174 | "r = shell.run_cell('print(3); 1+1')\n",
175 | "r.result,r.stdout"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "48849fc3",
182 | "metadata": {},
183 | "outputs": [
184 | {
185 | "name": "stdout",
186 | "output_type": "stream",
187 | "text": [
188 | "Traceback (most recent call last):\n",
189 | " File \"/Users/jhoward/miniconda3/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3577, in run_code\n",
190 | " exec(code_obj, self.user_global_ns, self.user_ns)\n",
191 | " File \"\", line 1, in \n",
192 | " raise Exception(\"blah\")\n",
193 | "Exception: blah\n",
194 | "\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "r = shell.run_cell('raise Exception(\"blah\")')\n",
200 | "print(exception2str(r.error_in_exec))"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "id": "ddabea6d",
207 | "metadata": {},
208 | "outputs": [
209 | {
210 | "data": {
211 | "text/plain": [
212 | "TimeoutError()"
213 | ]
214 | },
215 | "execution_count": null,
216 | "metadata": {},
217 | "output_type": "execute_result"
218 | }
219 | ],
220 | "source": [
221 | "r = shell.run_cell('import time; time.sleep(10)', timeout=1)\n",
222 | "r.error_in_exec"
223 | ]
224 | },
225 | {
226 | "cell_type": "markdown",
227 | "id": "94ec4289",
228 | "metadata": {},
229 | "source": [
230 | "## Export -"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "id": "1e9ee5c1",
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "#|hide\n",
241 | "#|eval: false\n",
242 | "from nbdev.doclinks import nbdev_export\n",
243 | "nbdev_export()"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "id": "207f9715",
250 | "metadata": {},
251 | "outputs": [],
252 | "source": []
253 | }
254 | ],
255 | "metadata": {
256 | "kernelspec": {
257 | "display_name": "python3",
258 | "language": "python",
259 | "name": "python3"
260 | }
261 | },
262 | "nbformat": 4,
263 | "nbformat_minor": 5
264 | }
265 |
--------------------------------------------------------------------------------
/03_download.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "92c3dff2",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "#| default_exp download"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "id": "1d533800",
16 | "metadata": {},
17 | "source": [
18 | "# Download helpers\n",
19 | "\n",
20 | "- Download and process LLM-ready documents"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "id": "e58d8c43",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#| export\n",
31 | "from fastcore.utils import *\n",
32 | "from httpx import get\n",
33 | "from fastcore.meta import delegates\n",
34 | "from urllib.parse import urlparse, urljoin"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "id": "30199708",
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "from IPython.display import Markdown,HTML\n",
45 | "from fastcore.test import *"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "id": "95c4cab1",
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "#| export\n",
56 | "def clean_md(text, rm_comments=True, rm_details=True):\n",
57 | " \"Remove comments and `` sections from `text`\"\n",
58 | " if rm_comments: text = re.sub(r'\\n?\\n?', '', text, flags=re.DOTALL)\n",
59 | " if rm_details: text = re.sub(r'\\n?.*? \\n?', '', text, flags=re.DOTALL)\n",
60 | " return text"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "id": "0f3d5c69",
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "#| export\n",
71 | "@delegates(get)\n",
72 | "def read_md(url, rm_comments=True, rm_details=True, **kwargs):\n",
73 | " \"Read text from `url` and clean with `clean_docs`\"\n",
74 | " return clean_md(get(url, **kwargs).text, rm_comments=rm_comments, rm_details=rm_details)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "id": "478d5508",
81 | "metadata": {},
82 | "outputs": [],
83 | "source": [
84 | "mdurl = 'https://claudette.answer.ai/index.html.md'\n",
85 | "md = read_md(mdurl)\n",
86 | "# Markdown(md)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "id": "d8d61937",
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "#| export\n",
97 | "def html2md(s:str, ignore_links=True):\n",
98 | " \"Convert `s` from HTML to markdown\"\n",
99 | " import html2text\n",
100 | " o = html2text.HTML2Text(bodywidth=5000)\n",
101 | " o.ignore_links = ignore_links\n",
102 | " o.mark_code = True\n",
103 | " o.ignore_images = True\n",
104 | " return o.handle(s)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "id": "5e897053",
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "#| export\n",
115 | "def read_html(url, # URL to read\n",
116 | " sel=None, # Read only outerHTML of CSS selector `sel`\n",
117 | " rm_comments=True, # Removes HTML comments\n",
118 | " rm_details=True, # Removes `` tags\n",
119 | " multi=False, # Get all matches to `sel` or first one \n",
120 | " wrap_tag=None, #If multi, each selection wrapped with content\n",
121 | " ignore_links=True,\n",
122 | " ): # Cleaned markdown\n",
123 | " \"Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown\"\n",
124 | " page = get(url).text\n",
125 | " if sel:\n",
126 | " from bs4 import BeautifulSoup\n",
127 | " soup = BeautifulSoup(page, 'html.parser')\n",
128 | " if multi:\n",
129 | " page = [str(el) for el in soup.select(sel)]\n",
130 | " if not wrap_tag: page = \"\\n\".join(page)\n",
131 | " else: page = str(soup.select_one(sel))\n",
132 | " mds = map(lambda x: clean_md(html2md(x, ignore_links=ignore_links), rm_comments, rm_details=rm_details), tuplify(page))\n",
133 | " if wrap_tag: return '\\n'.join([f\"\\n<{wrap_tag}>\\n{o}{wrap_tag}>\\n\" for o in mds])\n",
134 | " else: return'\\n'.join(mds)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "id": "1d07c687",
141 | "metadata": {},
142 | "outputs": [],
143 | "source": [
144 | "# test single class selector\n",
145 | "listings = read_html('https://www.answer.ai/', sel='.listing-description')\n",
146 | "assert len(listings) < 500\n",
147 | "\n",
148 | "# Test multi class selector\n",
149 | "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True)\n",
150 | "assert len(listings) > 1000 # returns more than single so selecting multi\n",
151 | "\n",
152 | "# Test multi_wrap_tag\n",
153 | "listings = read_html('https://www.answer.ai/', sel='.listing-description', multi=True, wrap_tag='document')\n",
154 | "assert '' in listings and '' in listings "
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "id": "20188898",
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "'[My experience learning GPU programming, and implementing a new GPU education app in the process](./posts/2025-03-17-gpu-programming-scratch.html)\\n\\n'"
167 | ]
168 | },
169 | "execution_count": null,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "read_html('https://www.answer.ai/', sel='.listing-description', ignore_links=False)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "7406a52d",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "# test tag css selectors\n",
186 | "assert len(read_html('https://www.answer.ai/', sel='div.listing-description', multi=True)) > 1000\n",
187 | "assert len(read_html('https://www.answer.ai/', sel='div', multi=True)) > 1000"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "id": "8f25e767",
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'\n",
198 | "hmd = read_html(htmlurl)\n",
199 | "assert len(hmd) > 100\n",
200 | "# Markdown(hmd)"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "id": "066b5532",
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "#| export\n",
211 | "def get_llmstxt(url, optional=False, n_workers=None):\n",
212 | " \"Get llms.txt file from and expand it with `llms_txt.create_ctx()`\"\n",
213 | " if not url.endswith('llms.txt'): return None\n",
214 | " import llms_txt\n",
215 | " resp = get(url)\n",
216 | " if resp.status_code!=200: return None\n",
217 | " return llms_txt.create_ctx(resp.text, optional=optional, n_workers=n_workers)"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "id": "2c370bf2",
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "# print(get_llmstxt('https://llmstxt.org/llms.txt'))"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "id": "a2fc5a55",
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "#| export\n",
238 | "def split_url(url):\n",
239 | " \"Split `url` into base, path, and file name, normalising name to '/' if empty\"\n",
240 | " parsed = urlparse(url.strip('/'))\n",
241 | " base = f\"{parsed.scheme}://{parsed.netloc}\"\n",
242 | " path,spl,fname = parsed.path.rpartition('/')\n",
243 | " fname = spl+fname\n",
244 | " if not path and not fname: path='/'\n",
245 | " return base,path,fname"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "id": "1a92b74e",
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "data": {
256 | "text/plain": [
257 | "[('https://claudette.answer.ai', '', '/path'),\n",
258 | " ('https://claudette.answer.ai', '/', ''),\n",
259 | " ('https://llmstxt.org', '/', ''),\n",
260 | " ('https://llmstxt.org', '/', '')]"
261 | ]
262 | },
263 | "execution_count": null,
264 | "metadata": {},
265 | "output_type": "execute_result"
266 | }
267 | ],
268 | "source": [
269 | "urls = ('https://claudette.answer.ai/path/', 'https://claudette.answer.ai/', 'https://llmstxt.org', 'https://llmstxt.org/')\n",
270 | "\n",
271 | "[split_url(o) for o in urls]"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "id": "5337c0a2",
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "#| export\n",
282 | "def _tryget(url):\n",
283 |     "    \"Return `url` if its response `status_code!=404`, otherwise `None`\"\n",
284 | " res = get(url)\n",
285 | " return None if res.status_code==404 else url"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": null,
291 | "id": "189f5b24",
292 | "metadata": {},
293 | "outputs": [],
294 | "source": [
295 | "#| export\n",
296 | "def find_docs(url):\n",
297 | " \"If available, return LLM-friendly llms.txt context or markdown file location from `url`\"\n",
298 | " base,path,fname = split_url(url)\n",
299 | " url = (base+path+fname).strip('/')\n",
300 | " if fname=='/llms.txt': return url\n",
301 | " if Path(fname).suffix in('.md', '.txt', '.rst'): return _tryget(url)\n",
302 | " if '.' in fname: return _tryget(url+'.md') or find_docs(url[:url.rfind('/')])\n",
303 | " res = _tryget(url+'/llms.txt')\n",
304 | " if res: return res\n",
305 | " res = _tryget(url+'/index.md')\n",
306 | " if res: return res\n",
307 | " res = _tryget(url+'/index.html.md')\n",
308 | " if res: return res\n",
309 | " res = _tryget(url+'/index-commonmark.md')\n",
310 | " if res: return res\n",
311 | " parsed_url = urlparse(url)\n",
312 | " if parsed_url.path == '/' or not parsed_url.path: return None\n",
313 | " return find_docs(urljoin(url, '..'))"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "id": "5d1722d9",
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "fl_url = 'https://answerdotai.github.io/fastlite'"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": null,
329 | "id": "0b226407",
330 | "metadata": {},
331 | "outputs": [
332 | {
333 | "data": {
334 | "text/plain": [
335 | "'https://answerdotai.github.io/fastlite/llms.txt'"
336 | ]
337 | },
338 | "execution_count": null,
339 | "metadata": {},
340 | "output_type": "execute_result"
341 | }
342 | ],
343 | "source": [
344 | "find_docs(fl_url)"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "id": "14344890",
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "name": "stdout",
355 | "output_type": "stream",
356 | "text": [
357 | "https://claudette.answer.ai/llms.txt\n",
358 | "https://claudette.answer.ai/llms.txt\n",
359 | "https://llmstxt.org/llms.txt\n",
360 | "https://llmstxt.org/llms.txt\n"
361 | ]
362 | }
363 | ],
364 | "source": [
365 | "for o in urls: print(find_docs(o))"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "id": "439546d4",
372 | "metadata": {},
373 | "outputs": [],
374 | "source": [
375 | "#| eval: false\n",
376 | "suffixes = [\"/\", \"/tmp\", \"/tmp/tmp/\"]\n",
377 | "for suff in suffixes:\n",
378 | " for o in urls: test_eq(find_docs(o), find_docs(o+suff))\n",
379 | "\n",
380 | "test_eq(find_docs(\"https://github.com\"), \"https://github.com/llms.txt\")\n",
381 | "test_eq(find_docs(\"https://github.com/AnswerDotAI\"), \"https://github.com/llms.txt\")\n",
382 | "test_eq(find_docs(\"https://github.com/AnswerDotAI/\"), \"https://github.com/llms.txt\")"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "id": "771d1208",
389 | "metadata": {},
390 | "outputs": [],
391 | "source": [
392 | "#| export\n",
393 | "def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):\n",
394 | " \"If available, return LLM-friendly llms.txt context or markdown file response for `url`\"\n",
395 | " url = find_docs(url)\n",
396 | " if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)\n",
397 | " else: res = get(url).text\n",
398 | " return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)"
399 | ]
400 | },
401 | {
402 | "cell_type": "markdown",
403 | "id": "94ec4289",
404 | "metadata": {},
405 | "source": [
406 | "## Export -"
407 | ]
408 | },
409 | {
410 | "cell_type": "code",
411 | "execution_count": null,
412 | "id": "1e9ee5c1",
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "#|hide\n",
417 | "#|eval: false\n",
418 | "from nbdev.doclinks import nbdev_export\n",
419 | "nbdev_export()"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": null,
425 | "id": "0c01784b",
426 | "metadata": {},
427 | "outputs": [],
428 | "source": []
429 | }
430 | ],
431 | "metadata": {
432 | "kernelspec": {
433 | "display_name": "python3",
434 | "language": "python",
435 | "name": "python3"
436 | }
437 | },
438 | "nbformat": 4,
439 | "nbformat_minor": 5
440 | }
441 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Release notes
2 |
3 |
4 |
5 | ## 0.2.1
6 |
7 | ### New Features
8 |
9 | - Optionally don't raise error on `call_func` ([#31](https://github.com/AnswerDotAI/toolslm/pull/31)), thanks to [@erikgaas](https://github.com/erikgaas)
10 | - dict support in `get_schema` ([#30](https://github.com/AnswerDotAI/toolslm/issues/30))
11 |
12 |
13 | ## 0.2.0
14 |
15 | ### Breaking changes
16 |
17 | - Optional libs (http2text, beautifulsoup, llms_txt) are no longer automatically installed
18 |
19 | ### New Features
20 |
21 | - Lazily load optional modules ([#29](https://github.com/AnswerDotAI/toolslm/issues/29))
22 |
23 |
24 | ## 0.1.3
25 |
26 | ### New Features
27 |
28 | - Pass glb,loc to python ([#28](https://github.com/AnswerDotAI/toolslm/issues/28))
29 |
30 | ## 0.1.2
31 |
32 | ### New Features
33 |
34 | - Adds `call_func_async` ([#27](https://github.com/AnswerDotAI/toolslm/pull/27)), thanks to [@mikonapoli](https://github.com/mikonapoli)
35 | - Add arg ignore links ([#26](https://github.com/AnswerDotAI/toolslm/pull/26)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath)
36 |
37 |
38 | ## 0.1.1
39 |
40 | ### New Features
41 |
42 | - Add arg ignore links ([#26](https://github.com/AnswerDotAI/toolslm/pull/26)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath)
43 |
44 | ### Bugs Squashed
45 |
46 | - fix: prevent markdown heading detection inside code blocks ([#25](https://github.com/AnswerDotAI/toolslm/pull/25)), thanks to [@franckalbinet](https://github.com/franckalbinet)
47 | - Fix markdown hierarchy parsing for arbitrary header levels ([#22](https://github.com/AnswerDotAI/toolslm/pull/22)), thanks to [@erikgaas](https://github.com/erikgaas)
48 |
49 |
50 | ## 0.1.0
51 |
52 | ### Breaking changes
53 |
54 | - Replace `source` with `src` in context generation ([#17](https://github.com/AnswerDotAI/toolslm/issues/17))
55 |
56 |
57 | ## 0.0.8
58 |
59 | ### New Features
60 |
61 | - Escape and print context in `folder2ctx` et al ([#16](https://github.com/AnswerDotAI/toolslm/issues/16))
62 |
63 |
64 | ## 0.0.7
65 |
66 | ### New Features
67 |
68 | - Add `dict2obj` to `md_hier` funcs ([#15](https://github.com/AnswerDotAI/toolslm/issues/15))
69 | - Migrate call_func from claudette to toolslm ([#14](https://github.com/AnswerDotAI/toolslm/pull/14)), thanks to [@ncoop57](https://github.com/ncoop57)
70 | - Allow for getting schemas from nested structures ([#11](https://github.com/AnswerDotAI/toolslm/pull/11)), thanks to [@ncoop57](https://github.com/ncoop57)
71 | - Allow for `sel` to select and wrap multiple element results ([#10](https://github.com/AnswerDotAI/toolslm/pull/10)), thanks to [@Isaac-Flath](https://github.com/Isaac-Flath)
72 |
73 | ### Bugs Squashed
74 |
75 | - Using `get_schema` on class method results in type missing error ([#12](https://github.com/AnswerDotAI/toolslm/issues/12))
76 |
77 |
78 | ## 0.0.6
79 |
80 | ### New Features
81 |
82 | - Add `read_docs` and `find_docs` ([#8](https://github.com/AnswerDotAI/toolslm/issues/8))
83 |
84 |
85 | ## 0.0.5
86 |
87 | ### Bugs Squashed
88 |
89 | - XML tools assume all files have content ([#3](https://github.com/AnswerDotAI/toolslm/issues/3))
90 |
91 |
92 | ## 0.0.4
93 |
94 | - Minor updates
95 |
96 | ## 0.0.2
97 |
98 | - Rename project
99 |
100 |
101 | ## 0.0.1
102 |
103 | - Initial alpha release
104 |
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include settings.ini
2 | include LICENSE
3 | include CONTRIBUTING.md
4 | include README.md
5 | recursive-exclude * __pycache__
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # toolslm
2 |
3 |
4 |
5 |
6 | This is a work in progress…
7 |
8 | ## Install
9 |
10 | ``` sh
11 | pip install toolslm
12 | ```
13 |
14 | ## How to use
15 |
16 | ### Context creation
17 |
18 | toolslm has some helpers to make it easier to generate XML context from
19 | files, for instance
20 | [`folder2ctx`](https://AnswerDotAI.github.io/toolslm/xml.html#folder2ctx):
21 |
22 | ``` python
23 | print(folder2ctx('samples', prefix=False, file_glob='*.py'))
24 | ```
25 |
26 |
27 | samples/sample_core.py
28 |
29 | import inspect
30 | empty = inspect.Parameter.empty
31 | models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
32 |
33 |
34 | JSON doesn’t map as nicely to XML as the `ft` data structure from
35 | `fastcore.xml`, but for simple XML trees it can be convenient. The
36 | [`json_to_xml`](https://AnswerDotAI.github.io/toolslm/xml.html#json_to_xml)
37 | function handles that conversion:
38 |
39 | ``` python
40 | a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
41 | address=dict(state='Queensland',country='Australia'))
42 | print(json_to_xml(a, 'person'))
43 | ```
44 |
45 |
46 | Howard
47 |
48 | - Jeremy
49 | - Peter
50 |
51 |
52 | Queensland
53 | Australia
54 |
55 |
56 |
--------------------------------------------------------------------------------
/_quarto.yml:
--------------------------------------------------------------------------------
1 | project:
2 | type: website
3 |
4 | format:
5 | html:
6 | theme: cosmo
7 | css: styles.css
8 | toc: true
9 | keep-md: true
10 | commonmark: default
11 |
12 | website:
13 | twitter-card: true
14 | open-graph: true
15 | repo-actions: [issue]
16 | navbar:
17 | background: primary
18 | search: true
19 | sidebar:
20 | style: floating
21 |
22 | metadata-files: [nbdev.yml, sidebar.yml]
23 |
--------------------------------------------------------------------------------
/index.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "56e2fbc1",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "#| hide\n",
11 | "from toolslm.xml import *"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "9c85d17d",
17 | "metadata": {},
18 | "source": [
19 | "# toolslm\n",
20 | "\n",
21 | "> Tools to make language models a bit easier to use"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "947109d0",
27 | "metadata": {},
28 | "source": [
29 | "This is a work in progress..."
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "id": "431900fc",
35 | "metadata": {},
36 | "source": [
37 | "## Install"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "id": "6cf13202",
43 | "metadata": {},
44 | "source": [
45 | "```sh\n",
46 | "pip install toolslm\n",
47 | "```"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "id": "36346546",
53 | "metadata": {},
54 | "source": [
55 | "## How to use"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "id": "2a8a7a9a",
61 | "metadata": {},
62 | "source": [
63 | "### Context creation"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "id": "3778e8ed",
69 | "metadata": {},
70 | "source": [
71 | "toolslm has some helpers to make it easier to generate XML context from files, for instance `folder2ctx`:"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "id": "efd52392",
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "\n",
85 | "samples/sample_core.py\n",
86 | "\n",
87 | "import inspect\n",
88 | "empty = inspect.Parameter.empty\n",
89 | "models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "print(folder2ctx('samples', prefix=False, file_glob='*.py'))"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "id": "58206da8",
101 | "metadata": {},
102 | "source": [
103 | "JSON doesn't map as nicely to XML as the `ft` data structure from `fastcore.xml`, but for simple XML trees it can be convenient. The `json_to_xml` function handles that conversion:"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "id": "9bcb985e",
110 | "metadata": {},
111 | "outputs": [
112 | {
113 | "name": "stdout",
114 | "output_type": "stream",
115 | "text": [
116 | "\n",
117 | " Howard\n",
118 | " \n",
119 | " - Jeremy
\n",
120 | " - Peter
\n",
121 | " \n",
122 | " \n",
123 | " Queensland\n",
124 | " Australia\n",
125 | " \n",
126 | "\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "a = dict(surname='Howard', firstnames=['Jeremy','Peter'],\n",
132 | " address=dict(state='Queensland',country='Australia'))\n",
133 | "print(json_to_xml(a, 'person'))"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "id": "7a3b2c28",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": []
143 | }
144 | ],
145 | "metadata": {
146 | "kernelspec": {
147 | "display_name": "python3",
148 | "language": "python",
149 | "name": "python3"
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 5
154 | }
155 |
--------------------------------------------------------------------------------
/nbdev.yml:
--------------------------------------------------------------------------------
1 | project:
2 | output-dir: _docs
3 |
4 | website:
5 | title: "toolslm"
6 | site-url: "https://AnswerDotAI.github.io/toolslm"
7 | description: "Tools to make language models a bit easier to use"
8 | repo-branch: main
9 | repo-url: "https://github.com/AnswerDotAI/toolslm"
10 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=64.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name="toolslm"
7 | requires-python=">=3.9"
8 | dynamic = [ "keywords", "description", "version", "dependencies", "optional-dependencies", "readme", "license", "authors", "classifiers", "entry-points", "scripts", "urls"]
9 |
10 | [tool.uv]
11 | cache-keys = [{ file = "pyproject.toml" }, { file = "settings.ini" }, { file = "setup.py" }]
12 |
--------------------------------------------------------------------------------
/samples/sample_core.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | empty = inspect.Parameter.empty
3 | models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
4 |
--------------------------------------------------------------------------------
/samples/sample_styles.css:
--------------------------------------------------------------------------------
1 | .cell { margin-bottom: 1rem; }
2 | .cell > .sourceCode { margin-bottom: 0; }
3 | .cell-output > pre { margin-bottom: 0; }
4 |
5 |
--------------------------------------------------------------------------------
/settings.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | repo = toolslm
3 | lib_name = toolslm
4 | version = 0.2.2
5 | min_python = 3.9
6 | license = apache2
7 | black_formatting = False
8 | requirements = fastcore>=1.5.47 httpx
9 | doc_path = _docs
10 | lib_path = toolslm
11 | nbs_path = .
12 | recursive = True
13 | tst_flags = notest
14 | put_version_in_init = True
15 | branch = main
16 | custom_sidebar = False
17 | doc_host = https://AnswerDotAI.github.io
18 | doc_baseurl = /toolslm
19 | git_url = https://github.com/AnswerDotAI/toolslm
20 | title = toolslm
21 | audience = Developers
22 | author = Jeremy Howard
23 | author_email = j@fast.ai
24 | copyright = 2024 onwards, Jeremy Howard
25 | description = Tools to make language models a bit easier to use
26 | keywords = nbdev jupyter notebook python
27 | language = English
28 | status = 3
29 | user = AnswerDotAI
30 | readme_nb = index.ipynb
31 | allowed_metadata_keys =
32 | allowed_cell_metadata_keys =
33 | jupyter_hooks = True
34 | clean_ids = True
35 | clear_all = False
36 | conda_user = fastai
37 | console_scripts = folder2ctx=toolslm.xml:folder2ctx_cli
38 | cell_number = True
39 | skip_procs =
40 | update_pyproject = True
41 |
42 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from pkg_resources import parse_version
2 | from configparser import ConfigParser
3 | import setuptools, shlex
4 | assert parse_version(setuptools.__version__)>=parse_version('36.2')
5 |
6 | # note: all settings are in settings.ini; edit there, not here
7 | config = ConfigParser(delimiters=['='])
8 | config.read('settings.ini', encoding='utf-8')
9 | cfg = config['DEFAULT']
10 |
11 | cfg_keys = 'version description keywords author author_email'.split()
12 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
13 | for o in expected: assert o in cfg, "missing expected setting: {}".format(o)
14 | setup_cfg = {o:cfg[o] for o in cfg_keys}
15 |
16 | licenses = {
17 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
18 | 'mit': ('MIT License', 'OSI Approved :: MIT License'),
19 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
20 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
21 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
22 | }
23 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
24 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
25 | py_versions = '3.6 3.7 3.8 3.9 3.10'.split()
26 |
27 | requirements = shlex.split(cfg.get('requirements', ''))
28 | if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', ''))
29 | min_python = cfg['min_python']
30 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
31 | dev_requirements = (cfg.get('dev_requirements') or '').split()
32 |
33 | setuptools.setup(
34 | name = cfg['lib_name'],
35 | license = lic[0],
36 | classifiers = [
37 | 'Development Status :: ' + statuses[int(cfg['status'])],
38 | 'Intended Audience :: ' + cfg['audience'].title(),
39 | 'Natural Language :: ' + cfg['language'].title(),
40 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []),
41 | url = cfg['git_url'],
42 | packages = setuptools.find_packages(),
43 | include_package_data = True,
44 | install_requires = requirements,
45 | extras_require={ 'dev': dev_requirements },
46 | dependency_links = cfg.get('dep_links','').split(),
47 | python_requires = '>=' + cfg['min_python'],
48 | long_description = open('README.md', encoding='utf-8').read(),
49 | long_description_content_type = 'text/markdown',
50 | zip_safe = False,
51 | entry_points = {
52 | 'console_scripts': cfg.get('console_scripts','').split(),
53 | 'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d']
54 | },
55 | **setup_cfg)
56 |
57 |
58 |
--------------------------------------------------------------------------------
/styles.css:
--------------------------------------------------------------------------------
/* Space between rendered notebook cells. */
.cell {
  margin-bottom: 1rem;
}

/* Keep a cell's source visually attached to its output. */
.cell > .sourceCode {
  margin-bottom: 0;
}

.cell-output > pre {
  margin-bottom: 0;
}

/* Cell output: indented, no background, with a salmon accent bar on the left. */
.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre {
  margin-left: 0.8rem;
  margin-top: 0;
  background: none;
  border-left: 2px solid lightsalmon;
  border-top-left-radius: 0;
  border-top-right-radius: 0;
}

.cell-output > .sourceCode {
  border: none;
}

.cell-output > .sourceCode {
  background: none;
  margin-top: 0;
}

/* Page description block under the title. */
div.description {
  padding-left: 2px;
  padding-top: 5px;
  font-style: italic;
  font-size: 135%;
  opacity: 70%;
}
38 |
--------------------------------------------------------------------------------
/toolslm/__init__.py:
--------------------------------------------------------------------------------
__version__ = "0.2.2"  # Package version string
2 |
--------------------------------------------------------------------------------
/toolslm/_modidx.py:
--------------------------------------------------------------------------------
# Autogenerated by nbdev

# Module index consumed by nbdev tooling: `settings` holds doc-site config,
# and `syms` maps each symbol to its (doc-page anchor, source file) pair.
# Do not edit by hand; regenerate with nbdev.
d = { 'settings': { 'branch': 'main',
                'doc_baseurl': '/toolslm',
                'doc_host': 'https://AnswerDotAI.github.io',
                'git_url': 'https://github.com/AnswerDotAI/toolslm',
                'lib_path': 'toolslm'},
  'syms': { 'toolslm.download': { 'toolslm.download._tryget': ('download.html#_tryget', 'toolslm/download.py'),
                                  'toolslm.download.clean_md': ('download.html#clean_md', 'toolslm/download.py'),
                                  'toolslm.download.find_docs': ('download.html#find_docs', 'toolslm/download.py'),
                                  'toolslm.download.get_llmstxt': ('download.html#get_llmstxt', 'toolslm/download.py'),
                                  'toolslm.download.html2md': ('download.html#html2md', 'toolslm/download.py'),
                                  'toolslm.download.read_docs': ('download.html#read_docs', 'toolslm/download.py'),
                                  'toolslm.download.read_html': ('download.html#read_html', 'toolslm/download.py'),
                                  'toolslm.download.read_md': ('download.html#read_md', 'toolslm/download.py'),
                                  'toolslm.download.split_url': ('download.html#split_url', 'toolslm/download.py')},
            'toolslm.funccall': { 'toolslm.funccall.PathArg': ('funccall.html#patharg', 'toolslm/funccall.py'),
                                  'toolslm.funccall._copy_loc': ('funccall.html#_copy_loc', 'toolslm/funccall.py'),
                                  'toolslm.funccall._get_nested_schema': ('funccall.html#_get_nested_schema', 'toolslm/funccall.py'),
                                  'toolslm.funccall._handle_container': ('funccall.html#_handle_container', 'toolslm/funccall.py'),
                                  'toolslm.funccall._handle_type': ('funccall.html#_handle_type', 'toolslm/funccall.py'),
                                  'toolslm.funccall._is_container': ('funccall.html#_is_container', 'toolslm/funccall.py'),
                                  'toolslm.funccall._is_parameterized': ('funccall.html#_is_parameterized', 'toolslm/funccall.py'),
                                  'toolslm.funccall._param': ('funccall.html#_param', 'toolslm/funccall.py'),
                                  'toolslm.funccall._process_property': ('funccall.html#_process_property', 'toolslm/funccall.py'),
                                  'toolslm.funccall._run': ('funccall.html#_run', 'toolslm/funccall.py'),
                                  'toolslm.funccall._types': ('funccall.html#_types', 'toolslm/funccall.py'),
                                  'toolslm.funccall.call_func': ('funccall.html#call_func', 'toolslm/funccall.py'),
                                  'toolslm.funccall.call_func_async': ('funccall.html#call_func_async', 'toolslm/funccall.py'),
                                  'toolslm.funccall.get_schema': ('funccall.html#get_schema', 'toolslm/funccall.py'),
                                  'toolslm.funccall.mk_ns': ('funccall.html#mk_ns', 'toolslm/funccall.py'),
                                  'toolslm.funccall.python': ('funccall.html#python', 'toolslm/funccall.py')},
            'toolslm.md_hier': {},
            'toolslm.shell': { 'toolslm.shell.TerminalInteractiveShell.run_cell': ( 'shell.html#terminalinteractiveshell.run_cell',
                                                                                    'toolslm/shell.py'),
                               'toolslm.shell.get_shell': ('shell.html#get_shell', 'toolslm/shell.py')},
            'toolslm.xml': { 'toolslm.xml._add_nls': ('xml.html#_add_nls', 'toolslm/xml.py'),
                             'toolslm.xml.docs_xml': ('xml.html#docs_xml', 'toolslm/xml.py'),
                             'toolslm.xml.files2ctx': ('xml.html#files2ctx', 'toolslm/xml.py'),
                             'toolslm.xml.folder2ctx': ('xml.html#folder2ctx', 'toolslm/xml.py'),
                             'toolslm.xml.folder2ctx_cli': ('xml.html#folder2ctx_cli', 'toolslm/xml.py'),
                             'toolslm.xml.json_to_xml': ('xml.html#json_to_xml', 'toolslm/xml.py'),
                             'toolslm.xml.mk_doc': ('xml.html#mk_doc', 'toolslm/xml.py'),
                             'toolslm.xml.mk_doctype': ('xml.html#mk_doctype', 'toolslm/xml.py')}}}
45 |
--------------------------------------------------------------------------------
/toolslm/download.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../03_download.ipynb.
2 |
3 | # %% auto 0
4 | __all__ = ['clean_md', 'read_md', 'html2md', 'read_html', 'get_llmstxt', 'split_url', 'find_docs', 'read_docs']
5 |
6 | # %% ../03_download.ipynb 2
7 | from fastcore.utils import *
8 | from httpx import get
9 | from fastcore.meta import delegates
10 | from urllib.parse import urlparse, urljoin
11 |
12 | # %% ../03_download.ipynb 4
def clean_md(text, rm_comments=True, rm_details=True):
    "Remove HTML comments and `<details>` sections from `text`"
    # The original patterns lost their angle-bracket content, leaving regexes
    # that match the empty string. Restore removal of `<!-- ... -->` comments
    # and collapsible `<details>...</details>` blocks (non-greedy, across lines).
    if rm_comments: text = re.sub(r'\n?<!--.*?-->\n?', '', text, flags=re.DOTALL)
    if rm_details: text = re.sub(r'\n?<details>.*?</details>\n?', '', text, flags=re.DOTALL)
    return text
18 |
19 | # %% ../03_download.ipynb 5
@delegates(get)
def read_md(url, rm_comments=True, rm_details=True, **kwargs):
    "Fetch text from `url` and return it cleaned via `clean_md`"
    resp = get(url, **kwargs)
    return clean_md(resp.text, rm_comments=rm_comments, rm_details=rm_details)
24 |
25 | # %% ../03_download.ipynb 7
def html2md(s:str, ignore_links=True):
    "Convert `s` from HTML to markdown"
    import html2text
    conv = html2text.HTML2Text(bodywidth=5000)
    conv.ignore_images = True
    conv.mark_code = True
    conv.ignore_links = ignore_links
    return conv.handle(s)
34 |
35 | # %% ../03_download.ipynb 8
def read_html(url, # URL to read
              sel=None, # Read only outerHTML of CSS selector `sel`
              rm_comments=True, # Removes HTML comments
              rm_details=True, # Removes `<details>` tags
              multi=False, # Get all matches to `sel` or first one
              wrap_tag=None, # If multi, each selection is wrapped with `<wrap_tag>` content
              ignore_links=True,
             ): # Cleaned markdown
    "Get `url`, optionally selecting CSS selector `sel`, and convert to clean markdown"
    page = get(url).text
    if sel:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(page, 'html.parser')
        if multi:
            page = [str(el) for el in soup.select(sel)]
            if not wrap_tag: page = "\n".join(page)
        else: page = str(soup.select_one(sel))
    mds = map(lambda x: clean_md(html2md(x, ignore_links=ignore_links), rm_comments, rm_details=rm_details), tuplify(page))
    # Fixed: closing tag previously lacked `</`, emitting `{o}{wrap_tag}>` —
    # now each selection is wrapped in a well-formed `<wrap_tag>...</wrap_tag>` pair.
    if wrap_tag: return '\n'.join([f"\n<{wrap_tag}>\n{o}</{wrap_tag}>\n" for o in mds])
    else: return '\n'.join(mds)
56 |
57 | # %% ../03_download.ipynb 13
def get_llmstxt(url, optional=False, n_workers=None):
    "Get llms.txt file from `url` and expand it with `llms_txt.create_ctx()`"
    # Only URLs literally ending in 'llms.txt' are handled; anything else,
    # or a non-200 response, yields None.
    if not url.endswith('llms.txt'): return None
    import llms_txt
    resp = get(url)
    if resp.status_code!=200: return None
    return llms_txt.create_ctx(resp.text, optional=optional, n_workers=n_workers)
65 |
66 | # %% ../03_download.ipynb 15
def split_url(url):
    "Split `url` into base, path, and file name, normalising path to '/' if empty"
    parts = urlparse(url.strip('/'))
    base = f"{parts.scheme}://{parts.netloc}"
    path, sep, fname = parts.path.rpartition('/')
    # Keep the leading '/' attached to the file-name component.
    fname = sep + fname
    if not (path or fname): path = '/'
    return base, path, fname
75 |
76 | # %% ../03_download.ipynb 17
def _tryget(url):
    "Return `url` if fetching it yields a non-404 status, otherwise `None`"
    # NOTE: returns the *url*, not the response object — callers only need
    # to know whether the location exists.
    res = get(url)
    return None if res.status_code==404 else url
81 |
82 | # %% ../03_download.ipynb 18
def find_docs(url):
    "If available, return LLM-friendly llms.txt context or markdown file location from `url`"
    base, path, fname = split_url(url)
    url = (base + path + fname).strip('/')
    if fname == '/llms.txt': return url
    if Path(fname).suffix in ('.md', '.txt', '.rst'): return _tryget(url)
    # A file with some other extension: try its `.md` twin, else its parent dir.
    if '.' in fname: return _tryget(url + '.md') or find_docs(url[:url.rfind('/')])
    # Probe the conventional doc locations under this directory; first hit wins.
    for cand in ('/llms.txt', '/index.md', '/index.html.md', '/index-commonmark.md'):
        found = _tryget(url + cand)
        if found: return found
    # Nothing here and we're already at the site root: give up.
    parsed = urlparse(url)
    if parsed.path in ('', '/'): return None
    return find_docs(urljoin(url, '..'))
101 |
102 | # %% ../03_download.ipynb 23
def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
    "If available, return LLM-friendly llms.txt context or markdown file response for `url`; `None` if none found"
    url = find_docs(url)
    # `find_docs` returns None when no doc location is discoverable; previously
    # this crashed with AttributeError on `None.endswith`.
    if url is None: return None
    if url.endswith('/llms.txt'): res = get_llmstxt(url, optional=optional, n_workers=n_workers)
    else: res = get(url).text
    return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)
109 |
--------------------------------------------------------------------------------
/toolslm/funccall.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../01_funccall.ipynb.
2 |
3 | # %% auto 0
4 | __all__ = ['empty', 'custom_types', 'get_schema', 'PathArg', 'python', 'mk_ns', 'call_func', 'call_func_async']
5 |
6 | # %% ../01_funccall.ipynb 2
7 | import inspect
8 | from collections import abc
9 | from fastcore.utils import *
10 | from fastcore.docments import docments
11 | from typing import get_origin, get_args, Dict, List, Optional, Tuple, Union
12 | from types import UnionType
13 |
14 | # %% ../01_funccall.ipynb 4
15 | empty = inspect.Parameter.empty
16 |
17 | # %% ../01_funccall.ipynb 12
def _types(t:type)->tuple[str,Optional[str]]:
    "Tuple of json schema type name and (if appropriate) array item name."
    if t is empty: raise TypeError('Missing type')
    # Map both the Python types and their string names to JSON schema types.
    base = {int:"integer", float:"number", str:"string", bool:"boolean", list:"array", dict:"object"}
    tmap = base | {k.__name__: v for k, v in base.items()}
    if getattr(t, '__origin__', None) in (list, tuple):
        args = getattr(t, '__args__', None)
        item = tmap.get(args[0].__name__, "object") if args else "object"
        return "array", item
    # A string like 'int' is looked up directly.
    if isinstance(t, str): return tmap.get(t, "object"), None
    # A parameterized container: classify by its origin type.
    origin = get_origin(t)
    if origin: return tmap.get(origin.__name__, "object"), None
    # A plain type such as `int`: look up by name.
    return tmap.get(t.__name__, "object"), None
33 |
34 | # %% ../01_funccall.ipynb 19
def _param(name, info):
    "json schema parameter given `name` and `info` from docments full dict."
    ptype, itype = _types(info.anno)
    schema = {'type': ptype, 'description': info.docment or ""}
    if itype: schema['items'] = {'type': itype}
    # A missing default leaves the parameter without a "default" key.
    if info.default is not empty: schema['default'] = info.default
    return schema
42 |
43 | # %% ../01_funccall.ipynb 22
# Types rendered as strings tagged with a `format` marker instead of nested schemas.
custom_types = {Path}

def _handle_type(t, defs):
    "Handle a single type, creating nested schemas if necessary"
    if t is NoneType: return {'type': 'null'}
    if t in custom_types: return {'type':'string', 'format':t.__name__}
    # NB operator precedence: (non-primitive class) OR (any function) gets a
    # nested schema stored in `defs` and referenced via `$ref`.
    if isinstance(t, type) and not issubclass(t, (int, float, str, bool)) or inspect.isfunction(t):
        defs[t.__name__] = _get_nested_schema(t)
        return {'$ref': f'#/$defs/{t.__name__}'}
    return {'type': _types(t)[0]}
54 |
55 | # %% ../01_funccall.ipynb 24
56 | def _is_container(t):
57 | "Check if type is a container (list, dict, tuple, set, Union)"
58 | origin = get_origin(t)
59 | return origin in (list, dict, tuple, set, Union) if origin else False
60 |
61 | def _is_parameterized(t):
62 | "Check if type has arguments (e.g. list[int] vs list, dict[str, int] vs dict)"
63 | return _is_container(t) and (get_args(t) != ())
64 |
65 | # %% ../01_funccall.ipynb 30
def _handle_container(origin, args, defs):
    "Handle container types like dict, list, tuple, set, and Union"
    if origin is Union or origin is UnionType:
        return {"anyOf": [_handle_type(a, defs) for a in args]}
    if origin is dict:
        val = args[1]
        if hasattr(val, '__origin__'):
            # Parameterized value type (e.g. dict[str, list[int]]): array values.
            inner = val.__args__[0] if hasattr(val, '__args__') else val
            extra = {'type': 'array', 'items': _handle_type(inner, defs)}
        else:
            extra = _handle_type(val, defs)
        return {'type': 'object', 'additionalProperties': extra}
    if origin in (list, tuple, set):
        out = {'type': 'array', 'items': _handle_type(args[0], defs)}
        if origin is set: out['uniqueItems'] = True
        return out
    return None
85 |
86 | # %% ../01_funccall.ipynb 31
def _process_property(name, obj, props, req, defs):
    "Process a single property of the schema"
    prop = _param(name, obj)
    props[name] = prop
    # No default value means the parameter is required.
    if obj.default is empty: req[name] = True
    anno = obj.anno
    if _is_container(anno) and _is_parameterized(anno):
        prop.update(_handle_container(get_origin(anno), get_args(anno), defs))
    else:
        # Non-container type, or a bare container like `list` with no args.
        prop.update(_handle_type(anno, defs))
98 |
99 | # %% ../01_funccall.ipynb 32
def _get_nested_schema(obj):
    "Generate nested JSON schema for a class or function"
    props, req, defs = {}, {}, {}
    for nm, info in docments(obj, full=True).items():
        if nm in ('return', 'self'): continue
        _process_property(nm, info, props, req, defs)
    # Classes get a title; plain functions do not.
    title = obj.__name__ if isinstance(obj, type) else None
    schema = {'type': 'object', 'properties': props, 'title': title}
    if req: schema['required'] = list(req)
    if defs: schema['$defs'] = defs
    return schema
113 |
114 | # %% ../01_funccall.ipynb 36
def get_schema(f:Union[callable,dict], pname='input_schema')->dict:
    "Generate JSON schema for a class, function, or method"
    # An already-built schema dict passes through untouched.
    if isinstance(f, dict): return f
    schema = _get_nested_schema(f)
    desc = f.__doc__
    assert desc, "Docstring missing!"
    ret = docments(f, full=True).pop('return')
    # Append return-type info to the description when annotated.
    if ret.anno is not empty: desc += f'\n\nReturns:\n- type: {_types(ret.anno)[0]}'
    return {"name": f.__name__, "description": desc, pname: schema}
125 |
126 | # %% ../01_funccall.ipynb 47
def PathArg(
    path: str # A filesystem path
):
    # Thin converter used by schema machinery: str in, `Path` out.
    return Path(path)
130 |
131 | # %% ../01_funccall.ipynb 67
132 | import ast, time, signal, traceback
133 | from fastcore.utils import *
134 |
135 | # %% ../01_funccall.ipynb 68
136 | def _copy_loc(new, orig):
137 | "Copy location information from original node to new node and all children."
138 | new = ast.copy_location(new, orig)
139 | for field, o in ast.iter_fields(new):
140 | if isinstance(o, ast.AST): setattr(new, field, _copy_loc(o, orig))
141 | elif isinstance(o, list): setattr(new, field, [_copy_loc(value, orig) for value in o])
142 | return new
143 |
144 | # %% ../01_funccall.ipynb 70
145 | def _run(code:str, glb:dict=None, loc:dict=None):
146 | "Run `code`, returning final expression (similar to IPython)"
147 | tree = ast.parse(code)
148 | last_node = tree.body[-1] if tree.body else None
149 |
150 | # If the last node is an expression, modify the AST to capture the result
151 | if isinstance(last_node, ast.Expr):
152 | tgt = [ast.Name(id='_result', ctx=ast.Store())]
153 | assign_node = ast.Assign(targets=tgt, value=last_node.value)
154 | tree.body[-1] = _copy_loc(assign_node, last_node)
155 |
156 | compiled_code = compile(tree, filename='', mode='exec')
157 | glb = glb or {}
158 | stdout_buffer = io.StringIO()
159 | saved_stdout = sys.stdout
160 | sys.stdout = stdout_buffer
161 | try: exec(compiled_code, glb, loc)
162 | finally: sys.stdout = saved_stdout
163 | _result = glb.get('_result', None)
164 | if _result is not None: return _result
165 | return stdout_buffer.getvalue().strip()
166 |
167 | # %% ../01_funccall.ipynb 75
def python(code:str, # Code to execute
           glb:Optional[dict]=None, # Globals namespace
           loc:Optional[dict]=None, # Locals namespace
           timeout:int=3600 # Maximum run time in seconds before a `TimeoutError` is raised
           ): # Result of last node, if it's an expression, or `None` otherwise
    """Executes python `code` with `timeout` and returning final expression (similar to IPython).
    Raised exceptions are returned as a string, with a stack trace."""
    # SIGALRM-based timeout (Unix-only); the handler's TimeoutError is caught
    # below like any other exception and returned as a traceback string.
    def handler(*args): raise TimeoutError()
    # Default to the *caller's* globals so executed code sees their namespace.
    if glb is None: glb = inspect.currentframe().f_back.f_globals
    if loc is None: loc=glb
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(timeout)
    try: return _run(code, glb, loc)
    except Exception as e: return traceback.format_exc()
    finally: signal.alarm(0)
183 |
184 | # %% ../01_funccall.ipynb 86
def mk_ns(*funcs_or_objs):
    "Merge functions, objects and classes into one name->callable namespace dict."
    merged = {}
    for o in funcs_or_objs:
        # Classes contribute their static/class methods by name.
        if isinstance(o, type): merged |= {n:getattr(o,n) for n,m in o.__dict__.items() if isinstance(m, (staticmethod, classmethod))}
        # NOTE(review): `isinstance(o, object)` is always True, so this branch
        # runs for every argument; it gathers bound methods plus the class's
        # staticmethods. For non-instances it usually adds nothing.
        if isinstance(o, object): merged |= {n:getattr(o,n) for n, m in inspect.getmembers(o, inspect.ismethod)} | {n:m for n,m in o.__class__.__dict__.items() if isinstance(m, staticmethod)}
        # Plain callables with a `__name__` (functions, lambdas) map directly.
        if callable(o) and hasattr(o, '__name__'): merged |= {o.__name__: o}
    return merged
192 |
193 | # %% ../01_funccall.ipynb 95
def call_func(fc_name, fc_inputs, ns, raise_on_err=True):
    "Call the function `fc_name` with the given `fc_inputs` using namespace `ns`."
    if not isinstance(ns, abc.Mapping): ns = mk_ns(*ns)
    # Name lookup stays outside the try: a missing tool always raises KeyError.
    fn = ns[fc_name]
    try: return fn(**fc_inputs)
    except Exception:
        if not raise_on_err: return traceback.format_exc()
        raise
202 |
203 | # %% ../01_funccall.ipynb 106
async def call_func_async(fc_name, fc_inputs, ns, raise_on_err=True):
    "Awaits the function `fc_name` with the given `fc_inputs` using namespace `ns`."
    res = call_func(fc_name, fc_inputs, ns, raise_on_err=raise_on_err)
    # Synchronous results pass straight through; coroutines get awaited here.
    if not inspect.iscoroutine(res): return res
    try: return await res
    except Exception:
        if not raise_on_err: return traceback.format_exc()
        raise
213 |
--------------------------------------------------------------------------------
/toolslm/md_hier.py:
--------------------------------------------------------------------------------
1 | import re
2 | from fastcore.utils import *
3 | __all__ = ['markdown_to_dict', 'create_heading_dict']
4 |
def markdown_to_dict(markdown_content):
    """Parse `markdown_content` into a dict keyed by dotted heading paths.

    Each key is the cleaned heading hierarchy joined with '.', and each value
    is that heading's text span (heading line included, subheadings included).
    Headings inside fenced code blocks are ignored. Returns a fastcore
    `dict2obj` wrapper; empty for input with no headings."""
    def clean_heading(text): return re.sub(r'[^A-Za-z0-9 ]+', '', text).strip()

    lines = markdown_content.splitlines()
    headings = []
    in_code_block = False

    # Parse headings with their levels and line numbers
    for idx, line in enumerate(lines):
        # Toggle code block state when encountering a ``` fence
        if line.strip().startswith('```'): in_code_block = not in_code_block
        # Only detect headings when not in a code block
        if in_code_block: continue
        match = re.match(r'^(#{1,6})\s*(.*)', line)
        if match:
            headings.append({'level': len(match.group(1)), 'text': match.group(2).strip(), 'line': idx})

    # Fixed: input without any headings previously crashed on `headings[0]`.
    if not headings: return dict2obj({})

    # Assign content to each heading: from its own line up to the next heading
    # of the same or higher level (for-else hits when none follows).
    for i, h in enumerate(headings):
        start = h['line']
        for j in range(i + 1, len(headings)):
            if headings[j]['level'] <= h['level']:
                end = headings[j]['line']
                break
        else: end = len(lines)
        h['content'] = '\n'.join(lines[start:end]).strip()

    # Build the dictionary with hierarchical dotted keys
    result, stack = {}, []
    first_level = headings[0]['level']
    for h in headings:
        # NOTE(review): a heading shallower than the first produces a negative
        # slice index; existing callers rely on the resulting behaviour.
        stack = stack[:h['level'] - first_level] + [clean_heading(h['text'])]
        result['.'.join(stack)] = h['content']
    return dict2obj(result)
44 |
def create_heading_dict(text):
    "Build a nested dict mirroring the heading hierarchy of markdown `text`"
    # Strip fenced code blocks so '#' comments inside them aren't seen as headings.
    text = re.sub(r'```[\s\S]*?```', '', text)
    found = re.findall(r'^#+.*', text, flags=re.MULTILINE)
    root = {}
    path = [root]
    depth = 0

    for line in found:
        lvl = line.count('#')
        title = line.strip('#').strip()
        # Pop back up until the current heading is deeper than the stack top.
        while lvl <= depth:
            path.pop()
            depth -= 1
        child = {}
        path[-1][title] = child
        path.append(child)
        depth = lvl
    return dict2obj(root)
63 |
64 |
if __name__=='__main__':
    # Demo content plus inline smoke tests; runs only when executed directly.
    md_content = """
# User

This is the User section.

## Tokens

Details about tokens.

### Value

The value of tokens.

Some more details.

## Settings

User settings information.

# Admin

Admin section.

## Users

Admin users management.
"""

    result = markdown_to_dict(md_content)
    #for key, value in result.items(): print(f'Key: {key}\nValue:\n{value}\n{"-"*40}')

    # --- Tests for markdown_to_dict ---
    def test_empty_content():
        md_content = "# Empty Heading"
        result = markdown_to_dict(md_content)
        assert result['Empty Heading'] == '# Empty Heading'

    def test_special_characters():
        md_content = "# Heading *With* Special _Characters_!\nContent under heading."
        result = markdown_to_dict(md_content)
        assert 'Heading With Special Characters' in result
        assert result['Heading With Special Characters'] == '# Heading *With* Special _Characters_!\nContent under heading.'

    def test_duplicate_headings():
        md_content = "# Duplicate\n## Duplicate\n### Duplicate\nContent under duplicate headings."
        result = markdown_to_dict(md_content)
        assert 'Duplicate' in result
        assert 'Duplicate.Duplicate' in result
        assert 'Duplicate.Duplicate.Duplicate' in result
        assert result['Duplicate.Duplicate.Duplicate'] == '### Duplicate\nContent under duplicate headings.'

    def test_no_content():
        md_content = "# No Content Heading\n## Subheading"
        result = markdown_to_dict(md_content)
        assert result['No Content Heading'] == '# No Content Heading\n## Subheading'
        assert result['No Content Heading.Subheading'] == '## Subheading'

    def test_different_levels():
        md_content = "### Level 3 Heading\nContent at level 3.\n# Level 1 Heading\nContent at level 1."
        result = markdown_to_dict(md_content)
        assert 'Level 3 Heading' in result
        assert 'Level 1 Heading' in result
        assert result['Level 3 Heading'] == '### Level 3 Heading\nContent at level 3.'
        assert result['Level 1 Heading'] == '# Level 1 Heading\nContent at level 1.'

    def test_parent_includes_subheadings():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
        result = markdown_to_dict(md_content)
        assert result['Parent'] == '# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content.'
        assert result['Parent.Child'] == '## Child\nChild content.\n### Grandchild\nGrandchild content.'
        assert result['Parent.Child.Grandchild'] == '### Grandchild\nGrandchild content.'

    def test_multiple_level2_siblings():
        md_content = "##Sib 1\n##Sib 2\n##Sib 3\n##Sib 4\n##Sib 5'"
        result = markdown_to_dict(md_content)
        assert 'Sib 1' in result
        assert 'Sib 2' in result
        assert 'Sib 3' in result
        assert 'Sib 4' in result
        assert 'Sib 5' in result

    def test_code_chunks_escaped():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
        result = markdown_to_dict(md_content)
        assert 'Code comment' not in result
        assert "# Code comment" in result['Parent.Child']

    test_empty_content()
    test_special_characters()
    test_duplicate_headings()
    test_no_content()
    test_different_levels()
    test_parent_includes_subheadings()
    test_multiple_level2_siblings()
    test_code_chunks_escaped()
    print('tests passed')

    # --- Tests for create_heading_dict ---
    # NOTE(review): the def below intentionally(?) shadows the earlier
    # `test_code_chunks_escaped`, which has already been run at this point.
    def test_nested_headings():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n### Grandchild\nGrandchild content."
        result = create_heading_dict(md_content)
        assert 'Child' in result['Parent']
        assert 'Grandchild' in result['Parent']['Child']

    def test_code_chunks_escaped():
        md_content = "# Parent\nParent content.\n## Child\nChild content.\n```python\n# Code comment\nprint('Hello, world!')\n```"
        result = create_heading_dict(md_content)
        assert 'Code comment' not in result

    test_nested_headings()
    test_code_chunks_escaped()
    print('tests passed')
--------------------------------------------------------------------------------
/toolslm/shell.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../02_shell.ipynb.
2 |
3 | # %% auto 0
4 | __all__ = ['get_shell']
5 |
6 | # %% ../02_shell.ipynb 2
7 | import ast, time, signal, traceback
8 | from fastcore.utils import *
9 |
10 | # %% ../02_shell.ipynb 4
11 | from IPython.terminal.interactiveshell import TerminalInteractiveShell
12 | from IPython.utils.capture import capture_output
13 |
14 | # %% ../02_shell.ipynb 7
# Keep a reference to the original `run_cell` so the patched version below can delegate to it.
TerminalInteractiveShell.orig_run = TerminalInteractiveShell.run_cell
16 |
17 | # %% ../02_shell.ipynb 8
@patch
def run_cell(self:TerminalInteractiveShell, cell, timeout=None):
    "Wrapper for original `run_cell` which adds timeout and output capture"
    if timeout:
        # SIGALRM-based timeout (Unix-only); handler raises into the running cell.
        def handler(*args): raise TimeoutError()
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(timeout)
    try:
        with capture_output() as io: result = self.orig_run(cell)
        result.stdout = io.stdout
        return result
    # Fixed: previously caught the undefined name `TimeoutException`, which
    # itself raised NameError whenever the alarm fired.
    except TimeoutError as e:
        result = self.ExecutionResult(error_before_exec=None, error_in_exec=e)
        # Fixed: the constructed result was built but never returned.
        return result
    finally:
        if timeout: signal.alarm(0)
33 |
34 | # %% ../02_shell.ipynb 9
def get_shell()->TerminalInteractiveShell:
    "Get a `TerminalInteractiveShell` with minimal functionality"
    shell = TerminalInteractiveShell()
    # Disable logging, history, magics, and autoindent/autocall behaviours.
    shell.logger.log_output = shell.history_manager.enabled = False
    shell.logstart = shell.automagic = shell.autoindent = False
    shell.autocall = 0
    shell.system = lambda cmd: None  # no `!cmd` shell escapes
    # Silence the display hook so nothing is echoed to the terminal.
    hook = shell.displayhook
    hook.finish_displayhook = hook.write_output_prompt = hook.start_displayhook = lambda: None
    hook.write_format_data = lambda format_dict, md_dict=None: None
    return shell
46 |
--------------------------------------------------------------------------------
/toolslm/xml.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: ../00_xml.ipynb.
2 |
3 | # %% auto 0
4 | __all__ = ['doctype', 'json_to_xml', 'mk_doctype', 'mk_doc', 'docs_xml', 'files2ctx', 'folder2ctx', 'folder2ctx_cli']
5 |
6 | # %% ../00_xml.ipynb 3
7 | import hashlib,xml.etree.ElementTree as ET
8 | from collections import namedtuple
9 |
10 | from fastcore.utils import *
11 | from fastcore.meta import delegates
12 | from fastcore.xtras import hl_md
13 | from fastcore.xml import to_xml, Document, Documents, Document_content, Src
14 | from fastcore.script import call_parse
# IPython is optional: fall back to `display=None` outside notebook environments.
# Fixed: bare `except:` narrowed to `ImportError` so unrelated errors surface.
try: from IPython import display
except ImportError: display=None
17 |
18 | # %% ../00_xml.ipynb 4
def json_to_xml(d:dict, # JSON dictionary to convert
                rnm:str # Root name
               )->str:
    "Convert `d` to XML."
    root = ET.Element(rnm)
    # Recursively mirror the JSON structure: dict keys become child elements,
    # list entries become repeated <item> elements, scalars become text.
    def walk(node, parent):
        if isinstance(node, dict):
            for k, v in node.items(): walk(v, ET.SubElement(parent, k))
        elif isinstance(node, list):
            for el in node: walk(el, ET.SubElement(parent, 'item'))
        else: parent.text = str(node)
    walk(d, root)
    ET.indent(root)
    return ET.tostring(root, encoding='unicode')
33 |
34 | # %% ../00_xml.ipynb 9
35 | doctype = namedtuple('doctype', ['src', 'content'])
36 |
37 | # %% ../00_xml.ipynb 11
38 | def _add_nls(s):
39 | "Add newlines to start and end of `s` if missing"
40 | if not s: return s
41 | if s[ 0]!='\n': s = '\n'+s
42 | if s[-1]!='\n': s = s+'\n'
43 | return s
44 |
45 | # %% ../00_xml.ipynb 16
46 | def mk_doctype(content:str, # The document content
47 | src:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided
48 | ) -> namedtuple:
49 | "Create a `doctype` named tuple"
50 | if src is None: src = hashlib.md5(content.encode()).hexdigest()[:8]
51 | return doctype(_add_nls(str(src).strip()), _add_nls(content.strip()))
52 |
53 | # %% ../00_xml.ipynb 19
def mk_doc(index:int, # The document index
           content:str, # The document content
           src:Optional[str]=None, # URL, filename, etc; defaults to `md5(content)` if not provided
           **kwargs
          ) -> tuple:
    "Create an `ft` format tuple for a single doc in Anthropic's recommended format"
    dt = mk_doctype(content, src)
    # NotStr stops the XML renderer from escaping the already-prepared text.
    body = Document_content(NotStr(dt.content))
    source = Src(NotStr(dt.src))
    return Document(source, body, index=index, **kwargs)
64 |
65 | # %% ../00_xml.ipynb 22
def docs_xml(docs:list[str], # The content of each document
             srcs:Optional[list]=None, # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
             prefix:bool=True, # Include Anthropic's suggested prose intro?
             details:Optional[list]=None # Optional list of dicts with additional attrs for each doc
            )->str:
    "Create an XML string containing `docs` in Anthropic's recommended format"
    pre = 'Here are some documents for you to reference for your task:\n\n' if prefix else ''
    # Pad missing srcs/details so zip pairs every document.
    srcs = [None]*len(docs) if srcs is None else srcs
    details = [{}]*len(docs) if details is None else details
    parts = (mk_doc(i+1, d, s, **kw) for i,(d,s,kw) in enumerate(zip(docs,srcs,details)))
    return pre + to_xml(Documents(parts))
77 |
78 | # %% ../00_xml.ipynb 29
def files2ctx(
    fnames:list[Union[str,Path]], # List of file names to add to context
    prefix:bool=True # Include Anthropic's suggested prose intro?
)->str: # XML for LM context
    "Render the contents of `fnames` as XML document context for an LM"
    paths = [Path(f) for f in fnames]
    texts = [p.read_text() for p in paths]
    return docs_xml(texts, paths, prefix=prefix)
86 |
87 | # %% ../00_xml.ipynb 32
@delegates(globtastic)
def folder2ctx(
    folder:Union[str,Path], # Folder name containing files to add to context
    prefix:bool=True, # Include Anthropic's suggested prose intro?
    **kwargs # Passed to `globtastic`
)->str: # XML for Claude context
    "Collect files under `folder` (filtered via `globtastic`) into XML context"
    return files2ctx(globtastic(folder, **kwargs), prefix=prefix)
96 |
97 | # %% ../00_xml.ipynb 34
@call_parse
@delegates(folder2ctx)
def folder2ctx_cli(
    folder:str, # Folder name containing files to add to context
    **kwargs # Passed to `folder2ctx`
)->str: # XML for Claude context
    "CLI entry point: print XML context built from `folder`"
    print(folder2ctx(folder, **kwargs))
105 |
--------------------------------------------------------------------------------