├── .github ├── dependabot.yml └── workflows │ └── python-package.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── mark ├── __main__.py ├── cli.py ├── config.py ├── llm.py ├── llm_request.py ├── llm_response.py ├── markdown_file.py └── scraper.py ├── poetry.lock ├── pyproject.toml ├── templates └── default_system_prompt.md └── tests ├── conftest.py ├── test_cli.py └── test_scraper.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install 
dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install poetry flake8 pytest 31 | poetry install --with dev 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with pytest 39 | run: | 40 | export OPENAI_API_KEY=test_key && poetry run python -m pytest 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | thread.md -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
6 | 7 | ### [0.10.2] - 2025-03-10 8 | #### Changed 9 | - Bump llm to 0.25 10 | - Bump pytest from 8.3.4 to 8.3.5 11 | 12 | ### [0.10.1] - 2025-03-05 13 | #### Fixed 14 | - Fixed issue where the OPENAI_BASE_URL was no longer being set correctly in the 0.10.0 update. 15 | - Streaming is disabled for all completion requests to prevent issues 16 | 17 | ### [0.10.0] - 2025-02-18 18 | #### Added 19 | - `mark models` subcommand will list all available LLM models available 20 | 21 | ### [0.9.0] - 2025-02-18 22 | #### Fixed 23 | - OpenAI reasoning models now work with Mark 24 | 25 | #### Added 26 | - LLM is now a dependency to expand model availability to Mark and delegate low-level model interaction. 27 | - Added model specification test 28 | 29 | #### Removed 30 | - Import direct import of `openai` (referencing LLMs dependencies directly) 31 | 32 | ### [0.8.3] - 2025-01-20 33 | #### Removed 34 | - Import of `image_to_data_url` has been removed from LangChain. 35 | 36 | ### [0.8.2] - 2024-08-13 37 | #### Fixed 38 | - Slightly improved errors raise when scraping pages on low memory hardware 39 | 40 | ### [0.8.1] - 2024-08-13 41 | #### Changed 42 | - Bump langchain from 0.2.14 to 0.2.15 43 | - Bump httpx from 0.27.0 to 0.27.2 44 | - Bump openai from 1.42.0 to 1.43.0 45 | - Bump ipython from 8.26.0 to 8.27.0 46 | - Bump langchain-community from 0.2.4 to 0.2.12 47 | - Bump openai from 1.41.1 to 1.42.0 48 | - Bump openai from 1.14.2 to 1.41.1 49 | - Bump langchain from 0.2.1 to 0.2.14 50 | - Bump markdownify from 0.12.1 to 0.13.1 51 | - Bump pyyaml from 6.0.1 to 6.0.2 52 | - Bump langchain-community from 0.2.1 to 0.2.4 53 | - Bump ipython from 8.21.0 to 8.26.0 54 | - Bump flake8 from 7.1.0 to 7.1.1 55 | - Bump pytest from 6.2.5 to 8.3. 
56 | 57 | ### [0.8.0] - 2024-08-13 58 | #### Added 59 | - Support for `--model` option to allow for selecting a specific OpenAI model 60 | 61 | ### [0.7.3] - 2024-07-24 62 | #### Added 63 | - Support for `--version` option in the CLI 64 | 65 | ### [0.7.2] - 2024-07-24 66 | #### Added 67 | - Aliases for cli options `--system` (`-s`) and `--generate-images` (`-i`) 68 | 69 | ### [0.7.1] - 2024-06-28 70 | #### Fixed 71 | - Gracefully handle timeouts when fetching urls 72 | 73 | ### [0.7.0] - 2024-06-27 74 | #### Changed 75 | - Updated the scraping logic to render pages as clean markdown which exposes the LLM to urls on the page. 76 | 77 | ### [0.6.3] - 2024-06-20 78 | #### Fixed 79 | - Gracefully handle broken links in markdown files. 80 | 81 | ### [0.6.2] - 2024-06-20 82 | #### Added 83 | - Cleaner OpenAI error handling for common issues 84 | 85 | ### [0.6.1] - 2024-06-20 86 | #### Removed 87 | - Response log for image generation 88 | 89 | ### [0.6.0] - 2024-06-20 90 | #### Added 91 | - Ability to override the OpenAI endpoint with OPENAI_API_BASE_URL env var 92 | 93 | ### [0.5.0] - 2024-06-18 94 | #### Added 95 | - Adding experimental support for DALL-E image generation 96 | 97 | ### [0.4.0] - 2024-06-06 98 | #### Added 99 | - Requests are now logged to `~/.mark/logs/` 100 | 101 | ### [0.3.6] - 2024-06-06 102 | #### Fixed 103 | - USER_AGENT warning 104 | 105 | ### [0.3.5] - 2024-06-06 106 | #### Added 107 | - Included additional files in the project for `templates/default_system_prompt.md`. 108 | 109 | #### Changed 110 | - Updated default system prompt and refactored into the templates directory. 111 | 112 | ### [0.3.4] - 2024-06-04 113 | #### Added 114 | - Returned a pretty error if no `OPENAI_API_KEY` is found. 115 | 116 | ### [0.3.3] - 2024-06-04 117 | #### Fixed 118 | - Fixed stdin use case. 119 | 120 | ### [0.3.2] - 2024-06-04 121 | #### Added 122 | - Added local file references to page links. 
123 | 124 | ### [0.3.1] - 2024-06-04 125 | #### Added 126 | - Utilized LangChain image utilities for local image encoding. 127 | 128 | ### [0.3.0] - 2024-06-04 129 | #### Added 130 | - Bumped version. 131 | - Added new dependencies: `langchain ^0.2.1` and `langchain-community ^0.2.1`. 132 | 133 | ### [0.2.3] - 2024-05-29 134 | #### Changed 135 | - Updated model. 136 | 137 | ### [0.2.2] - 2024-05-03 138 | #### Fixed 139 | - Fixed issue handling malformed image tags. 140 | 141 | ### [0.2.1] - 2024-04-29 142 | #### Fixed 143 | - Fixed pathing issue with images. 144 | 145 | ### [0.2.0] - 2024-04-23 146 | #### Added 147 | - Added parsing support for images in markdown text. 148 | - Added new dependencies: `beautifulsoup4 ^4.12.3` and `markdown ^3.6`. 149 | 150 | ### [0.1.0] - 2024-03-25 151 | #### Added 152 | - Initial setup with dependencies: `python ^3.8`, `PyYAML 5.4.1`, `ipython 8.21.0`, `openai 1.14.2`. 153 | - Replace `typer` with `click` for CLI tool. 154 | - Setup CLI interface with entry point `agent_gpt.__main__:cli`. 155 | - Added development dependencies: `pytest ^6.2.5`. 156 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Ryan Elston 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Build Status](https://img.shields.io/github/actions/workflow/status/relston/mark/.github/workflows/python-package.yml) 2 | ![License](https://img.shields.io/github/license/relston/mark) 3 | 4 | # Mark 5 | **Mark lets you seamlessly use markdown, images and links to interact with LLMs** 6 | 7 | > "Markdown is still the best format for feeding into language models"
- Simon Willison (co-creator of Django) 8 | 9 | Why Markdown + LLMs? For a full rundown of the benefits and use cases, check out [Introducing 'Mark', a Markdown CLI tool for LLMs](https://relston.github.io/markdown/gpt4o/cli/2024/06/07/introducing-mark.html) 10 | 11 | ## Key Features 12 | - Interact with LLMs using Markdown 13 | - Visual recognition of markdown image references via multi-modal LLMs 14 | - Local and remote links are scraped for context 15 | - LLM responses appended directly into Markdown files 16 | - `stdin` and `stdout` support for piping 17 | - Support for Image Generation 18 | 19 | ## Example 20 | ```markdown 21 | Describe ![this image](https://www.animal.photos/mamm1/cat-tig1_files/bengal12.jpg) to me in one short sentence. 22 | 23 | # GPT Response (model: gpt-4o-2024-05-13, system: default) 24 | A Bengal tiger is walking on a sandy path with greenery in the background. 25 | 26 | # User Response 27 | List each section of content on [this page](https://en.wikipedia.org/wiki/Bengal_tiger) 28 | 29 | # GPT Response (model: gpt-4o-2024-05-13, system: default) 30 | ## Sections of Content on the Bengal Tiger Wikipedia Page 31 | 32 | 1. **Taxonomy** 33 | - Genetic ancestry 34 | 35 | 2. **Characteristics** 36 | - Body weight and size 37 | 38 | 3. **Distribution and habitat** 39 | - India 40 | - Bangladesh 41 | ... 42 | ``` 43 | 44 | This allows you to carry on a conversation directly in the markdown file - or simply integrate the response back into the document you're working on. This is a powerful, flexible and natural way to interact with LLMs that unlocks countless use cases. `mark`'s focus on explicitly using Markdown as both a prompt and response medium for LLMs presents a unique approach. By blending interactive, version-controlled Markdown documents with LLM responses in a CLI tool offers an innovative workflow for those looking to integrate LLM interactions into their development and documentation processes. 
45 | 46 | # Install the Mark CLI 47 | ```bash 48 | pip install mark 49 | ``` 50 | 51 | # Usage 52 | By default, `mark` will read a markdown file, extract any context references, and send them to the LLM. The responses are then appended to the markdown file. 53 | ```bash 54 | mark path/to/markdown.md 55 | ``` 56 | *Requires an OpenAI API key in the `OPENAI_API_KEY` environment variable* 57 | 58 | Also supports `stdin` with `stdout` for piping LLM responses into other tools 59 | ```bash 60 | cat path/to/markdown.md | mark 61 | # LLM response.... 62 | ``` 63 | 64 | ## Use a specific LLM model 65 | You can specify a different LLM model to use with the `--model` (or `-m`) flag 66 | ```bash 67 | mark path/to/markdown.md --model gpt-4o-2024-05-13 68 | ``` 69 | 70 | ## Custom system prompts 71 | The system prompts folder is located at `~/.mark/system_prompts` and it includes a `default.md` prompt. You can add any additional system prompts you'd like to use in this folder and use them with the `--system` (or `-s`) flag. 72 | ```bash 73 | # ~/.mark/system_prompts/custom.md 74 | mark path/to/markdown.md --system custom 75 | ``` 76 | 77 | ## Override the OpenAI API endpoint 78 | If you want to use a different LLM API endpoint that is fully compatible with the OpenAI API, set the `OPENAI_API_BASE_URL` environment variable to that endpoint value. This should enable you to use OpenAI proxy services like [credal.ai](https://www.credal.ai/), or other LLMs that are compatible with the OpenAI SDK. 79 | 80 | ## Image Generation 81 | To generate an image based on the input just add the `--generate-image` (or `-i`) flag to the command 82 | ```bash 83 | mark path/to/markdown.md --generate-image 84 | ``` 85 | This will generate an image using the 'dall-e-3' model and append it to the markdown file. 
86 | 87 | # Development 88 | ## Local Setup 89 | ```bash 90 | poetry install 91 | ``` 92 | *[Requires poetry](https://python-poetry.org/docs/)* 93 | 94 | ## Run the CLI Tool locally 95 | ```bash 96 | poetry run mark path/to/markdown.md 97 | ``` 98 | 99 | ## Run the tests 100 | ```bash 101 | poetry run python -m pytest 102 | ``` 103 | 104 | ## Auto-fix lint errors 105 | ```bash 106 | poetry run autopep8 --in-place --aggressive --aggressive --recursive . 107 | ``` 108 | -------------------------------------------------------------------------------- /mark/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/relston/mark/149eca99d91275a1bf05f273cb77e46d87003f9a/mark/__main__.py -------------------------------------------------------------------------------- /mark/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from click_default_group import DefaultGroup 3 | from click.testing import CliRunner 4 | from mark import llm 5 | from llm.cli import cli as llm_cli 6 | from mark.llm_request import LLMRequest 7 | from mark.markdown_file import MarkdownFile 8 | from mark.config import get_config 9 | from importlib.metadata import version, PackageNotFoundError 10 | 11 | try: 12 | package_version = version("mark") 13 | except PackageNotFoundError: 14 | package_version = "unknown" 15 | 16 | DEFAULT_MODEL = "gpt-4o" 17 | DALL_E_MODEL = "dall-e-3" 18 | 19 | 20 | @click.group( 21 | cls=DefaultGroup, 22 | default="down", 23 | default_if_no_args=True, 24 | ) 25 | @click.version_option(version=package_version) 26 | def mark_cli(): 27 | """Markdown powered LLM CLI - Multi-modal AI text generation tool""" 28 | 29 | 30 | @mark_cli.command(name="down") 31 | @click.argument('file', type=click.File()) 32 | @click.option('--system', '-s', type=click.STRING, 33 | default='default', help='The system prompt to use') 34 | @click.option('--model', '-m', 
type=click.STRING, help='The llm model') 35 | @click.option('--generate-image', '-i', is_flag=True, default=False, 36 | help='EXPERIMENTAL: Generate an image using DALL-E.') 37 | def down(file, system, model, generate_image): 38 | """ 39 | Default: Process markdown file or stdin 40 | 41 | In-document Thread Example: 42 | mark path/to/markdown.md 43 | 44 | stdin Example: 45 | echo "Hello, World!" | mark - 46 | """ 47 | system_prompt = get_config().system_prompts().get(system, 'default') 48 | markdown_file = MarkdownFile(file) 49 | 50 | if not model: 51 | model = DALL_E_MODEL if generate_image else DEFAULT_MODEL 52 | 53 | request = LLMRequest(model) \ 54 | .with_prompt(markdown_file.content) \ 55 | .with_system_message(system_prompt) 56 | 57 | [request.with_image(image) for image in markdown_file.images] 58 | [request.with_link(link) for link in markdown_file.links] 59 | 60 | if generate_image: 61 | response = llm.generate_image(request) 62 | else: 63 | response = llm.get_completion(request) 64 | 65 | response.with_system(system) 66 | 67 | if markdown_file.file_path: 68 | with open(markdown_file.file_path, "a") as file: 69 | file.write(response.to_markdown()) 70 | else: 71 | click.echo(response.content) 72 | 73 | 74 | @mark_cli.command("models") 75 | def models_command(): 76 | """List available llm models""" 77 | runner = CliRunner() 78 | result = runner.invoke(llm_cli, ["models"]) 79 | if result.exception: 80 | raise click.ClickException(str(result.exception)) 81 | click.echo(result.output) 82 | 83 | 84 | if __name__ == "__main__": 85 | mark_cli() 86 | -------------------------------------------------------------------------------- /mark/config.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import os 3 | from importlib.resources import read_text 4 | 5 | 6 | class Config: 7 | DEFAULT_SYSTEM_PROMPT_TEMPLATE_PATH = 'templates/default_system_prompt.md' 8 | 9 | def __init__(self): 10 | 
self.config_dir = os.getenv( 11 | 'MARK_CONFIG_PATH', 12 | os.path.expanduser("~/.mark")) 13 | self.system_prompts_dir = f"/{self.config_dir}/system_prompts" 14 | self.default_system_prompt = f"{self.system_prompts_dir}/default.md" 15 | self.log_folder = f"{self.config_dir}/logs" 16 | 17 | if not os.path.exists(self.system_prompts_dir): 18 | os.makedirs(self.system_prompts_dir) 19 | 20 | if not os.path.exists(self.default_system_prompt): 21 | default_config = read_text('templates', 'default_system_prompt.md') 22 | 23 | with open(os.path.expanduser(self.default_system_prompt), "w") as file: 24 | file.write(default_config) 25 | 26 | if not os.path.exists(self.log_folder): 27 | os.makedirs(self.log_folder) 28 | 29 | def system_prompts(self): 30 | system_prompts = {} 31 | for filename in os.listdir(self.system_prompts_dir): 32 | filepath = os.path.join(self.system_prompts_dir, filename) 33 | with open(filepath, "r") as file: 34 | system_prompt_name = os.path.splitext(filename)[0] 35 | system_prompts[system_prompt_name] = file.read() 36 | return system_prompts 37 | 38 | def log(self, content): 39 | # Get current date and time as string 40 | dt_string = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 41 | log_file = f"{self.log_folder}/{dt_string}.md" 42 | with open(log_file, "w") as file: 43 | file.write(content) 44 | 45 | 46 | _config = None 47 | 48 | 49 | def reset(): 50 | """ 51 | Reset the config object. 52 | """ 53 | global _config 54 | _config = None 55 | 56 | 57 | def get_config(): 58 | """ 59 | Return memoized config object. 
60 | """ 61 | global _config 62 | if not _config: 63 | _config = Config() 64 | return _config 65 | -------------------------------------------------------------------------------- /mark/llm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import llm 4 | from llm.default_plugins.openai_models import openai, Chat, AsyncChat 5 | from mark.config import get_config 6 | from mark.llm_request import LLMRequest 7 | from mark.llm_response import LLMResponse, LLMImageResponse 8 | 9 | # TODO: Remove this. Only needed to support image generation. 10 | # Should differ to llm model registration 11 | OPENAI_BASE_URL = os.getenv('OPENAI_API_BASE_URL', openai.base_url) 12 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 13 | if not OPENAI_API_KEY: 14 | click.echo( 15 | "OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.") 16 | exit(1) 17 | 18 | client = openai.OpenAI( 19 | api_key=OPENAI_API_KEY, 20 | base_url=OPENAI_BASE_URL 21 | ) 22 | 23 | 24 | def handle_openai_errors(func): 25 | def error_handler(*args, **kwargs): 26 | try: 27 | return func(*args, **kwargs) 28 | except openai.APIConnectionError as e: 29 | click.echo(f"{OPENAI_BASE_URL} could not be reached") 30 | click.echo(e.__cause__) 31 | exit(1) 32 | except openai.RateLimitError: 33 | click.echo("RateLimitError was received; we should back off a bit.") 34 | exit(1) 35 | except openai.BadRequestError as e: 36 | click.echo("BadRequestError was received") 37 | click.echo(e.message) 38 | exit(1) 39 | except openai.APIStatusError as e: 40 | click.echo("Another non-200-range status code was received") 41 | click.echo(e.status_code) 42 | click.echo(e.response) 43 | exit(1) 44 | 45 | return error_handler 46 | 47 | 48 | def get_completion(llm_request): 49 | """ 50 | Get completion from the OpenAI model for the given prompt and agent. 
51 | """ 52 | get_config().log(llm_request.to_log()) 53 | 54 | response_text = _llm_call_completion(llm_request) 55 | 56 | return LLMResponse(response_text, llm_request.model) 57 | 58 | 59 | def generate_image(llm_request): 60 | get_config().log(llm_request.to_log()) 61 | 62 | response = _call_generate_image( 63 | llm_request.to_flat_prompt(), 64 | llm_request.model) 65 | 66 | return LLMImageResponse( 67 | response.url, 68 | llm_request.model, 69 | response.revised_prompt) 70 | 71 | 72 | @handle_openai_errors 73 | def _call_generate_image(prompt, model): 74 | # TODO: Can I manually register the dall-e-3 using the llm api? 75 | response = client.images.generate( 76 | prompt=prompt, 77 | model=model, 78 | size="1024x1024", 79 | n=1 80 | ) 81 | 82 | return response.data[0] 83 | 84 | 85 | @handle_openai_errors 86 | def _llm_call_completion(llm_request: LLMRequest) -> str: 87 | model = llm.get_model(llm_request.model) 88 | if isinstance(model, (Chat, AsyncChat)) and model.api_base == None: 89 | # Backwards compatible with the older override 90 | model.api_base = OPENAI_BASE_URL 91 | 92 | attachment = [] 93 | for image in llm_request.images: 94 | if image.is_web_reference(): 95 | attachment.append(llm.Attachment(url=image.src)) 96 | else: 97 | attachment.append(llm.Attachment(path=image.src)) 98 | 99 | # llm.Attachment(path="pelican.jpg"), 100 | # llm.Attachment(url="https://static.simonwillison.net/static/2024/pelicans.jpg"), 101 | # llm.Attachment(content=b"binary image content here") 102 | return model.prompt( 103 | llm_request.prompt, 104 | system=llm_request.system_content(), 105 | attachments=attachment, 106 | stream=False # we do not support streaming 107 | ) 108 | -------------------------------------------------------------------------------- /mark/llm_request.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | from .markdown_file import Image, Link 3 | from typing import ( 4 | List, 5 | 
Optional 6 | ) 7 | 8 | 9 | class LLMRequest: 10 | model: str 11 | prompt: Optional[str] 12 | system_message: Optional[str] 13 | images: List[Image] 14 | links: List[Link] 15 | 16 | def __init__(self, model): 17 | """ 18 | Can serialize itself into a payload that can be sent to the OpenAI API (potentially others in the future) 19 | """ 20 | self.system_message = None 21 | self.prompt = None 22 | self.model = model 23 | self.images = [] 24 | self.links = [] 25 | 26 | def with_system_message(self, system_message): 27 | self.system_message = system_message 28 | return self 29 | 30 | def with_prompt(self, prompt): 31 | self.prompt = prompt 32 | return self 33 | 34 | def with_image(self, image): 35 | self.images.append(image) 36 | return self 37 | 38 | def with_link(self, document): 39 | self.links.append(document) 40 | return self 41 | 42 | def system_content(self) -> str: 43 | system_content = "" 44 | 45 | if self.links: 46 | link_content_block = "---".join([str(link) for link in self.links]) 47 | system_content += link_content_block 48 | 49 | if self.system_message: 50 | system_content += "\n" + self.system_message 51 | 52 | return system_content 53 | 54 | def to_payload(self): 55 | system_message = {"role": "system", "content": self.system_content()} 56 | 57 | if self.images: 58 | user_content = [{'type': 'text', 'text': self.prompt}] 59 | for image in self.images: 60 | if image.url: 61 | user_content.append( 62 | {'type': 'image_url', 'image_url': {'url': image.url}}) 63 | else: 64 | user_content = self.prompt 65 | 66 | user_message = {"role": "user", "content": user_content} 67 | return [system_message, user_message] 68 | 69 | def to_flat_prompt(self) -> str: 70 | return self.system_content() + "\n" + self.prompt 71 | 72 | def to_log(self) -> str: 73 | return dedent(""" 74 | # System message 75 | --- 76 | """) \ 77 | + self.system_content() \ 78 | + dedent(""" 79 | --- 80 | # User Message 81 | --- 82 | """) \ 83 | + self.prompt \ 84 | + dedent(""" 85 | --- 86 | # 
Images 87 | --- 88 | """) \ 89 | + "\n".join([image.url for image in self.images]) 90 | -------------------------------------------------------------------------------- /mark/llm_response.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | 3 | 4 | class LLMResponse(object): 5 | RESPONSE_TEMPLATE = dedent( 6 | """ 7 | # GPT Response (model: {model}, system: {system}) 8 | {content} 9 | 10 | # User Response 11 | """ 12 | ) 13 | 14 | def __init__(self, content, model): 15 | self.model = model 16 | self.content = content 17 | self.system = 'default' 18 | 19 | def with_system(self, system): 20 | self.system = system 21 | return self 22 | 23 | def to_markdown(self): 24 | content = self.content 25 | return self.RESPONSE_TEMPLATE.format( 26 | model=self.model, system=self.system, content=content) 27 | 28 | 29 | class LLMImageResponse(LLMResponse): 30 | def __init__(self, image_url, model, revised_prompt=None): 31 | super().__init__(image_url, model) 32 | self.revised_prompt = revised_prompt 33 | 34 | def to_markdown(self): 35 | content = f"![Generated Image]({self.content})" 36 | 37 | if self.revised_prompt: 38 | content = f"{self.revised_prompt}\n\n{content}" 39 | 40 | return self.RESPONSE_TEMPLATE.format( 41 | model=self.model, system=self.system, content=content) 42 | -------------------------------------------------------------------------------- /mark/markdown_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from langchain_core.documents import Document 4 | from io import TextIOWrapper 5 | from textwrap import dedent 6 | import click 7 | from mark import scraper 8 | import base64 9 | import mimetypes 10 | 11 | """ 12 | MarkdownFile 13 | Parses the markdown and extracts image elements from the file, resolving the paths of local images. 
class MarkdownFile:
    """Parses a markdown document and exposes its image and link
    references, resolving local paths relative to the file's directory."""

    def __init__(self, file_wrapper: TextIOWrapper):
        """
        Args:
            file_wrapper: An open text stream, usually obtained from
                ``open()`` or stdin. When the stream carries a file name,
                relative references are resolved against that file's
                directory; otherwise against the current working directory.
        """
        self.file_path = None
        self.file_dir = None
        if hasattr(file_wrapper, 'name') and file_wrapper.name != '':
            self.file_path = file_wrapper.name
            self.file_dir = os.path.dirname(file_wrapper.name)
        else:
            self.file_dir = os.getcwd()
        self.file_content = file_wrapper.read()
        # Lazy caches; None means "not parsed yet" (see properties below).
        self._images = None
        self._links = None

    @property
    def content(self) -> str:
        """The raw markdown text."""
        return self.file_content

    @property
    def images(self):
        """All image references (``![alt](src)``) in the document."""
        # Fixed: `if not self._images` treated an empty result list as
        # "not parsed yet" and re-scanned the document on every access.
        if self._images is None:
            self._images = self._parse_elements(Image)
        return self._images

    @property
    def links(self):
        """All link references (``[text](src)``) in the document."""
        if self._links is None:
            self._links = self._parse_elements(Link)
        return self._links

    def _parse_elements(self, cls):
        # cls must expose REGX_PATTERN with two groups: (text, src).
        matches = re.findall(cls.REGX_PATTERN, self.file_content)
        return [
            cls.from_reference_folder(self.file_dir)
            .with_src(src)
            .with_text(text)
            for text, src in matches
        ]


class PageReference:
    """A single ``[text](src)``-style reference found in a markdown file."""

    @classmethod
    def from_reference_folder(cls, folder):
        """Alternate constructor: reference resolved relative to *folder*."""
        return cls(folder)

    def __init__(self, reference_folder, src=None):
        self.reference_folder = reference_folder
        self.src = src
        self.uri = None
        # Fixed: link_text was only ever set by with_text(), so accessing
        # it on a reference built without text raised AttributeError.
        self.link_text = None
        if src:
            self._resolve_uri()

    def with_src(self, src):
        """Set the raw source string and resolve it to a URI (fluent)."""
        self.src = src
        self._resolve_uri()
        return self

    def with_text(self, text):
        """Set the link/alt text (fluent)."""
        self.link_text = text
        return self

    def is_web_reference(self):
        """True when the source is an http(s) URL rather than a local path."""
        # Fixed: a bare startswith("http") also matched local paths such
        # as "http-assets/logo.png"; require an explicit scheme.
        return self.src.startswith(("http://", "https://"))

    def _resolve_uri(self):
        # Web URLs pass through untouched; local paths are resolved
        # relative to the folder containing the markdown file.
        if self.is_web_reference():
            self.uri = self.src
        else:
            self.uri = os.path.normpath(
                os.path.join(
                    self.reference_folder,
                    self.src))


class Image(PageReference):
    """An image reference; local files are inlined as base64 data URLs."""

    # Regular expression to find Markdown image syntax with alt text
    REGX_PATTERN = r'!\[(.*?)\]\((.*?)\)'

    @property
    def url(self):
        """A URL usable by an LLM API: the original web URL, or a data URL
        for local files. Emits a warning and returns '' when the local
        file is missing or is a directory."""
        if self.is_web_reference():
            return self.uri
        else:
            try:
                return Image.image_to_data_url(self.uri)
            except (FileNotFoundError, IsADirectoryError):
                click.echo(f"Image Reference {self.src} not found. Skipping")
                return ''

    @classmethod
    def encode_image(cls, image_path: str) -> str:
        """Get base64 string from image URI.

        Args:
            image_path: The path to the image.

        Returns:
            The base64 string of the image.
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    @classmethod
    def image_to_data_url(cls, image_path: str) -> str:
        """Get data URL from image URI.

        Args:
            image_path: The path to the image.

        Returns:
            The data URL of the image.
        """
        encoding = cls.encode_image(image_path)
        # Fixed: guess_type() returns None for unknown extensions, which
        # previously produced the literal string "data:None;base64,...".
        mime_type = mimetypes.guess_type(image_path)[0] or "application/octet-stream"
        return f"data:{mime_type};base64,{encoding}"
def get(url: str) -> Page:
    """Fetch *url* with a headless browser and return it as a Page.

    The rendered HTML is stripped of scripts/styles, converted to
    markdown, and wrapped in a Page; the <title> text is attached when
    the document has one.
    """
    raw_html = get_rendered_html(url)
    clean_soup = _clean_soup_from_html(raw_html)
    markdown = _markdown_from_soup(clean_soup)
    page = Page(url, body=markdown, soup=clean_soup)

    if title := clean_soup.find('title'):
        page.with_title(title.text)

    return page


def get_rendered_html(url: str) -> str:
    """Render *url* in headless Chromium and return the final HTML.

    Browser failures are reported to the user and degrade to a
    placeholder string so the caller still receives a page body.
    """
    try:
        return asyncio.run(_render_page(url))
    except pyppeteer.errors.BrowserError:
        click.echo(f"BrowserError while fetching {url}")
        # Fixed: previously returned the truncated "BrowserError while
        # fetching", inconsistent with the parallel Timeout branch below.
        return "BrowserError while fetching page"
    except pyppeteer.errors.TimeoutError:
        click.echo(f"Timeout while fetching {url}")
        return "Timeout while fetching page"


async def _render_page(url: str) -> str:
    """Drive pyppeteer: open a page, navigate, and return its content.

    The browser is always closed, even when navigation raises.
    """
    browser = None
    try:
        browser = await pyppeteer.launch()
        page = await browser.newPage()
        await page.setUserAgent(DEFAULT_USER_AGENT)
        await page.goto(url)
        rendered_html = await page.content()
    finally:
        if browser:
            await browser.close()
    return rendered_html


def _clean_soup_from_html(html: str) -> BeautifulSoup:
    """Parse *html* and strip noise that is useless as LLM context:
    script/meta/link/style tags and class/style attributes."""
    soup = BeautifulSoup(html, 'html.parser')

    # Tags removed entirely (content included)
    tags_to_decompose = ['script', 'meta', 'link', 'style']

    for tag in soup.find_all(True):
        # Remove class attributes
        if 'class' in tag.attrs:
            del tag['class']

        # Remove style attributes
        if 'style' in tag.attrs:
            del tag['style']

        # Decompose unwanted tags
        if tag.name in tags_to_decompose:
            tag.decompose()

    return soup


def _markdown_from_soup(soup: BeautifulSoup) -> str:
    """Convert cleaned HTML to markdown, collapsing runs of 3+ blank
    lines left behind by decomposed tags down to a single blank line."""
    raw_markdown_text = MarkdownConverter().convert_soup(soup)
    return re.sub(r'\n{3,}', '\n\n', raw_markdown_text)
-------------------------------------------------------------------------------- 1 | [project] 2 | name = "mark" 3 | version = "0.10.2" 4 | description = "Mark lets you seamlessly use markdown, images and links to interact with LLMs" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | classifiers = [ 8 | "Programming Language :: Python :: 3", 9 | "Operating System :: OS Independent", 10 | ] 11 | license = "MIT" 12 | license-files = ["LICEN[CS]E*"] 13 | 14 | [project.urls] 15 | Homepage = "https://github.com/relston/mark" 16 | Issues = "https://github.com/relston/mark/issues" 17 | 18 | [tool.poetry] 19 | name = "mark" 20 | version = "0.10.2" 21 | description = "Mark lets you seamlessly use markdown, images and links to interact with LLMs" 22 | authors = ["Ryan Elston "] 23 | 24 | # Include additional files 25 | include = [ 26 | "templates/default_system_prompt.md" 27 | ] 28 | 29 | [tool.poetry.dependencies] 30 | python = "^3.10" 31 | PyYAML = "6.0.2" 32 | ipython = "8.32.0" 33 | click = "^8.1.8" 34 | beautifulsoup4 = "^4.12.3" 35 | langchain = "^0.2.16" 36 | langchain-community = "^0.2.15" 37 | pyppeteer = "^2.0.0" 38 | markdownify = "^0.14.1" 39 | llm = "^0.25" 40 | click-default-group = "^1.2.4" 41 | 42 | [tool.poetry.scripts] 43 | mark = "mark.cli:mark_cli" 44 | 45 | [tool.poetry.group.dev.dependencies] 46 | pytest = "^8.3.4" 47 | respx = "^0.22.0" 48 | httpx = "^0.28.1" 49 | flake8 = "^7.1.1" 50 | autopep8 = "^2.3.1" 51 | 52 | [build-system] 53 | requires = ["poetry-core>=1.0.0"] 54 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /templates/default_system_prompt.md: -------------------------------------------------------------------------------- 1 | You are a helpful LLM agent that will receive user input in the form of a markdown file. 2 | The contents of the file will be used as context and the specific prompt from the use will be located at the end of the file. 
import pytest
import os
from unittest.mock import patch
import llm


@pytest.fixture(autouse=True)
def mock_openai_key():
    """Provide a dummy OPENAI_API_KEY for every test."""
    os.environ['OPENAI_API_KEY'] = 'test_key'


@pytest.fixture(autouse=True)
def mock_cwd(tmp_path):
    """Make os.getcwd() return a per-test temporary directory."""
    with patch('os.getcwd') as mock:
        mock.return_value = tmp_path
        yield mock


@pytest.fixture
def mock_stdout():
    """Capture CLI output by patching click.echo."""
    with patch('click.echo') as mock:
        yield mock


@pytest.fixture
def mock_llm_response():
    """Patch the llm model prompt call so no real request is made."""
    with patch('llm.models._Model.prompt') as mock:
        yield mock


@pytest.fixture
def mock_llm_get_model():
    """Spy on llm.get_model while delegating to the real implementation."""
    get_model_method = llm.get_model

    with patch('llm.get_model') as mock:
        mock.side_effect = get_model_method
        yield mock


@pytest.fixture
def mock_image_generation():
    """Patch mark.llm's image-generation entry point."""
    with patch('mark.llm._call_generate_image') as mock:
        yield mock


@pytest.fixture
def create_file(tmp_path):
    """Factory fixture: write a text or binary file under tmp_path,
    creating parent directories as needed, and return its path."""
    def _create_file(file_path, content, binary=False):
        file = tmp_path / file_path
        file.parent.mkdir(parents=True, exist_ok=True)
        if binary:
            file.write_bytes(content)
        else:
            file.write_text(content, encoding="utf-8")
        return file
    return _create_file


@pytest.fixture
def mock_web_page():
    """Register canned HTML per URL and patch the scraper's renderer.

    Yields a registrar ``_mock(url, page_content)``; any URL fetched via
    mark.scraper.get_rendered_html then returns the registered content.
    """
    url_to_content = {}

    def _mock(url, page_content):
        url_to_content[url] = page_content

    with patch('mark.scraper.get_rendered_html') as mock:
        def side_effect(url):
            return url_to_content[url]
        mock.side_effect = side_effect
        yield _mock
url_to_content[url] 69 | mock.side_effect = side_effect 70 | yield _mock 71 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from mark.cli import mark_cli 2 | from textwrap import dedent 3 | from mark import config 4 | import pytest 5 | import os 6 | import sys 7 | import io 8 | import llm 9 | from unittest.mock import Mock 10 | 11 | """ 12 | These tests are meant to act as 'functional-lite'. Maximizing code coverage for 13 | each of the main use cases of the CLI command and minimizing the number of 14 | basic unit tests needed for each individual module. 15 | 16 | We just mock out the files and the OpenAI API calls, and then test the CLI 17 | """ 18 | 19 | 20 | class TestCLI: 21 | @pytest.fixture(autouse=True) 22 | def use_tmp_config_path(self, tmp_path): 23 | # MARK_CONFIG_PATH defaults to ~/.mark 24 | # for all tests we use a temporary directory 25 | self.config_path = tmp_path / 'config' 26 | os.environ['MARK_CONFIG_PATH'] = str(self.config_path) 27 | 28 | @pytest.fixture(autouse=True) 29 | def define_files( 30 | self, 31 | create_file, 32 | mock_llm_response, 33 | mock_web_page, 34 | mock_image_generation): 35 | config.reset() 36 | 37 | # Given a markdown file with the following content 38 | self.mock_markdown_file_content = dedent(""" 39 | A Markdown file with various images and links 40 | 41 | Local image: 42 | ![Local Image](./images/sample.png) 43 | 44 | Remote image: 45 | ![Remote Image](https://example.com/image.png) 46 | 47 | Relative image outside directory: 48 | ![Outside Image](../images/outside.png) 49 | 50 | External url link: 51 | [External URL](https://example.com/some-article) 52 | 53 | Local file link: 54 | [Anther Reference](./docs/another-reference.md) 55 | """) 56 | 57 | # and the files exists in the file system 58 | self.markdown_file = create_file( 59 | "test.md", self.mock_markdown_file_content) 60 | 
create_file("./images/sample.png", b"sample image data", binary=True) 61 | create_file( 62 | "../images/outside.png", 63 | b"outside image data", 64 | binary=True) 65 | create_file("./docs/another-reference.md", "Another reference content") 66 | 67 | # and the external url link returns this response 68 | html_content = """ 69 | 70 | 71 | 72 | Basic HTML Page 73 | 74 | 75 |

Welcome to My Page

76 | Visit Example.com 77 | 78 | 79 | """ 80 | mock_web_page('https://example.com/some-article', html_content) 81 | 82 | # and llm returning this response 83 | mock_llm_response.return_value = "Test completion" 84 | mock_image_generation.return_value = Mock( 85 | url='https://generated.image.url/image.png', 86 | revised_prompt='A revised mock image prompt' 87 | ) 88 | 89 | self.default_system_prompt = dedent( 90 | """ 91 | You are a helpful LLM agent that will receive user input in the form of a markdown file. 92 | The contents of the file will be used as context and the specific prompt from the use will be located at the end of the file. 93 | Your response to the users request should also be written in markdown format. 94 | 95 | RULES: 96 | - Do not echo back any of the input into your response to the user. 97 | - If using a heading in your response, start with a level 2 heading 98 | """ 99 | ) 100 | 101 | self.default_expected_context = dedent( 102 | """ 103 | Link Text: External URL 104 | SRC: https://example.com/some-article 105 | Page Title: Basic HTML Page 106 | Page Content: 107 | 108 | 109 | Basic HTML Page 110 | 111 | Welcome to My Page 112 | ================== 113 | 114 | [Visit Example.com](https://www.example.com) 115 | 116 | 117 | --- 118 | Link Text: Anther Reference 119 | SRC: ./docs/another-reference.md 120 | Page Title: another-reference.md 121 | Page Content: 122 | Another reference content 123 | """ 124 | ) 125 | 126 | self.default_expected_system_message = self.default_expected_context + \ 127 | self.default_system_prompt 128 | 129 | self.default_expected_llm_request = [ 130 | {'role': 'system', 'content': self.default_expected_system_message}, 131 | {'role': 'user', 'content': [ 132 | {'type': 'text', 'text': self.mock_markdown_file_content}, 133 | {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,c2FtcGxlIGltYWdlIGRhdGE='}}, 134 | {'type': 'image_url', 'image_url': {'url': 'https://example.com/image.png'}}, 135 | {'type': 
'image_url', 'image_url': {'url': 'data:image/png;base64,b3V0c2lkZSBpbWFnZSBkYXRh'}}, 136 | ] 137 | } 138 | ] 139 | self.default_expected_attachements = [ 140 | llm.Attachment(path='./images/sample.png'), 141 | llm.Attachment(url='https://example.com/image.png'), 142 | llm.Attachment(path='../images/outside.png') 143 | ] 144 | 145 | def test_command_default(self, mock_llm_response): 146 | """Test CLI command without specifying an agent (default agent should be used).""" 147 | 148 | # Run the CLI command with only the markdown file 149 | mark_cli([str(self.markdown_file)], None, None, False) 150 | 151 | mock_llm_response.assert_called_once_with( 152 | self.mock_markdown_file_content, 153 | system=self.default_expected_system_message, 154 | attachments=self.default_expected_attachements, 155 | stream=False 156 | ) 157 | 158 | # The markdown file will be updated with the response 159 | expected_markdown_file_content = self.mock_markdown_file_content + \ 160 | dedent(""" 161 | # GPT Response (model: gpt-4o, system: default) 162 | Test completion 163 | 164 | # User Response 165 | """) 166 | 167 | assert self.markdown_file.read_text() == expected_markdown_file_content 168 | 169 | def test_command_with_stdin(self, mock_llm_response, mock_stdout): 170 | byte_string = self.mock_markdown_file_content.encode('utf-8') 171 | input = io.TextIOWrapper(io.BytesIO(byte_string), encoding='utf-8') 172 | sys.stdin = input 173 | 174 | mark_cli(['-'], None, None, False) 175 | 176 | mock_llm_response.assert_called_once_with( 177 | self.mock_markdown_file_content, 178 | system=self.default_expected_system_message, 179 | attachments=self.default_expected_attachements, 180 | stream=False 181 | ) 182 | 183 | mock_stdout.assert_called_once_with("Test completion") 184 | 185 | def test_command_custom_model(self, mock_llm_get_model, mock_llm_response): 186 | """ 187 | mark --model o1 path/to/markdown.md 188 | """ 189 | 190 | mark_cli(['--model', 'o1', str(self.markdown_file)], None, None, False) 
191 | 192 | mock_llm_get_model.assert_called_once_with('o1') 193 | 194 | mock_llm_response.assert_called_once_with( 195 | self.mock_markdown_file_content, 196 | system=self.default_expected_system_message, 197 | attachments=self.default_expected_attachements, 198 | stream=False 199 | ) 200 | 201 | def test_command_custom_agent(self, create_file, mock_llm_response): 202 | # Define a custom agent 203 | create_file( 204 | self.config_path / 'system_prompts/custom.md', 205 | """You're a custom agent that .....""" 206 | ) 207 | 208 | # Run the CLI command with the custom agent 209 | mark_cli([str(self.markdown_file), '--system=custom'], 210 | None, None, False) 211 | 212 | expected_system_message = self.default_expected_context + \ 213 | "\nYou're a custom agent that ....." 214 | 215 | mock_llm_response.assert_called_once_with( 216 | self.mock_markdown_file_content, 217 | system=expected_system_message, 218 | attachments=self.default_expected_attachements, 219 | stream=False 220 | ) 221 | 222 | # The markdown file will be updated indicating the custom agent 223 | expected_markdown_file_content = self.mock_markdown_file_content + \ 224 | dedent(""" 225 | # GPT Response (model: gpt-4o, system: custom) 226 | Test completion 227 | 228 | # User Response 229 | """) 230 | assert self.markdown_file.read_text() == expected_markdown_file_content 231 | 232 | def test_command_generate_image(self, mock_image_generation): 233 | """ 234 | Test CLI command with the --generate-image option. 
235 | """ 236 | 237 | mark_cli([str(self.markdown_file), '--generate-image'], 238 | None, None, False) 239 | 240 | expected_prompt = self.default_expected_system_message + \ 241 | "\n" + self.mock_markdown_file_content 242 | mock_image_generation.assert_called_once_with( 243 | expected_prompt, "dall-e-3") 244 | 245 | # The markdown file will be updated with the generated image URL 246 | expected_markdown_file_content = self.mock_markdown_file_content + \ 247 | dedent(""" 248 | # GPT Response (model: dall-e-3, system: default) 249 | A revised mock image prompt 250 | 251 | ![Generated Image](https://generated.image.url/image.png) 252 | 253 | # User Response 254 | """) 255 | 256 | assert self.markdown_file.read_text() == expected_markdown_file_content 257 | 258 | def test_command_models(self, mock_stdout): 259 | """ 260 | Test for `mark models` 261 | """ 262 | 263 | mark_cli(['models'], None, None, False) 264 | 265 | call = mock_stdout.call_args_list[0] 266 | assert 'OpenAI Chat' in call[0][0] 267 | -------------------------------------------------------------------------------- /tests/test_scraper.py: -------------------------------------------------------------------------------- 1 | import pyppeteer 2 | from mark import scraper 3 | from unittest.mock import patch 4 | 5 | 6 | def test_page_scrape(mock_web_page): 7 | html_content = """ 8 | 9 | 10 | 11 | Basic HTML Page 12 | 13 | 14 |

Welcome to My Page

def test_timeout_error_handling():
    """scraper.get() should degrade gracefully when rendering times out."""
    # Patch the coroutine entry point so no browser is launched; the
    # TimeoutError must be converted into a placeholder page body.
    with patch('mark.scraper._render_page', side_effect=pyppeteer.errors.TimeoutError):
        page = scraper.get('https://timeout-test.com')

    assert page.body == 'Timeout while fetching page'
    assert page.title is None, "Expected no title when a TimeoutError occurs"
    assert page.url == 'https://timeout-test.com', "URL should be correct even when timeout occurs"