├── src
│   └── structured_qa
│       ├── __init__.py
│       ├── config.py
│       ├── preprocessing.py
│       ├── cli.py
│       ├── model_loaders.py
│       └── workflow.py
├── .gitattributes
├── example_data
│   ├── 1706.03762v7.pdf
│   └── config.yaml
├── images
│   ├── Blueprint-logo-black.png
│   ├── Blueprint-logo-white.png
│   └── structured-qa-diagram.png
├── docs
│   ├── images
│   │   ├── Blueprint-logo-black.png
│   │   ├── Blueprint-logo-white.png
│   │   ├── structured-qa-diagram.png
│   │   └── Blueprint-logo-black-flavicon.png
│   ├── api.md
│   ├── assets
│   │   └── custom.css
│   ├── cli.md
│   ├── getting-started.md
│   ├── future-features-contributions.md
│   ├── customization.md
│   ├── index.md
│   └── step-by-step-guide.md
├── .github
│   ├── setup.sh
│   ├── .devcontainer.json
│   ├── workflows
│   │   ├── tests.yaml
│   │   ├── lint.yaml
│   │   ├── release.yaml
│   │   ├── package.yaml
│   │   ├── docs.yaml
│   │   └── sync_hf_space.yaml
│   ├── pull_request_template.md
│   └── ISSUE_TEMPLATE
│       ├── feature_request.yaml
│       └── bug_report.yaml
├── tests
│   ├── conftest.py
│   └── unit
│       ├── test_preprocessing.py
│       └── test_workflow.py
├── demo
│   ├── README.md
│   ├── reboot_space.py
│   ├── Dockerfile
│   ├── app.py
│   └── notebook.ipynb
├── benchmark
│   └── perfect_context
│       ├── 4 OUR METHOD.txt
│       ├── OVERVIEW AND GOAL.txt
│       ├── 5.2 Hardware and Schedule.txt
│       ├── 5.3 Optimizer.txt
│       ├── 2.3 Optimizer.txt
│       ├── 4.1. Natural lighting.txt
│       ├── 3.4 Embeddings and Softmax.txt
│       ├── RAID.txt
│       ├── 3 Arithmetic Reasoning.txt
│       ├── 5.4 Regularization.txt
│       ├── 2.1. Toilets.txt
│       ├── 3 Model Architecture.txt
│       ├── CLEANUP PHASE.txt
│       ├── 5 Bias, Toxicity and Misinformation.txt
│       ├── CARBON MONOXIDE DETECTION AND VENTING.txt
│       ├── END OF THE GAME.txt
│       ├── GAME END.txt
│       ├── 1.2.1. Internal partitions and doors.txt
│       ├── CONQUERING MIDDLE-EARTH.txt
│       ├── ACTION PHASE.txt
│       ├── 3.2.2 Multi-Head Attention.txt
│       ├── 3.1 Encoder and Decoder Stacks.txt
│       ├── 5.2 CrowS-Pairs.txt
│       ├── 3.5 Positional Encoding.txt
│       ├── LORA ABSTRACT.txt
│       ├── LOOKOUT PHASE.txt
│       ├── CARD AND TILE EFFECTS.txt
│       ├── CARD AND TILE COSTS.txt
│       ├── Risk Perception.txt
│       ├── EXPEDITION PHASE.txt
│       ├── Europe.txt
│       ├── 2.4 Recurrent Networks.txt
│       ├── CHAPTER 5: WHERE DO YOU WANT TO MAKE YOUR WORK AVAILABLE?.txt
│       ├── LLM Tokenization Introduces Unfairness.txt
│       ├── 3 Main results.txt
│       ├── WHY ARE OPEN ACCESS POLICIES ADOPTED?.txt
│       ├── 15.3. API Fundamentals.txt
│       ├── 23.1. What is Lazy Loading?.txt
│       ├── CHAPTER OVERVIEW.txt
│       ├── 3.4 Robustness of Chain of Thought.txt
│       ├── Procurement in an emerging market.txt
│       ├── Training Cost.txt
│       ├── 6.1.1. Compilation Workflow.txt
│       ├── 3.2 Results.txt
│       ├── 3.1 Experimental Setup.txt
│       ├── Limitations of generative AI and LLMs.txt
│       ├── LOCATION ABILITIES.txt
│       ├── 5 Symbolic Reasoning.txt
│       ├── U.S. Regulation.txt
│       ├── SECTION I. INTRODUCTION.txt
│       ├── 1 INTRODUCTION.txt
│       ├── Accountability and responsibility.txt
│       ├── 5.2. Thread Hierarchy.txt
│       ├── HOW DO YOU CHOOSE AN OPEN ACCESS PUBLISHER?.txt
│       ├── 2.1 Pre-training Data.txt
│       ├── 3 Experimental Results.txt
│       ├── OVERCOMING RESERVATIONS ABOUT OPEN ACCESS.txt
│       └── Prohibited AI Practices.txt
├── Dockerfile
├── .pre-commit-config.yaml
├── pyproject.toml
├── mkdocs.yml
├── CONTRIBUTING.md
├── CODE_OF_CONDUCT.md
├── .gitignore
├── README.md
└── LICENSE
/src/structured_qa/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | benchmark/* linguist-vendored
2 |
--------------------------------------------------------------------------------
/example_data/1706.03762v7.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/example_data/1706.03762v7.pdf
--------------------------------------------------------------------------------
/images/Blueprint-logo-black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/images/Blueprint-logo-black.png
--------------------------------------------------------------------------------
/images/Blueprint-logo-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/images/Blueprint-logo-white.png
--------------------------------------------------------------------------------
/images/structured-qa-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/images/structured-qa-diagram.png
--------------------------------------------------------------------------------
/docs/images/Blueprint-logo-black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/docs/images/Blueprint-logo-black.png
--------------------------------------------------------------------------------
/docs/images/Blueprint-logo-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/docs/images/Blueprint-logo-white.png
--------------------------------------------------------------------------------
/docs/images/structured-qa-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/docs/images/structured-qa-diagram.png
--------------------------------------------------------------------------------
/.github/setup.sh:
--------------------------------------------------------------------------------
1 | python -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
2 | python -m pip install -e .
3 |
--------------------------------------------------------------------------------
/docs/images/Blueprint-logo-black-flavicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mozilla-ai/structured-qa/HEAD/docs/images/Blueprint-logo-black-flavicon.png
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 |
5 |
6 | @pytest.fixture(scope="session")
7 | def example_data():
8 | return Path(__file__).parent.parent / "example_data"
9 |
--------------------------------------------------------------------------------
/docs/api.md:
--------------------------------------------------------------------------------
1 | # API Reference
2 |
3 | ::: structured_qa.preprocessing
4 |
5 | ::: structured_qa.model_loaders
6 |
7 | ::: structured_qa.workflow
8 |
9 | ::: structured_qa.config.FIND_PROMPT
10 |
11 | ::: structured_qa.config.ANSWER_PROMPT
12 |
--------------------------------------------------------------------------------
/demo/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Structured Qa
3 | emoji: 📚
4 | colorFrom: green
5 | colorTo: purple
6 | sdk: docker
7 | app_port: 8501
8 | pinned: false
9 | license: apache-2.0
10 | short_description: Question answering for structured documents
11 | ---
12 |
--------------------------------------------------------------------------------
/docs/assets/custom.css:
--------------------------------------------------------------------------------
1 | @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
2 |
3 | :root {
4 | --md-default-font: "Inter", sans-serif;
5 | --md-code-font: "Fira Code", monospace;
6 | --md-primary-font: "Inter", sans-serif;
7 | }
8 |
--------------------------------------------------------------------------------
/demo/reboot_space.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from huggingface_hub import HfApi
4 |
5 | if __name__ == "__main__":
6 | api = HfApi()
7 | api.restart_space(
8 | repo_id="mozilla-ai/structured-qa",
9 | token=os.getenv("HF_TOKEN"),
10 | factory_reboot=True,
11 | )
12 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/4 OUR METHOD.txt:
--------------------------------------------------------------------------------
1 | We describe the simple design of LoRA and its practical benefits. The principles outlined here apply
2 | to any dense layers in deep learning models, though we only focus on certain weights in Transformer
3 | language models in our experiments as the motivating use case.
4 |
--------------------------------------------------------------------------------
/.github/.devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Python Development Container",
3 | "image": "mcr.microsoft.com/devcontainers/base:ubuntu",
4 | "features": {
5 | "ghcr.io/devcontainers/features/python": {
6 | "version": "latest"
7 | }
8 | },
9 | "postCreateCommand": "pip install -e ."
10 | }
11 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/OVERVIEW AND GOAL.txt:
--------------------------------------------------------------------------------
1 | A game plays over 3 successive chapters that unfold similarly.
2 | On your turn, strengthen your Skills, hoard your treasure, stretch your presence across Middle-earth,
3 | rally Races to your cause, or advance the Quest of the Ring.
4 | Immediately win the game by fulfilling one of the 3 victory conditions:
5 | complete the Quest of the Ring,
6 | rally the support of 6 different Races,
7 | conquer Middle-earth
8 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/5.2 Hardware and Schedule.txt:
--------------------------------------------------------------------------------
1 | We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using
2 | the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We
3 | trained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the
4 | bottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps
5 | (3.5 days).
6 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/5.3 Optimizer.txt:
--------------------------------------------------------------------------------
1 | We used the Adam optimizer [20] with β1 = 0.9, β2 = 0.98 and ϵ = 10^(-9). We varied the learning
2 | rate over the course of training, according to the formula:
3 | lrate = d_model^(-0.5) · min(step_num^(-0.5), step_num · warmup_steps^(-1.5))    (3)
5 | This corresponds to increasing the learning rate linearly for the first warmup_steps training steps,
6 | and decreasing it thereafter proportionally to the inverse square root of the step number. We used
7 | warmup_steps = 4000.
8 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/2.3 Optimizer.txt:
--------------------------------------------------------------------------------
1 | Our models are trained using the AdamW opti-
2 | mizer (Loshchilov and Hutter, 2017), with the fol-
3 | lowing hyper-parameters: β1 = 0.9, β2 = 0.95.
4 | We use a cosine learning rate schedule, such that
5 | the final learning rate is equal to 10% of the maxi-
6 | mal learning rate. We use a weight decay of 0.1 and
7 | gradient clipping of 1.0. We use 2,000 warmup
8 | steps, and vary the learning rate and batch size with
9 | the size of the model (see Table 2 for details).
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim
2 |
3 | RUN pip3 install --no-cache-dir --upgrade pip
4 | RUN apt-get update && apt-get install -y \
5 | build-essential \
6 | software-properties-common \
7 | git
8 |
9 |
10 | COPY . /home/appuser/structured_qa
11 | WORKDIR /home/appuser/structured_qa
12 |
13 | RUN pip3 install /home/appuser/structured_qa
14 |
15 | RUN groupadd --gid 1000 appuser \
16 | && useradd --uid 1000 --gid 1000 -ms /bin/bash appuser
17 |
18 | USER appuser
19 |
20 | EXPOSE 8501
21 | ENTRYPOINT ["./demo/run.sh"]
22 |
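23 | # Example local usage (tag is illustrative; CI publishes this image as mzdotai/structured_qa,
24 | # see .github/workflows/package.yaml):
25 | #   docker build -t structured_qa .
26 | #   docker run -p 8501:8501 structured_qa
27 |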
--------------------------------------------------------------------------------
/benchmark/perfect_context/4.1. Natural lighting.txt:
--------------------------------------------------------------------------------
1 | Natural light is required for all permanent work stations and for restaurants.
2 | As a general rule, preference should be given to the use of natural lighting; at least 80% of the
3 | surface area of offices should have a daylight factor (according to the International Commission
4 | on Illumination (CIE)) of 1.5% for façades without exterior obstructions, and 0.7% for other
5 | façades16.
6 | See Section I.2.4. Electricity and lighting, paragraph 1 on the conditions which apply to
7 | artificial lighting.
8 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3.4 Embeddings and Softmax.txt:
--------------------------------------------------------------------------------
1 | Similarly to other sequence transduction models, we use learned embeddings to convert the input
2 | tokens and output tokens to vectors of dimension dmodel. We also use the usual learned linear transfor-
3 | mation and softmax function to convert the decoder output to predicted next-token probabilities. In
4 | our model, we share the same weight matrix between the two embedding layers and the pre-softmax
5 | linear transformation, similar to [ 30 ]. In the embedding layers, we multiply those weights by √dmodel.
6 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/RAID.txt:
--------------------------------------------------------------------------------
1 | This action allows a player to Raid an opponent’s Action Location
2 | card to immediately exhaust it so that the opponent cannot use its
3 | ability until unexhausted.
4 | A player can only use this action if he has a Raze token. Only Action
5 | Locations may be Raided. Remember that you cannot Raid an opponent
6 | that has already passed
7 | TO RAID AN OPPONENT’S LOCATION:
8 | > Choose an opponent and any one of their unexhausted Action
9 | Locations.
10 | > Discard 1 token.
11 | > Exhaust the opponent’s Location (Rotate it 90 degrees to
12 | the right).
13 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3 Arithmetic Reasoning.txt:
--------------------------------------------------------------------------------
1 | We begin by considering math word problems of the form in Figure 1, which measure the arithmetic
2 | reasoning ability of language models. Though simple for humans, arithmetic reasoning is a task where
3 | language models often struggle (Hendrycks et al., 2021; Patel et al., 2021, inter alia). Strikingly, chain-
4 | of-thought prompting when used with the 540B parameter language model performs comparably with
5 | task-specific finetuned models on several tasks, even achieving new state of the art on the challenging
6 | GSM8K benchmark (Cobbe et al., 2021).
7 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/5.4 Regularization.txt:
--------------------------------------------------------------------------------
1 | We employ three types of regularization during training:
2 | Residual Dropout We apply dropout [ 33] to the output of each sub-layer, before it is added to the
3 | sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the
4 | positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of
5 | Pdrop = 0.1.
6 | Label Smoothing During training, we employed label smoothing of value ϵls = 0.1 [ 36 ]. This
7 | hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score
8 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/2.1. Toilets.txt:
--------------------------------------------------------------------------------
1 | Toilets must be installed on each level containing office rooms and for each structural unit;
2 | they must be distributed uniformly and located in a central area.
3 | Sinks must be supplied exclusively with cold water.
4 | Accessibility for persons with reduced mobility (PRM)
5 | In the event that a new office building is constructed upon request by the Commission, one
6 | toilet which is accessible for persons with reduced mobility must be installed on each level
7 | containing office rooms or similar.
8 | In other cases, the requirements of the applicable legislation must be observed
9 |
--------------------------------------------------------------------------------
/docs/cli.md:
--------------------------------------------------------------------------------
1 | # Command Line Interface
2 |
3 | Once you have [installed the blueprint](./getting-started.md), you can use it from the CLI.
4 |
5 | You can either provide the path to a configuration file:
6 |
7 | ```bash
8 | structured-qa --from_config "example_data/config.yaml"
9 | ```
10 |
11 | Or provide values to the arguments directly:
12 |
13 |
14 | ```bash
15 | structured-qa \
16 | --question "What learning rate was used?" \
17 | --input_file "example_data/1706.03762v7.pdf" \
18 | --output_dir "example_outputs/1706.03762v7.pdf"
19 | ```
20 |
21 | ---
22 |
23 | ::: structured_qa.cli.structured_qa
24 |
25 | ---
26 |
27 | ::: structured_qa.config.Config
28 |
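29 | For reference, a configuration file mirrors the CLI arguments. Below is an abridged sketch based on
30 | `example_data/config.yaml` (the `find_prompt` and `answer_prompt` templates it also defines are
31 | omitted here):
32 |
33 | ```yaml
34 | question: "What optimizer was used to train the model?"
35 | input_file: example_data/1706.03762v7.pdf
36 | output_dir: example_outputs/1706.03762v7.pdf
37 | model: "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
38 | ```
39 |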
--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | paths:
7 | - 'src/**'
8 | - 'tests/**'
9 | pull_request:
10 | paths:
11 | - 'src/**'
12 | - 'tests/**'
13 | workflow_dispatch:
14 |
15 | jobs:
16 | run-tests:
17 | timeout-minutes: 30
18 | runs-on: ubuntu-latest
19 |
20 | steps:
21 | - name: Check out the repository
22 | uses: actions/checkout@v4
23 |
24 | - name: Set up Python
25 | uses: actions/setup-python@v5
26 | with:
27 | python-version: '3.10'
28 | cache: "pip"
29 |
30 | - name: Install
31 | run: pip install -e '.[tests]'
32 |
33 | - name: Run tests
34 | run: pytest -v tests
35 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3 Model Architecture.txt:
--------------------------------------------------------------------------------
1 | Most competitive neural sequence transduction models have an encoder-decoder structure [ 5, 2 , 35].
2 | Here, the encoder maps an input sequence of symbol representations (x1, ..., xn) to a sequence
3 | of continuous representations z = (z1, ..., zn). Given z, the decoder then generates an output
4 | sequence (y1, ..., ym) of symbols one element at a time. At each step the model is auto-regressive
5 | [10], consuming the previously generated symbols as additional input when generating the next.
6 |
7 | The Transformer follows this overall architecture using stacked self-attention and point-wise, fully
8 | connected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,
9 | respectively.
10 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/CLEANUP PHASE.txt:
--------------------------------------------------------------------------------
1 | NOTE: Skip the Cleanup phase in the final round.
2 | In this phase:
3 | > Players take all Workers from their Clan tile back to their
4 | supply.
5 | > Players unexhaust all cards (rotate them to their initial position)
6 | in front of them.
7 | > Players take back their Clan action pawns from Action tiles.
8 | > Discard any remaining, face-up Island cards and reveal new ones.
9 | > Pass the First player marker to the next player in clockwise order.
10 | > If no one has reached or passed the 25 on the scoreboard
11 | during the Action phase, begin a new round. If anyone reached
12 | 25 or more during the Expedition phase, the next round will
13 | be the last one.
14 | IMPORTANT: Goods are never discarded at the end of the round
15 |
--------------------------------------------------------------------------------
/demo/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:12.2.2-cudnn8-devel-ubuntu22.04
2 |
3 | RUN apt-get update && apt-get install --no-install-recommends -y \
4 | build-essential \
5 | python3.10 \
6 | python3.10-dev \
7 | python3-pip \
8 | git \
9 | && apt-get clean && rm -rf /var/lib/apt/lists/*
10 |
11 | RUN useradd -m -u 1000 user
12 |
13 | USER user
14 |
15 | ENV HOME=/home/user \
16 | PATH=/home/user/.local/bin:$PATH
17 |
18 | WORKDIR $HOME/app
19 |
20 | RUN pip3 install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp310-cp310-linux_x86_64.whl
21 | RUN pip3 install structured-qa
22 |
23 | COPY --chown=user . $HOME/app
24 |
25 | EXPOSE 8501
26 | ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
27 |
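28 | # Example local usage (tag and --gpus flag are illustrative; the image targets a CUDA 12.2 host):
29 | #   docker build -t structured-qa-demo .
30 | #   docker run --gpus all -p 8501:8501 structured-qa-demo
31 |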
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | # What's changing
2 |
3 | Provide a clear and concise description of the content changes you're proposing. List all the
4 | changes you are making to the content.
5 |
6 | > If this PR is related to an issue or closes one, please link it here.
7 |
8 | Refs #...
9 | Closes #...
10 |
11 | # How to test it
12 |
13 | Steps to test the changes:
14 |
15 | 1.
16 | 2.
17 | 3.
18 |
19 | # Additional notes for reviewers
20 |
21 | Anything you'd like to add to help the reviewer understand the changes you're proposing.
22 |
23 | # I already...
24 |
25 | - [ ] Tested the changes in a working environment to ensure they work as expected
26 | - [ ] Added some tests for any new functionality
27 | - [ ] Updated the documentation (both comments in code and under `/docs`)
28 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
1 | name: Lint
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | workflow_dispatch:
8 |
9 | jobs:
10 | run-linter:
11 | timeout-minutes: 30
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - name: Check out the repository
16 | uses: actions/checkout@v4
17 |
18 | - name: Set up Python
19 | uses: actions/setup-python@v5
20 | with:
21 | python-version: '3.10'
22 |
23 | - name: Install pre-commit
24 | run: pip install pre-commit
25 |
26 | - uses: actions/cache@v4
27 | with:
28 | path: ~/.cache/pre-commit/
29 | key: pre-commit-4|${{ env.pythonLocation }}|${{ hashFiles('.pre-commit-config.yaml') }}
30 |
31 | - name: pre-commit
32 | run: pre-commit run --all-files
33 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/5 Bias, Toxicity and Misinformation.txt:
--------------------------------------------------------------------------------
1 | Large language models have been showed to re-
2 | produce and amplify biases that are existing in
3 | the training data (Sheng et al., 2019; Kurita et al.,
4 | 2019), and to generate toxic or offensive con-
5 | tent (Gehman et al., 2020). As our training dataset
6 | contains a large proportion of data from the Web,
7 | we believe that it is crucial to determine the po-
8 | tential for our models to generate such content.
9 | To understand the potential harm of LLaMA-65B,
10 | we evaluate on different benchmarks that measure
11 | toxic content production and stereotypes detection.
12 | While we have selected some of the standard bench-
13 | marks that are used by the language model com-
14 | munity to indicate some of the issues with these
15 | models, these evaluations are not sufficient to fully
16 | understand the risks associated with these models.
17 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/CARBON MONOXIDE DETECTION AND VENTING.txt:
--------------------------------------------------------------------------------
1 | Operating safety
2 | A carbon monoxide (CO) detector must be installed in closed car parks (indoor or
3 | underground) in accordance with the following requirements:
4 | - the number of carbon monoxide detectors on each level must be sufficient to cover the
5 | entire area of the car park;
6 | - the system must allow automatic control of blower and/or extraction fans and audible and
7 | light indications, on the basis of thresholds stipulated by the Commission;
8 | - the carbon monoxide detection control panel must be fitted with a stand-alone power
9 | source in the form of an integrated battery (providing at least one hour of power).
10 | Maintenance and management
11 | Remote management
12 | The carbon monoxide detection control panel must be linked to the centralised technical
13 | management system GTC (control panel alarms and malfunctions).
14 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v5.0.0
4 | hooks:
5 | - id: check-added-large-files
6 | exclude: example_data
7 | - id: check-case-conflict
8 | - id: check-json
9 | - id: check-merge-conflict
10 | args: ['--assume-in-merge']
11 | - id: check-toml
12 | - id: check-yaml
13 | - id: end-of-file-fixer
14 | - id: mixed-line-ending
15 | args: ['--fix=lf']
16 | - id: sort-simple-yaml
17 | - id: trailing-whitespace
18 | - repo: https://github.com/astral-sh/ruff-pre-commit
19 | rev: 'v0.9.4'
20 | hooks:
21 | - id: ruff
22 | args: [--fix, --exit-non-zero-on-fix]
23 | - id: ruff-format
24 | - repo: https://github.com/codespell-project/codespell
25 | rev: v2.4.1
26 | hooks:
27 | - id: codespell
28 | exclude: CODE_OF_CONDUCT.md|benchmark
29 |
--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | release:
5 | types: [published]
6 | workflow_dispatch:
7 |
8 | jobs:
9 | release:
10 | environment: pypi
11 | permissions:
12 | contents: read
13 | id-token: write
14 | runs-on: ubuntu-latest
15 | steps:
16 | - name: Check out the repository
17 | uses: actions/checkout@v4
18 | with:
19 | fetch-depth: 0
20 |
21 | - name: Set up Python
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: '3.10'
25 |
26 | - name: Upgrade pip
27 | run: |
28 | pip install --upgrade pip
29 | pip --version
30 |
31 | - name: Install
32 | run: python -m pip install build setuptools
33 |
34 | - name: Build package
35 | run: python -m build
36 |
37 | - name: Upload package
38 | if: github.event_name == 'release'
39 | uses: pypa/gh-action-pypi-publish@release/v1
40 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/END OF THE GAME.txt:
--------------------------------------------------------------------------------
1 | There are three immediate victory conditions:
2 | Quest of the Ring
3 | For the Fellowship: If Frodo and Sam reach Mount Doom, they destroy the One Ring and you immediately
4 | win the game.
5 | For Sauron: If the Nazgûl catch Frodo and Sam, they seize the One Ring and you immediately win the game.
6 | Support of the Races
7 | If you gather 6 different Race symbols on your Green cards, you rally the support of the Races
8 | of Middle-earth and immediately win the game.
9 | Note: The Eagle symbol, present on one Alliance token, is an additional Race symbol that counts as 1 of the 6 required
10 | symbols for the Support of the Races victory.
11 | Conquering Middle-earth
12 | If you are present in all 7 regions (with a Fortress and/or at least 1 Unit), you dominate Middle-earth
13 | and immediately win the game.
14 | If none of these three victory conditions are achieved by the end of chapter 3, the player who is present in the most
15 | regions of Middle-earth (with a Fortress and/or at least 1 Unit) wins the game. In case of tie, share the victory.
16 |
--------------------------------------------------------------------------------
/.github/workflows/package.yaml:
--------------------------------------------------------------------------------
1 | name: Create and publish a Docker image
2 |
3 | on:
4 | push:
5 | branches:
6 | - 'main'
7 | tags:
8 | - 'v*'
9 | workflow_dispatch:
10 |
11 | jobs:
12 | build-and-push-image:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - name: Checkout repository
16 | uses: actions/checkout@v4
17 |
18 | - name: Log in to DockerHub
19 | uses: docker/login-action@v3
20 | with:
21 | username: ${{ secrets.DOCKERHUB_USERNAME }}
22 | password: ${{ secrets.DOCKERHUB_TOKEN }}
23 |
24 | - name: Extract metadata (tags, labels) for Docker
25 | id: meta
26 | uses: docker/metadata-action@v5
27 | with:
28 | images: mzdotai/structured_qa
29 | flavor: |
30 | latest=auto
31 |
32 | - name: Build and push Docker image
33 | id: push
34 | uses: docker/build-push-action@v6
35 | with:
36 | context: .
37 | push: true
38 | tags: ${{ steps.meta.outputs.tags }}
39 | labels: ${{ steps.meta.outputs.labels }}
40 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/GAME END.txt:
--------------------------------------------------------------------------------
1 | The game continues until a player reaches or passes the 25 space
2 | on the scoreboard during the Action phase. Once that happens,
3 | the final round is triggered and the game will end at the end of the
4 | current round.
5 | To calculate the final score, each player should:
6 | > add 1 Victory Point for each card within their Empire to their
7 | current score (including Basic Fields, Field upgrades, and
8 | Conquered Islands),
9 | > add 1 Victory Point for every 2 Resources remaining in their
10 | supply (Resources assigned to cards are not counted towards
11 | this scoring).
12 | > add 1 Victory Point for every 1 Gold remaining in their supply
13 | (Gold tokens assigned to cards are not counted).
14 | The player with the most Victory Points is the winner.
15 | TIES
16 | In case of a tie, the player with the most Locations in their Empire
17 | wins. If the players are still tied, the winner is the player with the
18 | most Workers. If still tied, the winner is the player with the most
19 | cards left in their hand. If there is still a tie, the tied players share
20 | the victory!
21 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=48", "setuptools_scm[toml]>=6.3.1"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "structured-qa"
7 | readme = "README.md"
8 | license = {text = "Apache-2.0"}
9 | requires-python = ">=3.10"
10 | dynamic = ["version"]
11 | dependencies = [
12 | "fire",
13 | "huggingface-hub",
14 | "llama-cpp-python",
15 | "loguru",
16 | "pydantic",
17 | "pymupdf4llm",
18 | "pyyaml",
19 | "rapidfuzz",
20 | "streamlit",
21 | ]
22 |
23 | [project.optional-dependencies]
24 | docs = [
25 | "mkdocs",
26 | "mkdocs-material",
27 | "mkdocstrings-python",
28 | ]
29 |
30 | tests = [
31 | "pytest>=8,<9",
32 | "pytest-sugar>=0.9.6",
33 | "pytest-mock>=3.14.0"
34 | ]
35 |
36 | [project.urls]
37 | Documentation = "https://mozilla-ai.github.io/structured-qa/"
38 | Issues = "https://github.com/mozilla-ai/structured-qa/issues"
39 | Source = "https://github.com/mozilla-ai/structured-qa"
40 |
41 | [tool.setuptools.packages.find]
42 | exclude = ["tests", "tests.*"]
43 | where = ["src"]
44 | namespaces = false
45 |
46 | [tool.setuptools_scm]
47 |
48 | [project.scripts]
49 | structured-qa = "structured_qa.cli:main"
50 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/1.2.1. Internal partitions and doors.txt:
--------------------------------------------------------------------------------
1 | Fire resistance of partitions:
2 | - Copy rooms must have vertical partitions with a fire resistance of EI 30 and doors must have
3 | a fire resistance of EI1 30 and close automatically (linked to the fire detection system).
4 | Door retaining devices:
5 | - Certain fire doors for rooms which are accessed or traversed very frequently are kept open
6 | using magnetic retainers linked to the fire detection system (e.g. entrance halls and lift
7 | lobbies, corridor compartment doors, kitchenette doors and doors of copy rooms).
8 | - As a minimum, rooms accommodating kitchenettes must have doors which close
9 | automatically (linked to the fire detection system).
10 | Door closers:
11 | - In addition to the requirements set out in the applicable legislation, access doors to
12 | toilets/washrooms, kitchenettes, copy rooms, etc. must also be fitted with door closers.
13 | Horizontal communication between two buildings:
14 | - In the case of doors forming an airlock between two buildings, an intermittent red light
15 | signal should be placed above or beside the door frames. This signal should light up on the
16 | non-dangerous side to indicate the danger when the alarm is raised.
17 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/CONQUERING MIDDLE-EARTH.txt:
--------------------------------------------------------------------------------
1 | When you place or move one or more Units, two situations are possible:
2 | If no enemy Unit is present: nothing happens.
3 | If one or more enemy Units are present: trigger a conflict. Each player removes one of their Units
4 | and places it back in front of them. Repeat this until at least one player has no more Units in the region.
5 | Note: An enemy Fortress does not trigger conflicts and does not prevent you from placing your Units in its region.
6 | Therefore, it is possible for both players to be present in the same region.
7 |
8 | When you complete multiple movements, you may move the same Unit multiple times, or split your movement
9 | between multiple Units. For each movement, move a Unit to an adjacent region (one with a connection).
10 | You must complete each movement independently, resolving any conflict triggered, one at a time.
11 |
12 | Example: You play a Purple card that provides 3 movements 1 . The first lets you move a Unit from Enedwaith to Rohan 2 .
13 | Since there is an enemy Unit present, you trigger a conflict and each player removes their Unit. You then use
14 | your second and third movements to send another Unit from Enedwaith to Mordor, passing through Rohan 3
15 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yaml:
--------------------------------------------------------------------------------
1 | name: Documentation
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | paths:
7 | - mkdocs.yml
8 | - 'docs/**'
9 | - 'src/**'
10 | pull_request:
11 | paths:
12 | - mkdocs.yml
13 | - 'docs/**'
14 | - 'src/**'
15 | workflow_dispatch:
16 |
17 | jobs:
18 | docs:
19 | permissions:
20 | contents: write
21 | runs-on: ubuntu-latest
22 | steps:
23 | - name: Check out the repository
24 | uses: actions/checkout@v4
25 | with:
26 | fetch-depth: 0
27 |
28 | - name: Set up Python
29 | uses: actions/setup-python@v5
30 | with:
31 | python-version: '3.10'
32 | cache: "pip"
33 |
34 | - name: Configure git
35 | run: |
36 | git config user.name 'github-actions[bot]'
37 | git config user.email 'github-actions[bot]@users.noreply.github.com'
38 |
39 | - name: Install requirements
40 | run: pip install -e '.[docs]'
41 |
42 | - name: Build docs
43 | if: github.event_name == 'pull_request'
44 | run: mkdocs build -s
45 |
46 | - name: Publish docs
47 | if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
48 | run: mkdocs gh-deploy
49 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/ACTION PHASE.txt:
--------------------------------------------------------------------------------
1 | This is the main phase of the game in which players take
2 | their actions to build Locations, use their Clan action
3 | pawn to activate Action tiles, and use actions of different
4 | Locations.
5 | Starting with the First player and continuing clockwise,
6 | each player performs one action at a time. Each player can
7 | take any available action or pass.
8 | AVAILABLE ACTIONS:
9 | > Build a Location
10 | > Use a Clan action pawn
11 | > Raid an opponent
12 | > Use an action from a Location
13 |
14 | All actions are described in detail in a separate chapter
15 | (see pages 9-11).
16 | > Once a player passes in the Action phase they cannot
17 | perform any additional actions in the current round.
18 | Neither can they be targeted by the actions of other
19 | players. For example, a player who has passed cannot
20 | have any of his Locations Raided.
21 | > There is no limit to the number, type, or order of
22 | actions a player may take during the Action phase, so
23 | long as they perform them one action at a time.
24 | > The Action phase continues until all players have
25 | passed. If any player has reached or passed the 25
26 | during this phase, the last round is triggered and the
27 | game will end upon the completion of the Expedition
28 | phase of this round.
29 |
--------------------------------------------------------------------------------
/example_data/config.yaml:
--------------------------------------------------------------------------------
1 | question: "What optimizer was used to train the model?"
2 | input_file: example_data/1706.03762v7.pdf
3 | output_dir: example_outputs/1706.03762v7.pdf
4 | model: "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
5 | find_prompt: |
6 | You are given two pieces of information:
7 | 1. A list of valid section names.
8 | 2. A user question.
9 |
10 | Your task is to:
11 | - Identify exactly one `section_name` from the provided list that seems related to the user question.
12 | - Return the `section_name` exactly as it appears in the list.
13 | - Do NOT answer the question.
14 | - Do NOT return any additional text, explanation, or formatting.
15 | - Do NOT combine multiple section names into a single response.
16 |
17 | Here is the list of valid section names:
18 |
19 | ```
20 | {SECTIONS}
21 | ```
22 |
23 | Now, based on the following question, return the single most relevant `section_name` from the list.
24 | answer_prompt: |
25 | You are a rigorous assistant answering questions.
26 | You must only answer based on the current information available which is:
27 |
28 | ```
29 | {CURRENT_INFO}
30 | ```
31 |
32 | If the current information available is not enough to answer the question,
33 | you must return "I need more info" and nothing else.
34 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3.2.2 Multi-Head Attention.txt:
--------------------------------------------------------------------------------
1 | Instead of performing a single attention function with dmodel-dimensional keys, values and queries,
2 | we found it beneficial to linearly project the queries, keys and values h times with different, learned
3 | linear projections to dk, dk and dv dimensions, respectively. On each of these projected versions of
4 | queries, keys and values we then perform the attention function in parallel, yielding dv -dimensional
5 | output values. These are concatenated and once again projected, resulting in the final values, as
6 | depicted in Figure 2.
7 | Multi-head attention allows the model to jointly attend to information from different representation
8 | subspaces at different positions. With a single attention head, averaging inhibits this.
9 | MultiHead(Q, K, V) = Concat(head_1, ..., head_h) W^O
10 |     where head_i = Attention(Q W_i^Q, K W_i^K, V W_i^V)
11 | Where the projections are parameter matrices W_i^Q ∈ R^(dmodel×dk), W_i^K ∈ R^(dmodel×dk),
12 | W_i^V ∈ R^(dmodel×dv), and W^O ∈ R^(h·dv×dmodel).
19 | In this work we employ h = 8 parallel attention layers, or heads. For each of these we use
20 | dk = dv = dmodel/h = 64. Due to the reduced dimension of each head, the total computational cost
21 | is similar to that of single-head attention with full dimensionality
22 |
--------------------------------------------------------------------------------
/.github/workflows/sync_hf_space.yaml:
--------------------------------------------------------------------------------
1 | name: Sync to Hugging Face Space
2 |
3 | on:
4 | release:
5 | types: [published]
6 |
7 | workflow_dispatch:
8 |
9 | jobs:
10 | sync-space:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v4
14 | with:
15 | fetch-depth: 0
16 |
17 | - run: git clone https://${{ secrets.HF_USERNAME }}:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/mozilla-ai/structured-qa hf-space
18 |
19 | - run: |
20 | cp demo/app.py hf-space/app.py
21 | cp demo/Dockerfile hf-space/Dockerfile
22 |
23 | - run: |
24 | cd hf-space
25 | git config user.name 'github-actions[bot]'
26 | git config user.email 'github-actions[bot]@users.noreply.github.com'
27 | git add .
28 | git commit -m "Sync with https://github.com/mozilla-ai/structured-qa"
29 |
30 | - name: Push to Hugging Face
31 | run: |
32 | cd hf-space
33 | git push https://${{ secrets.HF_USERNAME }}:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/mozilla-ai/structured-qa main
34 |
35 | - name: Reboot Space
36 | if: always()
37 | env:
38 | HF_TOKEN: ${{ secrets.HF_TOKEN }}
39 | run: |
40 | pip install huggingface_hub
41 | python demo/reboot_space.py
42 |
--------------------------------------------------------------------------------
/tests/unit/test_preprocessing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from structured_qa.preprocessing import split_markdown_by_headings
4 | from structured_qa.preprocessing import document_to_sections_dir
5 |
6 |
7 | def test_document_to_sections_dir(tmp_path, example_data):
8 | output_dir = tmp_path / "output"
9 | document_to_sections_dir(example_data / "1706.03762v7.pdf", output_dir)
10 | sections = list(output_dir.iterdir())
11 | assert all(section.is_file() and section.suffix == ".txt" for section in sections)
12 | assert len(sections) == 12
13 |
14 |
15 | DEFAULT_HEADINGS = """
16 | # Introduction
17 |
18 | This is the introduction.
19 |
20 | ## Related Work
21 |
22 | This is the related work.
23 |
24 | ### Method
25 |
26 | This is the method.
27 | """
28 |
29 | NUMERIC_HEADINGS = """
30 | **1.** **Introduction**
31 |
32 | This is the introduction.
33 |
34 | **2.** **Related Work**
35 |
36 | This is the related work.
37 |
38 | **2.1** **Method**
39 |
40 | This is the method.
41 | """
42 |
43 |
44 | @pytest.mark.parametrize(
45 | ("markdown_text", "n_sections"),
46 | (
47 | (DEFAULT_HEADINGS, 3),
48 | (NUMERIC_HEADINGS, 2),
49 | ),
50 | )
51 | def test_split_markdown_by_headings(markdown_text, n_sections):
52 | sections = split_markdown_by_headings(markdown_text)
53 | assert len(sections) == n_sections
54 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3.1 Encoder and Decoder Stacks.txt:
--------------------------------------------------------------------------------
1 | Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two
2 | sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-
3 | wise fully connected feed-forward network. We employ a residual connection [ 11 ] around each of
4 | the two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is
5 | LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer
6 | itself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding
7 | layers, produce outputs of dimension dmodel = 512.
8 |
9 | Decoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two
10 | sub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head
11 | attention over the output of the encoder stack. Similar to the encoder, we employ residual connections
12 | around each of the sub-layers, followed by layer normalization. We also modify the self-attention
13 | sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This
14 | masking, combined with fact that the output embeddings are offset by one position, ensures that the
15 | predictions for position i can depend only on the known outputs at positions less than i.
16 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/5.2 CrowS-Pairs.txt:
--------------------------------------------------------------------------------
1 | LLaMA GPT3 OPT
2 | Gender 70.6 62.6 65.7
3 | Religion 79.0 73.3 68.6
4 | Race/Color 57.0 64.7 68.6
5 | Sexual orientation 81.0 76.2 78.6
6 | Age 70.1 64.4 67.8
7 | Nationality 64.2 61.6 62.9
8 | Disability 66.7 76.7 76.7
9 | Physical appearance 77.8 74.6 76.2
10 | Socioeconomic status 71.5 73.8 76.2
11 | Average 66.6 67.2 69.5
12 | Table 12: CrowS-Pairs. We compare the level of bi-
13 | ases contained in LLaMA-65B with OPT-175B and
14 | GPT3-175B. Higher score indicates higher bias.
15 | 5.2 CrowS-Pairs
16 | We evaluate the biases in our model on the CrowS-
17 | Pairs (Nangia et al., 2020). This dataset allows to
18 | measure biases in 9 categories: gender, religion,
19 | race/color, sexual orientation, age, nationality, dis-
20 | ability, physical appearance and socioeconomic sta-
21 | tus. Each example is composed of a stereotype and
22 | an anti-stereotype, we measure the model prefer-
23 | ence for the stereotypical sentence using the per-
24 | plexity of both sentences in a zero-shot setting.
25 | Higher scores thus indicate higher bias. We com-
26 | pare with GPT-3 and OPT-175B in Table 12.
27 | LLaMA compares slightly favorably to both
28 | models on average. Our model is particularly bi-
29 | ased in the religion category (+10% compared to
30 | OPT-175B), followed by age and gender. We ex-
31 | pect these biases to come from CommonCrawl de-
32 | spite multiple filtering steps.
33 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: Structured QA
2 | repo_url: https://github.com/mozilla-ai/structured-qa
3 | repo_name: structured-qa
4 |
5 | nav:
6 | - Home: index.md
7 | - Getting Started: getting-started.md
8 | - Step-by-Step Guide: step-by-step-guide.md
9 | - Command Line Interface: cli.md
10 | - API Reference: api.md
11 | - Customization Guide: customization.md
12 | - Future Features & Contributions: future-features-contributions.md
13 |
14 | theme:
15 | icon:
16 | repo: fontawesome/brands/github
17 | name: material
18 | palette:
19 | - scheme: default
20 | primary: black
21 | toggle:
22 | icon: material/lightbulb
23 | name: Switch to dark mode
24 | - scheme: slate
25 | primary: grey
26 | toggle:
27 | icon: material/lightbulb-outline
28 | name: Switch to light mode
29 | logo: images/Blueprint-logo-white.png
30 | favicon: images/Blueprint-logo-black-flavicon.png
31 | extra_css:
32 | - assets/custom.css
33 |
34 | markdown_extensions:
35 | - pymdownx.highlight:
36 | anchor_linenums: true
37 | line_spans: __span
38 | pygments_lang_class: true
39 | - pymdownx.inlinehilite
40 | - pymdownx.snippets
41 | - pymdownx.superfences
42 | - pymdownx.tabbed:
43 | alternate_style: true
44 | plugins:
45 | - search
46 | - mkdocstrings:
47 | handlers:
48 | python:
49 | options:
50 | show_root_heading: true
51 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3.5 Positional Encoding.txt:
--------------------------------------------------------------------------------
1 | Since our model contains no recurrence and no convolution, in order for the model to make use of the
2 | order of the sequence, we must inject some information about the relative or absolute position of the
3 | tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the
4 | bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel
5 | as the embeddings, so that the two can be summed. There are many choices of positional encodings,
6 | learned and fixed [9].
7 | In this work, we use sine and cosine functions of different frequencies:
8 | PE(pos, 2i)   = sin(pos / 10000^(2i/dmodel))
9 | PE(pos, 2i+1) = cos(pos / 10000^(2i/dmodel))
10 | where pos is the position and i is the dimension. That is, each dimension of the positional encoding
11 | corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We
12 | chose this function because we hypothesized it would allow the model to easily learn to attend by
13 | relative positions, since for any fixed offset k, PE(pos+k) can be represented as a linear function of
14 | PE(pos).
15 | We also experimented with using learned positional embeddings [9] instead, and found that the two
16 | versions produced nearly identical results (see Table 3 row (E)). We chose the sinusoidal version
17 | because it may allow the model to extrapolate to sequence lengths longer than the ones encountered
18 | during training.
19 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/LORA ABSTRACT.txt:
--------------------------------------------------------------------------------
1 | An important paradigm of natural language processing consists of large-scale pre-
2 | training on general domain data and adaptation to particular tasks or domains. As
3 | we pre-train larger models, full fine-tuning, which retrains all model parameters,
4 | becomes less feasible. Using GPT-3 175B as an example – deploying indepen-
5 | dent instances of fine-tuned models, each with 175B parameters, is prohibitively
6 | expensive. We propose Low-Rank Adaptation, or LoRA, which freezes the pre-
7 | trained model weights and injects trainable rank decomposition matrices into each
8 | layer of the Transformer architecture, greatly reducing the number of trainable pa-
9 | rameters for downstream tasks. Compared to GPT-3 175B fine-tuned with Adam,
10 | LoRA can reduce the number of trainable parameters by 10,000 times and the
11 | GPU memory requirement by 3 times. LoRA performs on-par or better than fine-
12 | tuning in model quality on RoBERTa, DeBERTa, GPT-2, and GPT-3, despite hav-
13 | ing fewer trainable parameters, a higher training throughput, and, unlike adapters,
14 | no additional inference latency. We also provide an empirical investigation into
15 | rank-deficiency in language model adaptation, which sheds light on the efficacy of
16 | LoRA. We release a package that facilitates the integration of LoRA with PyTorch
17 | models and provide our implementations and model checkpoints for RoBERTa,
18 | DeBERTa, and GPT-2 at https://github.com/microsoft/LoRA
19 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/LOOKOUT PHASE.txt:
--------------------------------------------------------------------------------
1 | In this phase each player may decide to acquire up to 4 new cards
2 | and add them to their hand.
3 | To execute the Lookout phase follow these steps:
4 | 1. Each player should set aside any cards in their hand for the
5 | time being.
6 | 2. Each player draws 4 cards from the top of their Clan deck and
7 | decides which of these cards to keep.
8 | 3. For each card that they wish to keep, they have to Spend 1 of
9 | their Workers by putting it on their Clan tile.
10 | 4. Any cards they do not want to keep should be placed in the
11 | Clan’s discard pile.
12 | 5. Players retrieve the cards they had set aside.
13 |
14 | Example: 2 cards placed in the discard pile; 2 Workers spent to keep 2 cards.
21 |
22 | IMPORTANT: Some cards have a Storage effect that may provide
23 | players with extra Goods if enough of the required Resources are
24 | assigned to them. A player gains those Goods at the end of each
25 | Lookout phase, before starting the Action phase (more about this
26 | effect on page 12).
27 | NOTE 1: There’s no limit to the number of cards a player may have
28 | in their hand.
29 | NOTE 2: If there are no cards left in your Clan deck, shuffle the
30 | discard pile to create a new deck.
31 | > Cards in a player’s hand are kept secret from other players
32 |
33 | No production phase!
34 | A reminder, especially to all the fans of Imperial Settlers,
35 | there is no production phase; players will have to use
36 | Harvest action to gain Goods shown on the Field cards.
37 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/CARD AND TILE EFFECTS.txt:
--------------------------------------------------------------------------------
1 | Grey cards provide Skills that let you play other cards and tiles in your play area.
2 | You gain 1 Skill per symbol shown. Each symbol may only be used once per turn, on each of your turns.
3 | Ruse Strength Courage Knowledge Leadership
4 | When multiple Skills are separated by a , you may only use one of them per turn (you choose).
5 |
6 | Yellow cards immediately provide Coins that you will be able to spend to play other cards and tiles
7 | in your play area.
8 | Take, from the reserve, the number of Coins shown in the symbol.
9 |
10 | Blue cards immediately let you advance on the Quest of the Ring track.
11 | Move your character along the Quest of the Ring track, one space per Ring symbol.
12 |
13 | Green cards represent the Races of Middle-earth with whom you may ally:
14 | Elves Ents Hobbits Humans Dwarves Wizards
15 |
16 | Red cards immediately let you place Units in the regions of Middle-earth (see page 7).
17 | Choose one of the two regions shown by the banners and place all Units in the chosen region.
18 | Number of Units to place
19 | Choice of regions where you may place Units
20 |
21 | Purple cards (only available in chapter 3) immediately let you complete various maneuvers.
22 | Move 1 of your Units
23 | to an adjacent region.
24 | Your opponent loses
25 | 1 Coin.
26 | Remove 1 enemy Unit
27 | from any region
28 |
29 | Landmark tiles immediately let you place Fortresses in regions of Middle-earth (see page 7) and benefit
30 | from unique effects (see the Player Aid).
31 | Region where you may place a Fortress
32 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/CARD AND TILE COSTS.txt:
--------------------------------------------------------------------------------
1 | Coins
2 | In order to play them, certain cards have a Coin cost that you must pay to the reserve.
3 |
4 | Skills
5 | In order to play them, tiles and the majority of cards require you to have one
6 | or more Skills (see page 5) in your play area.
7 | If you do not have the required Skills, you may pay 1 Coin to the reserve
8 | per missing Skill symbol.
9 |
10 | Notes:
11 | • There is no limit to the number of Skills you may pay for, to the reserve, on your turn.
12 | • If a card does not require a Skill or a Coin, it has no cost, so you may play it for free.
13 |
14 | Landmark tiles have an additional Coin cost equal to the number
15 | of your Fortress pawns already on the central board.
16 | Therefore, the additional cost of your first tile is 0 Coins.
17 |
18 | Chaining
19 | Starting in chapter 2, certain cards may be played for free through their chaining symbol.
20 | If you have, in your play area, a card with a matching symbol in its top-right corner, you may play the card for free,
21 | without having the required Skills.
22 | Note: If you do not have the matching chaining symbol for a card, you may still play it normally by paying its Skill
23 | and/or Coin cost.
24 |
25 | Example:
26 | In chapter 1, you play
27 | this card for free 1 .
28 | In addition to its effect
29 | (see page 5), it has a chaining
30 | symbol 2 .
31 |
32 | In chapter 2, you may play this
33 | card for free since you have
34 | the matching chaining symbol 3
35 | on one of your played cards.
36 | Otherwise, you would need
37 | to have the required Skills
38 | (or pay 1 Coin per missing Skill
39 | symbol) 4 .
40 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/Risk Perception.txt:
--------------------------------------------------------------------------------
1 | In collaboration with Accenture, this year a team of
2 | Stanford researchers ran a global survey with respondents
3 | from more than 1,000 organizations to assess the global
4 | state of responsible AI. The organizations, with total
5 | revenues of at least $500 million each, were taken
6 | from 20 countries and 19 industries and responded in
7 | February–March 2024.3 The objective of the Global State
8 | of Responsible AI survey was to gain an understanding of
9 | the challenges of adopting responsible AI practices and to
10 | allow for a comparison of responsible AI activities across
11 | 10 dimensions and across surveyed industries and regions.
12 | Respondents were asked which risks were relevant to
13 | them, given their AI adoption strategy; i.e., depending
14 | on whether they develop, deploy, or use generative or
15 | nongenerative AI. They were presented with a list
16 | of 14 risks and could select all that apply to them,
17 | given their AI adoption strategies.4 The researchers
18 | found that privacy and data governance risks, e.g.,
19 | the use of data without the owner’s consent or data
20 | leaks, are the leading concerns across the globe.
21 | Notably, they observe that these concerns are
22 | significantly higher in Asia and Europe compared to
23 | North America. Fairness risks were only selected by
24 | 20% of North American respondents, significantly
25 | less than respondents in Asia (31%) and Europe
26 | (34%) (Figure 3.1.5). Respondents in Asia selected,
27 | on average, the highest number of relevant risks
28 | (4.99), while Latin American respondents selected,
29 | on average, the fewest (3.64).
30 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/EXPEDITION PHASE.txt:
--------------------------------------------------------------------------------
1 | One after another, starting from the first Ship token placed on the arrow,
2 | players choose a single Island to either Pillage or Conquer for each one of
3 | their Ships.
4 | A player with just a Ship on the Expedition board (no Raze token or Fish
5 | assigned) can choose any face-up Nearby Island, or draw from the top of the
6 | Nearby Island deck. To be able to take any Distant Island card, either face-
7 | up or from the top of the deck, the Ship performing that expedition must have
8 | had a fish assigned to it. Rations are needed for the long journey!
9 | A player can choose to Pillage a selected Island card without any additional
10 | cost to gain the Goods presented on the Pillage space. They then put the
11 | Pillaged card on the appropriate discard pile (for Nearby and Distant Islands).
12 | If a player has assigned a Ship with a Raze token, they can choose to Conquer
13 | an Island and add it to their Empire to gain access to its action and/or ability.
14 | Each Conquered Island also provides 1 VP at the end of the game, just like any
15 | other Location.
16 | Once a player has chosen an Island card to either Pillage or Conquer, they
17 | return the used Ship to their supply and discard any Raze token and/or Fish
18 | assigned to that Ship, even if they didn’t use them. Now, if there is another
19 | Ship in the queue, it is their turn to choose an Island card.
20 | NOTE 1: Some cards' Feature effects are triggered when a card is Conquered.
21 | This happens when you move the Island card to your Empire, not when you
22 | assign a Ship during a Sail action (see page 10).
23 | NOTE 2: If there are no cards left in the Island deck, shuffle the discarded
24 | cards to make up the new deck.
25 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/Europe.txt:
--------------------------------------------------------------------------------
1 | Data on diversity trends about European CS graduates
2 | comes from Informatics Europe. 3
3 | Informatics, CS, CE, and IT Bachelor’s
4 | Graduates
5 | In the majority of surveyed European nations, there
6 | is a persistent gender disparity among bachelor’s-
7 | level graduates in informatics, computer science,
8 | computer engineering, and information technology.
9 | Despite some narrowing since 2011, men continue to
10 | dominate. For example, France (14.8%), the United
11 | Kingdom (17.8%), and Germany (21.5%) show relatively
12 | low proportions of female graduates in these fields
13 | (Figure 8.1.15). Bulgaria stands out among the surveyed
14 | countries with the highest proportion of female
15 | graduates (35.2%).
16 | Informatics, CS, CE, and IT Master’s Graduates
17 | Similar gender disparities are observed among
18 | European informatics, CS, CE, and IT master’s
19 | graduates, with a significantly greater proportion of
20 | males than females in most surveyed countries. As of
21 | 2022, Estonia (42.0%), Romania (41.9%), and Bulgaria
22 | (40.4%) reported the greatest proportion of female
23 | master’s graduates (Figure 8.1.16). In contrast, Belgium
24 | (13.7%), Italy (14.1%), and Switzerland (15.8%) reported
25 | the smallest proportion of female master’s graduates.
26 | Informatics, CS, CE, and IT PhD Graduates
27 | In all surveyed European countries, informatics,
28 | CS, CE, and IT PhD graduates are predominantly
29 | male. However, in nations such as the United
30 | Kingdom, Germany, and Switzerland, the gender
31 | gap has narrowed over the last decade, with women
32 | constituting a growing share of PhD graduates (Figure
33 | 8.1.17).4 In contrast, countries like Finland and Spain
34 | have seen the gap slightly widen.
35 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/2.4 Recurrent Networks.txt:
--------------------------------------------------------------------------------
1 | As recurrent neural networks (RNNs) can be unrolled to
2 | feed-forward representation, RNNs can also be equivalently
3 | represented as decision trees. We study the following recurrent
4 | neural network. Note that we simply omit the bias terms as
5 | they can be represented by concatenating a 1 value to input
6 | vectors.
7 | h(t) = σ(W^T h(t−1) + U^T x(t))
8 | o(t) = V^T h(t)    (12)
9 | Similar to previous analysis, one can rewrite h(t) as follows.
10 | h(t) = a(t) (W^T h(t−1) + U^T x(t))    (13)
11 | Eq. 13 can be rewritten as follows.
12 | h(t) = a(t) ( ∏_{j=t−1}^{1} (W^T a(j)) ) W^T h(0) + a(t) ∑_{i=1}^{t} ( ∏_{j=t−1}^{i} (W^T a(j)) ) U^T x(i)    (14)
13 | Note that in Eq. 14, the product operator stands for matrix multiplication, its steps are −1 and we consider
14 | the output of the product operator to be 1 when i = t. One can rewrite Eq. 14 by introducing c_i ˆW_i as follows.
15 | h(t) = a(t) c_1 ˆW_1 W^T h(0) + a(t) ∑_{i=1}^{t} c_i ˆW_i U^T x(i)    (15)
16 | where c_i ˆW_i^T = ∏_{j=t−1}^{i} (W^T a(j))
17 | Combining Eq. 15 and Eq. 12, one can write o(t) as follows.
18 | o(t) = a(t) ˆV^T c_1 ˆW_1 W^T h(0) + a(t) ˆV^T ∑_{i=1}^{t} c_i ˆW_i U^T x(i)    (16)
19 | Eq. 16 can be further simplified to the following.
20 | o(t) = c_1 ˆZ_1^T W^T h(0) + ∑_{i=1}^{t} c_i ˆZ_i U^T x(i)    (17)
21 | In Eq. 17, c_i ˆZ_i^T = a(t) ˆV^T c_i ˆW_i. As one can observe from Eq. 17, the RNN output only depends on the
22 | categorization vector c_i, which enables the tree equivalence, similar to previous analysis.
23 | Note that for RNNs, a popular choice for σ in Eq. 12 is tanh. As mentioned in Section 2.3, in order to provide
24 | finite trees, one might consider using a piece-wise linear approximation of tanh.
25 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/CHAPTER 5: WHERE DO YOU WANT TO MAKE YOUR WORK AVAILABLE?.txt:
--------------------------------------------------------------------------------
1 | IN ADDITION TO DECIDING HOW “OPEN” YOU
2 | want to make your work, you will also need to decide
3 | where you will make your work openly accessible. This
4 | involves first deciding which open access model (or
5 | models) you will use to disseminate your work. Then,
6 | you need to decide what publication venue (or venues)
7 | within that model best suits your needs.
8 | Open access models are generally divided into two
9 | categories: “Gold Open Access” and “Green Open Access.”
10 | Gold Open Access describes the model by which an open
11 | access publisher makes your work openly accessible. If
12 | you opt to use the Gold Open Access model, you will then
13 | need to decide what open access publisher provides the
14 | best venue for your work. In contrast, Green Open Access
15 | (also called “self-archiving”) describes the model by
16 | which you as an author make your work openly
18 | accessible. If you opt to use the Green Open Access
19 | model, you will then need to select the best online
20 | venue for your work. Some options include your own
21 | website, your departmental website, or an open access
22 | repository.
23 | Gold and Green Open Access are not mutually
24 | exclusive. An author can publish a work with an open
25 | access publisher and upload the same work to an
26 | open access repository or personal website, and vice
27 | versa (depending on the terms of their publishing
28 | agreement).
29 | THIS CHAPTER:
30 | • Describes the key features of Gold Open Access
31 | • Presents factors to consider when selecting an
32 | open access publisher
33 | • Describes the key features of Green Open Access
34 | • Presents factors to consider when deciding where
35 | to self-archive a work.
36 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/LLM Tokenization Introduces Unfairness.txt:
--------------------------------------------------------------------------------
1 | Research from the University of Oxford highlights
2 | how inequality in AI originates at the tokenization
3 | stage. Tokenization, the process of breaking down
4 | text into smaller units for processing and analysis,
5 | exhibits significant variability across languages.
6 | The number of tokens used for the same sentence
7 | can vary up to 15 times between languages. For
8 | instance, Portuguese closely matches English in the
9 | efficiency of the GPT-4 tokenizer, yet it still requires
10 | approximately 50% more tokens to convey the
11 | same content. The Shan language is the furthest
12 | from English, needing 15 times more tokens. Figure
13 | 3.5.9 visualizes the concept of a context window
14 | while figure 3.5.10 illustrates the token consumption
15 | of the same sentence across different languages.
16 | The authors identify three major inequalities that
17 | result from variable tokenization. First, users of
18 | languages that require more tokens than English
19 | for the same content face up to four times higher
20 | inference costs and longer processing times, as
21 | both are dependent on the number of tokens.
22 | Figure 3.5.11 illustrates the variation in token
23 | length and execution time for the same sentence
24 | across different languages or language families.
25 | Second, these users may also experience increased
26 | processing times because models take longer
27 | to process a greater number of tokens. Lastly,
28 | given that models operate within a fixed context
29 | window—a limit on the amount of text or content
30 | that can be input—languages that require more
31 | tokens proportionally use up more of this window.
32 | This can reduce the available context for the model,
33 | potentially diminishing the quality of service for
34 | those users.
35 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3 Main results.txt:
--------------------------------------------------------------------------------
1 | Following previous work (Brown et al., 2020), we
2 | consider zero-shot and few-shot tasks, and report
3 | results on a total of 20 benchmarks:
4 | • Zero-shot. We provide a textual description
5 | of the task and a test example. The model
6 | either provides an answer using open-ended
7 | generation, or ranks the proposed answers.
8 | • Few-shot. We provide a few examples of the
9 | task (between 1 and 64) and a test example.
10 | The model takes this text as input and gener-
11 | ates the answer or ranks different options.
12 | We compare LLaMA with other foundation mod-
13 | els, namely the non-publicly available language
14 | models GPT-3 (Brown et al., 2020), Gopher (Rae
15 | et al., 2021), Chinchilla (Hoffmann et al., 2022)
16 | and PaLM (Chowdhery et al., 2022), as well as
17 | the open-sourced OPT models (Zhang et al., 2022),
18 | GPT-J (Wang and Komatsuzaki, 2021), and GPT-
19 | Neo (Black et al., 2022). In Section 4, we also
20 | briefly compare LLaMA with instruction-tuned
21 | models such as OPT-IML (Iyer et al., 2022) and
22 | Flan-PaLM (Chung et al., 2022).
23 | We evaluate LLaMA on free-form generation
24 | tasks and multiple choice tasks. In the multiple
25 | choice tasks, the objective is to select the most
26 | appropriate completion among a set of given op-
27 | tions, based on a provided context. We select the
28 | completion with the highest likelihood given the
29 | provided context. We follow Gao et al. (2021)
30 | and use the likelihood normalized by the number
31 | of characters in the completion, except for certain
32 | datasets (OpenBookQA, BoolQ), for which we fol-
33 | low Brown et al. (2020), and select a completion
34 | based on the likelihood normalized by the likeli-
35 | hood of the completion given “Answer:” as context:
36 | P (completion|context)/P (completion|“Answer:”).
37 |
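38 | An illustrative Python sketch of this selection rule (loglikelihood is a placeholder scoring function
39 | returning log P(completion | context); it is not part of the original setup):
40 |
41 | def pick_completion(context, completions, loglikelihood, normalize_by_answer=False):
42 |     def score(completion):
43 |         ll = loglikelihood(context, completion)
44 |         if normalize_by_answer:
45 |             # OpenBookQA/BoolQ variant: log P(completion|context) - log P(completion|"Answer:")
46 |             return ll - loglikelihood("Answer:", completion)
47 |         # default: log-likelihood normalized by the number of characters in the completion
48 |         return ll / len(completion)
49 |     return max(completions, key=score)
50 |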
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yaml:
--------------------------------------------------------------------------------
1 | name: "\U0001F680 Feature Request"
2 | description: Suggest an idea for this project
3 | title: "[FEATURE]: "
4 | projects:
5 | - "Project-name"
6 | labels:
7 | - enhancement
8 | body:
9 | - type: markdown
10 | attributes:
11 | value: |
12 | Thanks for taking the time to fill out this form!
13 | Please make sure you have searched for a similar [issue](https://github.com/mozilla-ai/project-name/issues) before submitting a new one.
14 |
15 | - type: textarea
16 | id: motivation
17 | attributes:
18 | label: Motivation
19 | description: |
20 | A clear and concise description of the proposal and why this is important.
21 | placeholder: |
22 | I'm always frustrated when...
23 | validations:
24 | required: true
25 |
26 | - type: textarea
27 | id: alternatives
28 | attributes:
29 | label: Alternatives
30 | description: |
31 | A clear and concise description of any alternative solutions or features you've considered.
32 | placeholder: |
33 | I've considered...
34 | validations:
35 | required: false
36 |
37 | - type: textarea
38 | id: contribution
39 | attributes:
40 | label: Contribution
41 | description: |
42 | Is there any way that you could help, e.g. by submitting a PR?
43 | Make sure to read the [contribution guidelines](https://github.com/mozilla-ai/project-name/blob/main/CONTRIBUTING.md).
44 | placeholder: |
45 | I could help by...
46 | validations:
47 | required: false
48 |
49 | - type: checkboxes
50 | id: search
51 | attributes:
52 | label: Have you searched for similar issues before submitting this one?
53 | options:
54 | - label: Yes, I have searched for similar issues
55 | required: true
56 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/WHY ARE OPEN ACCESS POLICIES ADOPTED?.txt:
--------------------------------------------------------------------------------
1 | Open access policies maximize the value of investment
2 | in research by ensuring that more readers can access
3 | research results and scholarship than if the works were
4 | available through restricted means alone.
5 | Universities, for example, further their educa-
6 | tional missions by implementing open access policies
7 | that make scholarly works more widely available. Some
8 | faculty members have banded together at their respec-
9 | tive institutions to express their collective commit-
10 | ment to open access, resulting in a growing number of
11 | university open access policies in recent years. Under
13 | such policies, faculty members typically grant to their
14 | universities the right to deposit faculty-authored works
15 | in institutional repositories.
16 | Under similar policies, government agencies
17 | require grant recipients to deposit their research
18 | findings in open access repositories where they are
19 | available for free public access. The National Institutes of
20 | Health (“NIH”) Public Access Policy is one such policy. 21
21 | The number of federal open access policies is growing,
22 | largely because the Obama administration issued a
23 | policy directive in 2013 designed to increase public
24 | access to the results of federally funded research.22
25 | Under the policy, many federal agencies are required to
26 | develop plans to make the published results of federally
27 | funded research freely available to the public.
28 | Foundations that sponsor research are also
29 | increasingly adopting open access policies. For
30 | example, the Bill and Melinda Gates Foundation imple-
31 | mented an open access policy in 2015 that requires
32 | research resulting from the Foundation’s funding to be
33 | made available under libre open access terms.
34 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/15.3. API Fundamentals.txt:
--------------------------------------------------------------------------------
1 | Graph memory nodes are graph nodes representing either memory allocation or free actions. As a
2 | shorthand, nodes that allocate memory are called allocation nodes. Likewise, nodes that free memory
3 | are called free nodes. Allocations created by allocation nodes are called graph allocations. CUDA as-
4 | signs virtual addresses for the graph allocation at node creation time. While these virtual addresses
5 | are fixed for the lifetime of the allocation node, the allocation contents are not persistent past the
6 | freeing operation and may be overwritten by accesses referring to a different allocation.
7 | Graph allocations are considered recreated every time a graph runs. A graph allocation’s lifetime, which
8 | differs from the node’s lifetime, begins when GPU execution reaches the allocating graph node and
9 | ends when one of the following occurs:
10 | ▶ GPU execution reaches the freeing graph node
11 | ▶ GPU execution reaches the freeing cudaFreeAsync() stream call
12 | ▶ immediately upon the freeing call to cudaFree()
13 | Note: Graph destruction does not automatically free any live graph-allocated memory, even though it
14 | ends the lifetime of the allocation node. The allocation must subsequently be freed in another graph,
15 | or using cudaFreeAsync()/cudaFree().
16 | Just like other Graph Structure, graph memory nodes are ordered within a graph by dependency edges.
17 | A program must guarantee that operations accessing graph memory:
18 | ▶ are ordered after the allocation node
19 | ▶ are ordered before the operation freeing the memory
20 | Graph allocation lifetimes begin and usually end according to GPU execution (as opposed to API invo-
21 | cation). GPU ordering is the order that work runs on the GPU as opposed to the order that the work
22 | is enqueued or described. Thus, graph allocations are considered ‘GPU ordered’.
23 |
--------------------------------------------------------------------------------
/docs/getting-started.md:
--------------------------------------------------------------------------------
1 | Get started with Structured-QA using one of the options below:
2 |
3 | ---
4 |
5 | ## Setup options
6 |
7 | === "☁️ Google Colab (GPU)"
8 |
9 | The easiest way to play with the code on a GPU, for free.
10 |
11 | Click the button below to launch the project directly in Google Colab:
12 |
13 |
14 |
15 | === "☁️ GitHub Codespaces"
16 |
17 | Click the button below to launch the project directly in GitHub Codespaces:
18 |
19 |
20 |
21 | Once the Codespaces environment launches, inside the terminal, start the Streamlit demo by running:
22 |
23 | ```bash
24 | python -m streamlit run demo/app.py
25 | ```
26 |
27 | === "💻 Local Installation"
28 |
29 | You can install the project from PyPI:
30 |
31 | ```bash
32 | pip install structured-qa
33 | ```
34 |
35 | Check the [Command Line Interface](./cli.md) guide.
36 |
37 | ---
38 |
39 | Alternatively, you can clone and install it in editable mode:
40 |
41 | 1. **Clone the Repository**
42 |
43 | ```bash
44 | git clone https://github.com/mozilla-ai/structured-qa.git
45 | cd structured-qa
46 | ```
47 |
48 | 2. **Install the project and its Dependencies**
49 |
50 | ```bash
51 | pip install -e .
52 | ```
53 |
54 | 3. **Run the Demo**
55 |
56 | ```bash
57 | python -m streamlit run demo/app.py
58 | ```
59 |
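60 | Either way, once installed you can run the full pipeline from the repository root by pointing the CLI at a config file, for example the bundled one (see the [Command Line Interface](./cli.md) guide and the [Customization Guide](./customization.md) for the available parameters):
61 |
62 | ```bash
63 | structured-qa --from_config example_data/config.yaml
64 | ```
65 |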
--------------------------------------------------------------------------------
/benchmark/perfect_context/23.1. What is Lazy Loading?.txt:
--------------------------------------------------------------------------------
1 | Lazy Loading delays loading of CUDA modules and kernels from program initialization closer to kernel
2 | execution. If a program does not use every single kernel it has included, then some kernels will be
3 | loaded unnecessarily. This is very common, especially if you include any libraries. Most of the time,
4 | programs only use a small amount of kernels from libraries they include.
5 | Thanks to Lazy Loading, programs are able to only load kernels they are actually going to use, saving
6 | time on initialization. This reduces memory overhead, both on GPU memory and host memory.
7 | Lazy Loading is enabled by setting the CUDA_MODULE_LOADING environment variable to LAZY.
8 | Firstly, CUDA Runtime will no longer load all modules during program initialization, with the exception
9 | of modules containing managed variables. Each module will be loaded on first usage of a variable or
10 | a kernel from that module. This optimization is only relevant to CUDA Runtime users, CUDA Driver
11 | users who use cuModuleLoad are unaffected. This optimization shipped in CUDA 11.8. The behavior
12 | for CUDA Driver users who use cuLibraryLoad to load module data into memory can be changed by
13 | setting the CUDA_MODULE_DATA_LOADING environment variable.
14 | Secondly, loading a module (cuModuleLoad*() family of functions) will not be loading kernels immedi-
15 | ately, instead it will delay loading of a kernel until cuModuleGetFunction() is called. There are certain
16 | exceptions here, some kernels have to be loaded during cuModuleLoad*(), such as kernels of which
17 | pointers are stored in global variables. This optimization is relevant to both CUDA Runtime and CUDA
18 | Driver users. CUDA Runtime will only call cuModuleGetFunction() when a kernel is used/referenced
19 | for the first time. This optimization shipped in CUDA 11.7.
20 | Both of these optimizations are designed to be invisible to the user, assuming CUDA Programming
21 | Model is followed.
22 |
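23 | For example, assuming an ordinary CUDA Runtime application named ./my_app, Lazy Loading can be enabled
24 | for a single run from the shell:
25 |     CUDA_MODULE_LOADING=LAZY ./my_app
26 |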
--------------------------------------------------------------------------------
/docs/future-features-contributions.md:
--------------------------------------------------------------------------------
1 | # 🚀 **Future Features & Contributions**
2 |
3 | This Blueprint is an evolving project designed to grow with the help of the open-source community. Whether you’re an experienced developer or just starting, there are many ways you can contribute and help shape the future of this tool.
4 |
5 | ---
6 |
7 | ## 🌟 **How You Can Contribute**
8 |
9 | ### 🛠️ **Enhance the Blueprint**
10 | - Check the [Issues](https://github.com/mozilla-ai/structured-qa/issues) page to see if there are feature requests you'd like to implement
11 | - Refer to our [Contribution Guide](https://github.com/mozilla-ai/structured-qa/blob/main/CONTRIBUTING.md) for more details on contributions
12 |
13 | ### 🎨 **Extensibility Ideas**
14 |
15 | This Blueprint is designed to be a foundation you can build upon. By extending its capabilities, you can open the door to new applications, improve user experience, and adapt the Blueprint to address other use cases. Here are a few ideas for how you can expand its potential:
16 |
17 |
18 | We’d love to see how you can enhance this Blueprint! If you create improvements or extend its capabilities, consider contributing them back to the project so others in the community can benefit from your work. Check out our [Contributions Guide](https://github.com/mozilla-ai/structured-qa/blob/main/CONTRIBUTING.md) to get started!
19 |
20 | ### 💡 **Share Your Ideas**
21 | Got an idea for how this Blueprint could be improved? You can share your suggestions through [GitHub Issues](https://github.com/mozilla-ai/structured-qa/issues).
22 |
23 | ### 🌍 **Build New Blueprints**
24 | This project is part of a larger initiative to create a collection of reusable starter code solutions that use open-source AI tools. If you’re inspired to create your own Blueprint, you can use the [Blueprint-template](https://github.com/new?template_name=Blueprint-template&template_owner=mozilla-ai) to get started.
25 |
26 | ---
27 |
28 | Your contributions help make this Blueprint better for everyone 🎉
29 |
--------------------------------------------------------------------------------
/docs/customization.md:
--------------------------------------------------------------------------------
1 | # 🎨 **Customization Guide**
2 |
3 | This Blueprint is designed to be flexible and easily adaptable to your specific needs. This guide will walk you through some key areas you can customize to make the Blueprint your own.
4 |
5 | ---
6 |
7 | ## 🖋️ **Customizable Parameters**
8 |
9 | - **`question`**: The question to be answered.
10 |
11 | - **`input_file`**: The input file specifies the document to be processed. Supports the `pdf` format.
12 |
13 | - **`output_dir`**: Path to the output directory where the extracted sections will be saved.
14 |
15 | - **`model`**: Any model that can be loaded by [`LLama.from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest/#pulling-models-from-hugging-face-hub) can be used here. Format is expected to be `{org}/{repo}/{filename}`. For example: `Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf`.
16 |
17 | - **`find_prompt`**: The prompt for finding the section. See [`FIND_PROMPT`][structured_qa.config.FIND_PROMPT].
18 |
19 | - **`answer_prompt`**: The prompt for answering the question. See [`ANSWER_PROMPT`][structured_qa.config.ANSWER_PROMPT].
20 |
21 | ## ⌨️ **Customizing When Running via the CLI**
22 |
23 | If you’re running the pipeline from the command line, you can customize the parameters by modifying the **`example_data/config.yaml`** file.
24 |
25 | Running in the CLI:
26 | ```bash
27 | structured-qa --from_config example_data/config.yaml
28 | ```
29 |
30 | ### Steps to Customize
31 | 1. Open the `config.yaml` file.
32 | 2. Locate the parameter you want to adjust.
33 | 3. Update the value and save the file.
34 |
35 | #### Example: Changing the Text-to-Text Model
36 | In `config.yaml`, modify the `model` entry:
37 |
38 | ```yaml
39 | model: "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
40 | ```
41 |
42 |
43 | ## 🤝 **Contributing to the Blueprint**
44 |
45 | Want to help improve or extend this Blueprint? Check out the **[Future Features & Contributions Guide](future-features-contributions.md)** to see how you can contribute your ideas, code, or feedback to make this Blueprint even better!
46 |
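47 | ---
48 |
49 | ## 📄 **Reference: An Example `config.yaml`**
50 |
51 | For reference, an illustrative `config.yaml` combining the parameters above might look like this (the question and paths are placeholders; `output_dir` must point to an existing directory):
52 |
53 | ```yaml
54 | question: "Which optimizer was used to train the model?"
55 | input_file: "docs/my_document.pdf"
56 | output_dir: "output"
57 | model: "Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
58 | ```
59 |
60 | The `find_prompt` and `answer_prompt` entries can be overridden in the same file; if you do, keep the `{SECTIONS}` and `{CURRENT_INFO}` placeholders that the workflow fills in.
61 |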
--------------------------------------------------------------------------------
/benchmark/perfect_context/CHAPTER OVERVIEW.txt:
--------------------------------------------------------------------------------
1 | Preparing a chapter
2 | At the start of each chapter (1, 2, then 3), place cards from the corresponding chapter in the central play area,
3 | following the diagram below (reminder on the sides of the box). Be careful, since certain cards are placed faceup
4 | and others facedown. Place the 3 remaining cards, facedown, in the discard.
5 |
6 | Turn overview
7 | The Sauron player begins the game, then both players take turns, until the end of the game.
8 | On your turn, you may either take a Chapter card or take a Landmark tile.
9 |
10 | A. Take a Chapter card
11 | From the central play area, choose an available card,
12 | meaning one that is not partially covered by any other cards.
13 | Then, play it in front of you or discard it.
14 |
15 | Play the card in front of you
16 | Pay the card cost, if any (see page 4), and place it
17 | in your play area. You may immediately benefit
18 | from its effect (see page 5).
19 | Note: Stack cards in front of you by color, making sure
20 | you can still see their effects.
21 | Discard the card
22 | Place the card, facedown, in the discard
23 | and take as many Coins from the reserve
24 | as the current chapter:
25 | Chapter 1: 1 Coin
26 | Chapter 2: 2 Coins
27 | Chapter 3: 3 Coins
28 |
29 | Finally, end your turn by revealing any cards that are now available.
30 |
31 | B. Take a Landmark tile
32 | Choose one of the faceup tiles. Pay its cost (see page 4) and place it in your play area.
33 | Immediately place a Fortress pawn on the corresponding region of the central board and benefit
34 | from its other effects (see the Player Aid).
35 | Finally, end your turn without revealing a new tile.
36 |
37 | End of a chapter
38 | A chapter ends once the final card of this chapter has been taken.
39 | Set up the cards for the next chapter according to the corresponding diagram and reveal new Landmark tiles
40 | until there are, if possible, 3 faceup.
41 | Then continue the game as normal.
42 | Note: Since you alternate taking turns, the player who ends a chapter does not begin the next one, unless they are using
43 | a “Take another turn” effect (see the Player Aid).
44 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3.4 Robustness of Chain of Thought.txt:
--------------------------------------------------------------------------------
1 | Sensitivity to exemplars is a key consideration of prompt-
2 | ing approaches—for instance, varying the permutation of
3 | few-shot exemplars can cause the accuracy of GPT-3 on
4 | SST-2 to range from near chance (54.3%) to near state of
5 | the art (93.4%) (Zhao et al., 2021). In this final subsec-
6 | tion, we evaluate robustness to chains of thought written
7 | by different annotators. In addition to the results above,
8 | which used chains of thought written by an Annotator
9 | A, two other co-authors of this paper (Annotators B and
10 | C) independently wrote chains of thought for the same
11 | few-shot exemplars (shown in Appendix H). Annotator A
12 | also wrote another chain of thought that was more concise
13 | than the original, following the style of solutions given in
14 | Cobbe et al. (2021).1
15 | Figure 6 shows these results for LaMDA 137B on GSM8K
16 | and MAWPS (ablation results for other datasets are given
17 | in Appendix Table 6 / Table 7). Although there is variance
18 | among different chain of thought annotations, as would be
19 | expected when using exemplar-based prompting (Le Scao
20 | and Rush, 2021; Reynolds and McDonell, 2021; Zhao
21 | et al., 2021), all sets of chain of thought prompts outper-
22 | form the standard baseline by a large margin. This result
23 | implies that successful use of chain of thought does not
24 | depend on a particular linguistic style.
25 | To confirm that successful chain-of-thought prompting
26 | works for other sets of exemplars, we also run experiments
27 | with three sets of eight exemplars randomly sampled from the GSM8K training set, an independent
28 | source (examples in this dataset already included reasoning steps like a chain of thought).2 Fig-
29 | ure 6 shows that these prompts performed comparably with our manually written exemplars, also
30 | substantially outperforming standard prompting.
31 | In addition to robustness to annotators, independently-written chains of thought, different exemplars,
32 | and various language models, we also find that chain-of-thought prompting for arithmetic reasoning
33 | is robust to different exemplar orders and varying numbers of exemplars (see Appendix A.2).
34 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # **Structured-QA Blueprint**
2 |
3 |
4 |
5 |
6 |
7 | Blueprints empower developers to easily integrate AI capabilities into their projects using open-source models and tools.
8 |
9 | These docs are your companion to mastering the **Structured-QA Blueprint**, a local-first approach for answering questions about your structured documents.
10 |
11 | ### Built with
12 | - Python 3.10+
13 | - [pymupdf4llm](https://pypi.org/project/pymupdf4llm/) (document preprocessing)
14 | - [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) (question answering)
15 |
16 | ---
17 |
18 | ### 🚀 **Get Started Quickly**
19 | #### _Start building your own Structured-QA pipeline in minutes:_
20 | - **[Getting Started](getting-started.md):** Quick setup and installation instructions.
21 |
22 | ### 🔍 **Understand the System**
23 | #### _Dive deeper into how the Blueprint works:_
24 | - **[Step-by-Step Guide](step-by-step-guide.md):** A detailed breakdown of the system’s design and workflow.
25 | - **[API Reference](api.md):** Explore the technical details of the core modules.
26 |
27 | ### 🎨 **Make It Yours**
28 | #### _Customize the Blueprint to fit your needs:_
29 | - **[Customization Guide](customization.md):** Tailor prompts and settings to adapt your use case.
30 |
31 | ### 🌟 **Join the Community**
32 | #### _Help shape the future of Blueprints:_
33 | - **[Future Features & Contributions](future-features-contributions.md):** Learn about exciting upcoming features and how to contribute to the project.
34 |
35 |
36 | Have more questions? Reach out to us on Discord and we'll see how we can help:
37 |
38 |
39 |
40 |
41 |
42 | ---
43 |
44 | ## **Why Blueprints?**
45 |
46 | Blueprints are more than starter code—they’re your gateway to building AI-powered solutions with confidence. With step-by-step guidance, modular design, and open-source tools, we make AI accessible for developers of all skill levels.
47 |
--------------------------------------------------------------------------------
/src/structured_qa/config.py:
--------------------------------------------------------------------------------
1 | from typing_extensions import Annotated
2 |
3 | from pydantic import BaseModel, DirectoryPath, FilePath
4 | from pydantic.functional_validators import AfterValidator
5 |
6 |
7 | FIND_PROMPT = """
8 | You are given two pieces of information:
9 | 1. A list of valid section names.
10 | 2. A user question.
11 |
12 | Your task is to:
13 | - Identify exactly one `section_name` from the provided list that seems related to the user question.
14 | - Return the `section_name` exactly as it appears in the list.
15 | - Do NOT answer the question.
16 | - Do NOT return any additional text, explanation, or formatting.
17 | - Do NOT combine multiple section names into a single response.
18 |
19 | Here is the list of valid section names:
20 |
21 | ```
22 | {SECTIONS}
23 | ```
24 |
25 | Now, based on the following question, return the single most relevant `section_name` from the list.
26 | """
27 |
28 | ANSWER_PROMPT = """
29 | You are a rigorous assistant answering questions.
30 | You must only answer based on the current information available which is:
31 |
32 | ```
33 | {CURRENT_INFO}
34 | ```
35 |
36 | If the current information available is not enough to answer the question,
37 | you must return "I need more info" and nothing else.
38 | """
39 |
40 |
41 | def validate_model(value):
42 | parts = value.split("/")
43 | if len(parts) != 3:
44 | raise ValueError("model must be formatted as `owner/repo/file`")
45 | if not value.endswith(".gguf"):
46 | raise ValueError("model must be a gguf file")
47 | return value
48 |
49 |
50 | def validate_find_prompt(value):
51 | if "{SECTIONS}" not in value:
52 | raise ValueError("find_prompt must contain `{SECTIONS}` placeholder")
53 | return value
54 |
55 |
56 | def answer_prompt(value):
57 | if "{CURRENT_INFO}" not in value:
58 | raise ValueError("answer_prompt must contain `{CURRENT_INFO}` placeholder")
59 | return value
60 |
61 |
62 | class Config(BaseModel):
63 | question: str
64 | input_file: FilePath
65 | output_dir: DirectoryPath
66 | model: Annotated[str, AfterValidator(validate_model)]
67 | answer_prompt: Annotated[str, AfterValidator(answer_prompt)]
68 | find_prompt: Annotated[str, AfterValidator(validate_find_prompt)]
69 |
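70 |
71 | # Example (illustrative sketch): constructing a validated Config in Python.
72 | # The paths below are placeholders and must exist on disk, because FilePath and
73 | # DirectoryPath are validated by pydantic.
74 | #
75 | # config = Config(
76 | #     question="Which optimizer was used?",
77 | #     input_file="docs/my_document.pdf",
78 | #     output_dir="output",
79 | #     model="Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf",
80 | #     find_prompt=FIND_PROMPT,
81 | #     answer_prompt=ANSWER_PROMPT,
82 | # )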
--------------------------------------------------------------------------------
/benchmark/perfect_context/Procurement in an emerging market.txt:
--------------------------------------------------------------------------------
1 | Commercial agreements
2 | AI is an emerging market. As well as rapidly evolving technology, there are ongoing changes
3 | in the supply base and the products and services it offers. DPSs offer flexibility for new
4 | suppliers to join, which often complement these dynamics well for buyers.
5 | Any public sector buyers interested in shaping CCS’s longer term commercial agreement
6 | portfolio should express their interest via info@crowncommercial.gov.uk
7 | Regulation and policy
8 | Regulation and policy will also evolve to keep pace. However, there are already a number of
9 | legal and regulatory provisions which are relevant to the use of AI technologies.
10 | • UK data protection law: regulation around automated decision making, processing
11 | personal data, processing for the purpose of developing and training AI technologies. In
12 | November 2022, a new Procurement Policy Note was published to provide an update to
13 | this: PPN 03/22 Updated guidance on data protection legislation.
14 | • Online Safety Act: provisions concerning design and use of algorithms are to be
15 | included in a new set of laws to protect children and adults online. It will make social
16 | media companies more responsible for their users’ safety on their platforms.
17 | • A pro-innovation approach to AI regulation: this white paper published in March 2023,
18 | sets out early steps towards establishing a regulatory regime for AI. The white paper
19 | outlines a proportionate pro-innovation framework, including five principles to guide
20 | responsible AI innovation in all sectors.
21 | • Centre for Data Ethics and Innovation (CDEI) AI assurance techniques: the portfolio
22 | of AI assurance techniques has been developed by the CDEI, initially in collaboration with
23 | techUK. The portfolio is useful for anybody involved in designing, developing, deploying
24 | or procuring AI-enabled systems. It shows examples of AI assurance techniques being
25 | used in the real-world to support the development of trustworthy AI.
26 | Further guidance is also available from the Information Commissioner’s Office, Equality
27 | and Human Rights Commission, Medicines and Healthcare products Regulation
28 | Authority and the Health and Safety Executive.
29 |
--------------------------------------------------------------------------------
/tests/unit/test_workflow.py:
--------------------------------------------------------------------------------
1 | from structured_qa.config import ANSWER_PROMPT, FIND_PROMPT
2 | from structured_qa.workflow import find_retrieve_answer
3 |
4 |
5 | def test_find_retrieve_answer_multi_sections(tmp_path, mocker):
6 | model = mocker.MagicMock()
7 |
8 | def side_effect(messages):
9 | if FIND_PROMPT[:10] in messages[0]["content"]:
10 | if "section_1" in messages[0]["content"]:
11 | return "section_1"
12 | else:
13 | return "section_2"
14 | elif "Section 1" in messages[0]["content"]:
15 | return "I need more info."
16 | elif "Section 2" in messages[0]["content"]:
17 | return "Answer in Section 2"
18 |
19 | model.get_response.side_effect = side_effect
20 |
21 | sections_dir = tmp_path / "sections"
22 | sections_dir.mkdir()
23 | (sections_dir / "section_1.txt").write_text("Section 1")
24 | (sections_dir / "section_2.txt").write_text("Section 2")
25 |
26 | question = "What is the answer?"
27 | answer, sections_checked = find_retrieve_answer(
28 | model=model,
29 | sections_dir=sections_dir,
30 | question=question,
31 | find_prompt=FIND_PROMPT,
32 | answer_prompt=ANSWER_PROMPT,
33 | )
34 |
35 | assert answer == "Answer in Section 2"
36 | assert sections_checked == ["section_1", "section_2"]
37 |
38 |
39 | def test_find_retrieve_answer_misspelled_section(tmp_path, mocker):
40 | model = mocker.MagicMock()
41 |
42 | def side_effect(messages):
43 | if FIND_PROMPT[:10] in messages[0]["content"]:
44 | return "SecTION 1"
45 | else:
46 | return "I need more info."
47 |
48 | model.get_response.side_effect = side_effect
49 |
50 | sections_dir = tmp_path / "sections"
51 | sections_dir.mkdir()
52 | (sections_dir / "section_1.txt").write_text("Section 1")
53 |
54 | question = "What is the answer?"
55 | answer, sections_checked = find_retrieve_answer(
56 | model=model,
57 | sections_dir=sections_dir,
58 | question=question,
59 | find_prompt=FIND_PROMPT,
60 | answer_prompt=ANSWER_PROMPT,
61 | )
62 |
63 | assert answer == "NOT FOUND"
64 | assert sections_checked == ["section_1"]
65 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yaml:
--------------------------------------------------------------------------------
1 | name: "\U0001F41B Bug Report"
2 | description: Submit a bug report to help us improve
3 | title: "[BUG]: "
4 | projects:
5 | - "project-name"
6 | labels:
7 | - bug
8 | body:
9 | - type: markdown
10 | attributes:
11 | value: |
12 | Thanks for taking the time to fill out this bug report!
13 | Please make sure you have searched for a similar [issue](https://github.com/mozilla-ai/project-name/issues) before submitting a new one.
14 |
15 | - type: textarea
16 | id: description
17 | attributes:
18 | label: Description
19 | description: A clear and concise description of what the bug is.
20 | placeholder: |
21 | When I try to...
22 | validations:
23 | required: true
24 |
25 | - type: textarea
26 | id: reproduction
27 | attributes:
28 | label: Reproduction
29 | description: |
30 | Provide a numbered list of steps that one can follow to reproduce this behavior.
31 | placeholder: |
32 | Steps to reproduce the behavior:
33 | 1.
34 | 2.
35 | 3.
36 | validations:
37 | required: true
38 |
39 | - type: textarea
40 | id: logs
41 | attributes:
42 | label: Relevant log output
43 | description: |
44 | Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
45 | render: shell
46 | validations:
47 | required: false
48 |
49 | - type: textarea
50 | id: expected-behavior
51 | attributes:
52 | label: Expected behavior
53 | description: A clear description of what you would expect to happen.
54 | placeholder: "I would expect to..."
55 | validations:
56 | required: true
57 |
58 | - type: textarea
59 | id: system-info
60 | attributes:
61 | label: System Info
62 | description: Please share your system info with us.
63 | placeholder: |
64 | - OS: [e.g., macOS Sonoma]
65 | - Project-name version: [e.g. commit SHA]
66 | validations:
67 | required: true
68 |
69 | - type: checkboxes
70 | id: search
71 | attributes:
72 | label: Have you searched for similar issues before submitting this one?
73 | options:
74 | - label: Yes, I have searched for similar issues
75 | required: true
76 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/Training Cost.txt:
--------------------------------------------------------------------------------
1 | A prominent topic in discussions about foundation
2 | models is their speculated costs. While AI
3 | companies seldom reveal the expenses involved
4 | in training their models, it is widely believed that
5 | these costs run into millions of dollars and are
6 | rising. For instance, OpenAI’s CEO, Sam Altman,
7 | mentioned that the training cost for GPT-4 was over
8 | $100 million. This escalation in training expenses
9 | has effectively excluded universities, traditionally
10 | centers of AI research, from developing their own
11 | leading-edge foundation models. In response, policy
12 | initiatives, such as President Biden’s Executive Order
13 | on AI, have sought to level the playing field between
14 | industry and academia by creating a National AI
15 | Research Resource, which would grant nonindustry
16 | actors the compute and data needed to do higher
17 | level AI-research.
18 | Understanding the cost of training AI models is
19 | important, yet detailed information on these costs
20 | remains scarce. The AI Index was among the first to
21 | offer estimates on the training costs of foundation
22 | models in last year’s publication. This year, the AI
23 | Index has collaborated with Epoch AI, an AI research
24 | institute, to substantially enhance and solidify the
25 | robustness of its AI training cost estimates.9 To
26 | estimate the cost of cutting-edge models, the Epoch
27 | team analyzed training duration, as well as the type,
28 | quantity, and utilization rate of the training hardware,
29 | using information from publications, press releases, or
30 | technical reports related to the models.10
31 | Figure 1.3.21 visualizes the estimated training cost
32 | associated with select AI models, based on cloud
33 | compute rental prices. AI Index estimates validate
34 | suspicions that in recent years model training costs
35 | have significantly increased. For example, in 2017,
36 | the original Transformer model, which introduced the
37 | architecture that underpins virtually every modern
38 | LLM, cost around $900 to train.11 RoBERTa Large,
39 | released in 2019, which achieved state-of-the-art
40 | results on many canonical comprehension benchmarks
41 | like SQuAD and GLUE, cost around $160,000 to train.
42 | Fast-forward to 2023, and training costs for OpenAI’s
43 | GPT-4 and Google’s Gemini Ultra are estimated to be
44 | around $78 million and $191 million, respectively.
45 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to mozilla.ai Blueprints
2 |
3 | Thank you for your interest in contributing to this repository! This project is part of the Blueprints initiative, which empowers developers to integrate AI capabilities into their projects using open-source tools and models.
4 |
5 | We welcome all kinds of contributions, from improving customization, to extending capabilities, to fixing bugs. Whether you’re an experienced developer or just starting out, your support is highly appreciated.
6 |
7 | ---
8 |
9 | ## **How to Contribute**
10 |
11 | ### **Customize for your use-case or Extend It** 🔧
12 | - Fork this repo and customize it for your own use-case or even extend its capabilities.
13 | - We'd love to see what you've built! Provided your fork is public, we may reach out to you to feature your work on the [Blueprints Hub](https://developer-hub.mozilla.ai/).
14 |
15 | ### **Browse Existing Issues** 🔍
16 | - Check the Issues page to see if there are any tasks you'd like to tackle.
17 | - Look for issues labeled **`good first issue`** if you're new to the project—they're a great place to start.
18 |
19 | ### **Report Issues** 🐛
20 | - Found a bug? Open a Bug Report by clicking on 'New Issue'
21 | - Provide as much detail as possible, including the steps to reproduce the issue and expected vs. actual behavior
22 |
23 | ### **Suggest Features** 🚀
24 | - Have an idea for improving the Blueprint? Open a Feature Request by clicking on 'New Issue'
25 | - Share why the feature is important and any alternative solutions you’ve considered.
26 |
27 | ### **Submit Pull Requests** 💻
28 | - Fork the repository and create a new branch for your changes.
29 | - Install [pre-commit](https://pre-commit.com/) to ensure the code is formatted and standardized correctly, by running `pip install pre-commit` and then `pre-commit install`.
30 | - Ensure your branch is up-to-date with the main branch before submitting the PR
31 | - Please follow the PR template, adding as much detail as possible, including how to test the changes
32 |
33 | ---
34 |
35 | ### **Guidelines for Contributions**
36 |
37 | **Coding Standards**
38 | - Follow PEP 8 for Python formatting.
39 | - Use clear variable and function names and add comments to improve readability.
40 |
41 | **Testing**
42 | - Test changes locally and in GitHub Codespaces to ensure functionality.
43 |
44 | **Documentation**
45 | - Update docs for changes to functionality and maintain consistency with existing docs.
46 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/6.1.1. Compilation Workflow.txt:
--------------------------------------------------------------------------------
1 | 6.1.1.1 Offline Compilation
2 | Source files compiled with nvcc can include a mix of host code (i.e., code that executes on the host)
3 | and device code (i.e., code that executes on the device). nvcc’s basic workflow consists in separating
4 | device code from host code and then:
5 | ▶ compiling the device code into an assembly form (PTX code) and/or binary form (cubin object),
6 | ▶ and modifying the host code by replacing the <<<...>>> syntax introduced in Kernels (and de-
7 | scribed in more details in Execution Configuration) by the necessary CUDA runtime function calls
8 | to load and launch each compiled kernel from the PTX code and/or cubin object.
9 | The modified host code is output either as C++ code that is left to be compiled using another tool or
10 | as object code directly by letting nvcc invoke the host compiler during the last compilation stage.
11 | Applications can then:
12 | ▶ Either link to the compiled host code (this is the most common case),
13 | ▶ Or ignore the modified host code (if any) and use the CUDA driver API (see Driver API) to load and
14 | execute the PTX code or cubin object.
15 | 6.1.1.2 Just-in-Time Compilation
16 | Any PTX code loaded by an application at runtime is compiled further to binary code by the device
17 | driver. This is called just-in-time compilation. Just-in-time compilation increases application load time,
18 | but allows the application to benefit from any new compiler improvements coming with each new
19 | device driver. It is also the only way for applications to run on devices that did not exist at the time the
20 | application was compiled, as detailed in Application Compatibility.
21 | When the device driver just-in-time compiles some PTX code for some application, it automatically
22 | caches a copy of the generated binary code in order to avoid repeating the compilation in subsequent
23 | invocations of the application. The cache - referred to as compute cache - is automatically invalidated
24 | when the device driver is upgraded, so that applications can benefit from the improvements in the
25 | new just-in-time compiler built into the device driver.
26 | Environment variables are available to control just-in-time compilation as described in CUDA Environ-
27 | ment Variables
28 | As an alternative to using nvcc to compile CUDA C++ device code, NVRTC can be used to compile
29 | CUDA C++ device code to PTX at runtime. NVRTC is a runtime compilation library for CUDA C++; more
30 | information can be found in the NVRTC User guide.
31 |
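32 | Illustrative invocations of the offline paths described above (file names are placeholders; the flags are
33 | standard nvcc options):
34 |     nvcc kernel.cu -o app                   # separate, compile, and link device + host code into an executable
35 |     nvcc -ptx kernel.cu -o kernel.ptx       # emit PTX only, e.g. to load and launch via the driver API
36 |     nvcc -cubin kernel.cu -o kernel.cubin   # emit a cubin object for the targeted architecture
37 |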
--------------------------------------------------------------------------------
/benchmark/perfect_context/3.2 Results.txt:
--------------------------------------------------------------------------------
1 | The strongest results of chain-of-thought prompting are summarized in Figure 4, with all experimental
2 | outputs for each model collection, model size, and benchmark shown in Table 2 in the Appendix.
3 | There are three key takeaways. First, Figure 4 shows that chain-of-thought prompting is an emergent
4 | ability of model scale (Wei et al., 2022b). That is, chain-of-thought prompting does not positively
5 | impact performance for small models, and only yields performance gains when used with models of
6 | ∼100B parameters. We qualitatively found that models of smaller scale produced fluent but illogical
7 | chains of thought, leading to lower performance than standard prompting.
8 | Second, chain-of-thought prompting has larger
9 | performance gains for more-complicated prob-
10 | lems. For instance, for GSM8K (the dataset
11 | with the lowest baseline performance), perfor-
12 | mance more than doubled for the largest GPT
13 | and PaLM models. On the other hand, for Sin-
14 | gleOp, the easiest subset of MAWPS which only
15 | requires a single step to solve, performance im-
16 | provements were either negative or very small
17 | (see Appendix Table 3).
18 | Third, chain-of-thought prompting via GPT-3
19 | 175B and PaLM 540B compares favorably to
20 | prior state of the art, which typically finetunes a
21 | task-specific model on a labeled training dataset.
22 | Figure 4 shows how PaLM 540B uses chain-of-
23 | thought prompting to achieve new state of the art
24 | on GSM8K, SVAMP, and MAWPS (though note
25 | that standard prompting already passed the prior
26 | best for SVAMP). On the other two datasets,
27 | AQuA and ASDiv, PaLM with chain-of-thought
28 | prompting reaches within 2% of the state of the
29 | art (Appendix Table 2).
30 | To better understand why chain-of-thought
31 | prompting works, we manually examined model-
32 | generated chains of thought by LaMDA 137B
33 | for GSM8K. Of 50 random examples where the
34 | model returned the correct final answer, all of
35 | the generated chains of thought were also log-
36 | ically and mathematically correct except two
37 | that coincidentally arrived at the correct answer
38 | (see Appendix D.1, and Table 8 for examples
39 | of correct model-generated chains of thought).
40 | We also randomly examined 50 random sam-
41 | ples for which the model gave the wrong answer.
42 | The summary of this analysis is that 46% of the
43 | chains of thought were almost correct, barring
44 | minor mistakes (calculator error, symbol map-
45 | ping error, or one reasoning step missing), and that the other 54% of the chains of thought had major
46 | errors in semantic understanding or coherence (see Appendix D.2). To provide a small insight into
47 | why scaling improves chain-of-thought reasoning ability, we performed a similar analysis of errors
48 | made by PaLM 62B and whether those errors were fixed by scaling to PaLM 540B. The summary
49 | is that scaling PaLM to 540B fixes a large portion of one-step missing and semantic understanding
50 | errors in the 62B model (see Appendix A.1).
51 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/3.1 Experimental Setup.txt:
--------------------------------------------------------------------------------
1 | We explore chain-of-thought prompting for various language models on multiple benchmarks.
2 | Benchmarks. We consider the following five math word problem benchmarks: (1) the GSM8K
3 | benchmark of math word problems (Cobbe et al., 2021), (2) the SVAMP dataset of math word
4 | problems with varying structures (Patel et al., 2021), (3) the ASDiv dataset of diverse math word
5 | problems (Miao et al., 2020), (4) the AQuA dataset of algebraic word problems, and (5) the MAWPS
6 | benchmark (Koncel-Kedziorski et al., 2016). Example problems are given in Appendix Table 12.
7 | Standard prompting. For the baseline, we consider standard few-shot prompting, popularized by
8 | Brown et al. (2020), in which a language model is given in-context exemplars of input–output pairs
9 | before outputting a prediction for a test-time example. Exemplars are formatted as questions and
10 | answers. The model gives the answer directly, as shown in Figure 1 (left).
11 | Chain-of-thought prompting. Our proposed approach is to augment each exemplar in few-shot
12 | prompting with a chain of thought for an associated answer, as illustrated in Figure 1 (right). As most
13 | of the datasets only have an evaluation split, we manually composed a set of eight few-shot exemplars
14 | with chains of thought for prompting—Figure 1 (right) shows one chain of thought exemplar, and the
15 | full set of exemplars is given in Appendix Table 20. (These particular exemplars did not undergo
16 | prompt engineering; robustness is studied in Section 3.4 and Appendix A.2.) To investigate whether
17 | chain-of-thought prompting in this form can successfully elicit successful reasoning across a range of
18 | math word problems, we used this single set of eight chain of thought exemplars for all benchmarks
19 | except AQuA, which is multiple choice instead of free response. For AQuA, we used four exemplars
20 | and solutions from the training set, as given in Appendix Table 21.
21 | Language models. We evaluate five large language models. The first is GPT-3 (Brown et al.,
22 | 2020), for which we use text-ada-001, text-babbage-001, text-curie-001, and text-davinci-002, which
23 | presumably correspond to InstructGPT models of 350M, 1.3B, 6.7B, and 175B parameters (Ouyang
24 | et al., 2022). The second is LaMDA (Thoppilan et al., 2022), which has models of 422M, 2B, 8B,
25 | 68B, and 137B parameters. The third is PaLM, which has models of 8B, 62B, and 540B parameters.
26 | The fourth is UL2 20B (Tay et al., 2022), and the fifth is Codex (Chen et al., 2021, code-davinci-002
27 | in the OpenAI API). We sample from the models via greedy decoding (though follow-up work shows
28 | chain-of-thought prompting can be improved by taking the majority final answer over many sampled
29 | generations (Wang et al., 2022a)). For LaMDA, we report averaged results over five random seeds,
30 | where each seed had a different randomly shuffled order of exemplars. As LaMDA experiments
31 | did not show large variance among different seeds, to save compute we report results for a single
32 | exemplar order for all other models.
33 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/Limitations of generative AI and LLMs.txt:
--------------------------------------------------------------------------------
1 | LLMs predict the next word in a sequence. They don’t understand the content or meaning
2 | of the words beyond how likely they are to be used in response to a particular question.
3 | This means that even though LLMs can produce plausible responses to requests, there are
4 | limitations on what they can reliably do.
5 | You need to be aware of these limitations and have checks and assurance in place when
6 | using generative AI in your organisation.
7 | • Hallucination (also called confabulation): LLMs are primarily designed to prioritise the
8 | appearance of being plausible rather than focusing on ensuring absolute accuracy,
9 | frequently resulting in the creation of content that appears plausible but may actually be
10 | factually incorrect.
11 | • Critical thinking and judgement: although LLMs can give the appearance of reasoning,
12 | they are simply predicting the next most plausible word in their output, and may produce
13 | inaccurate or poorly-reasoned conclusions.
14 | • Sensitive or ethical context: LLMs can generate offensive, biased, or inappropriate
15 | content if not properly guided, as they will replicate any bias present in the data they
16 | were trained on.
17 | • Domain expertise: unless specifically trained on specialist data, LLMs are not true
18 | domain experts. On their own, they are not a substitute for professional advice,
19 | especially in legal, medical, or other critical areas where precise and contextually relevant
20 | information is essential.
21 | • Personal experience and context: LLMs lack personal experiences and emotions.
22 | Although their outputs may appear as if they come from a person, they do not have true
23 | understanding or a consciousness.
24 | • Dynamic real-time information retrieval: LLMs do not always have real-time access to
25 | the internet or data outside their training set. However, this feature of LLM products is
26 | changing. As of October 2023, ChatGPT, Bard and Bing have been modified to include
27 | access to real-time internet data in their results.
28 | • Short-term memory: LLMs have a limited context window. They might lose track of the
29 | context of a conversation if it’s too long, leading to incoherent responses.
30 | • Explainability: generative AI is based on neural networks, which are so-called ‘black
31 | boxes’. This makes it difficult or impossible to explain the inner workings of the model
32 | which has potential implications if in the future you are challenged to justify decisioning
33 | or guidance based on the model.
34 | These limitations mean that there are types of use cases where you should currently avoid
35 | using generative AI, such as safety-of-life systems or those involving fully automated
36 | decision-making which affects individuals.
37 | However, the capabilities and limitations of generative AI solutions are rapidly changing,
38 | and solution providers are continuously striving to overcome these limitations. This means
39 | that you should make sure that you understand the features of the products and services
40 | you are using and how they are expected to change.
41 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/LOCATION ABILITIES.txt:
--------------------------------------------------------------------------------
1 | There are four different abilities that players will encounter on the Locations in
2 | the game:
3 | FIELDS
4 | These cards don’t have any keywords, only icons shown in the middle of the ability
5 | description box representing the Goods that may be Harvested from this Location.
6 | Fields give the player the depicted Goods immediately when the Field is built, and
7 | each time a Harvest action is declared on that Field.
8 | IMPORTANT: Some Field Locations work only as upgrades. These Fields have
9 | the Resources on the right side of the ability description box. Once built these
10 | Locations have to be attached to an existing Field in your Empire of the same
11 | Goods type. The card is slid underneath the existing Field with just the additional
12 | Good icons on the right visible. When the Field upgrade is built, the player
13 | gains ONLY the Goods provided by that upgrade card, not the entire Field it is
14 | upgrading. When the player Harvests a Field with one or more upgrades, then
15 | they gain Goods from this Field and all of its upgrades. Upgrade cards give 1
16 | at the end of the game just like any other Location
17 | FEATURE
18 | These cards can have various special abilities described on the cards. These card
19 | abilities may be triggered during any player’s turn, and in certain cases they can
20 | grant the player Goods after taking specific actions.
21 | INSTANT
22 | These cards’ ability is resolved immediately when they are played, and the card is
23 | discarded afterward.
24 | ACTION
25 | To use the Location’s ability a player has to exhaust the card by rotating it 90
26 | degrees to the right. They then must pay any cost described in the ability text.
27 | Once the card is exhausted, its ability is no longer available.
28 | IMPORTANT: Only Action Location cards are exhausted after use. Both Feature
29 | and Field location cards are not exhausted, even if a player gained something
30 | from them multiple times, unless stated otherwise.
31 | NOTE 1: Several Locations can also
32 | have a Building Bonus which is an
33 | additional one-time ability that
34 | activates when such Location is
35 | placed in a player’s Empire.
36 | NOTE 2: Some abilities in the
37 | game have a ‘/’ divider between
38 | presented choices. This should be
39 | treated as an ‘or.’ A player must
40 | choose one option when using that
41 | ability, for instance ‘Gain 1 / .’
42 | means that a player chooses either
43 | to gain 1 or 1
44 | NOTE 3: Some abilities allow players to Pillage
45 | or Conquer right away, before the Expedition
46 | phase, in such cases a Ship is still required to be
47 | on the Expedition board to use it, along with any
48 | appropriate Goods assigned to it. This ability allows
49 | the Ship to resolve the Pillage or Conquest outside
50 | of the Expedition phase and to gain the card/Goods
51 | in the current Action phase. The Ship is returned to
52 | the player’s supply, and any Raze token and/or Fish
53 | assigned to it is discarded as usual, even if it wasn’t
54 | used. A Ship that was returned in this manner may be
55 | used again during this round to Sail (described later
56 | in this section).
57 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/5 Symbolic Reasoning.txt:
--------------------------------------------------------------------------------
1 | Our final experimental evaluation considers symbolic rea-
2 | soning, which is simple for humans but potentially chal-
3 | lenging for language models. We show that chain-of-
4 | thought prompting not only enables language models to
5 | perform symbolic reasoning tasks that are challenging in
6 | the standard prompting setting, but also facilitates length
7 | generalization to inference-time inputs longer than those
8 | seen in the few-shot exemplars.
9 | Tasks. We use the following two toy tasks.
10 | • Last letter concatenation. This task asks the model
11 | to concatenate the last letters of words in a name (e.g.,
12 | “Amy Brown” → “yn”). It is a more challenging version
13 | of first letter concatenation, which language models can
14 | already perform without chain of thought.3 We generate
15 | full names by randomly concatenating names from the
16 | top one-thousand first and last names from name census
17 | data (https://namecensus.com/).
18 | • Coin flip. This task asks the model to answer whether a
19 | coin is still heads up after people either flip or don’t flip
20 | the coin (e.g., “A coin is heads up. Phoebe flips the coin.
21 | Osvaldo does not flip the coin. Is the coin still heads up?”
22 | → “no”).
23 | As the construction of these symbolic reasoning tasks is
24 | well-defined, for each task we consider an in-domain test
25 | set for which examples had the same number of steps as
26 | the training/few-shot exemplars, as well as an out-of-domain (OOD) test set, for which evaluation
27 | examples had more steps than those in the exemplars. For last letter concatenation, the model only
28 | sees exemplars of names with two words, and then performs last letter concatenation on names with 3
29 | and 4 words.4 We do the same for the number of potential flips in the coin flip task. Our experimental
30 | setup uses the same methods and models as in the prior two sections. We again manually compose
31 | chains of thought for the few-shot exemplars for each task, which are given in Figure 3.
32 | Results. The results of these in-domain and OOD evaluations are shown in Figure 8 for PaLM,
33 | with results for LaMDA shown in Appendix Table 5. With PaLM 540B, chain-of-thought prompting
34 | leads to almost 100% solve rates (note that standard prompting already solves coin flip with PaLM
35 | 540B, though not for LaMDA 137B). Note that these in-domain evaluations are “toy tasks” in the
36 | sense that perfect solution structures are already provided by the chains of thought in the few-shot
37 | exemplars; all the model has to do is repeat the same steps with the new symbols in the test-time
38 | example. And yet, small models still fail—the ability to perform abstract manipulations on unseen
39 | symbols for these three tasks only arises at the scale of 100B model parameters.
40 | As for the OOD evaluations, standard prompting fails for both tasks. With chain-of-thought prompting,
41 | language models achieve upward scaling curves (though performance is lower than in the in-domain
42 | setting). Hence, chain-of-thought prompting facilitates length generalization beyond seen chains of
43 | thought for language models of sufficient scale.
44 |
--------------------------------------------------------------------------------
/src/structured_qa/preprocessing.py:
--------------------------------------------------------------------------------
1 | import re
2 | from collections import defaultdict
3 | from pathlib import Path
4 |
5 | import pymupdf4llm
6 |
7 | from loguru import logger
8 |
9 |
10 | def split_markdown_by_headings(
11 | markdown_text, heading_patterns: list[str] | None = None
12 | ) -> dict[str, str]:
13 | """Splits a markdown document into sections based on specified heading patterns.
14 |
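15 |     Examples:
16 |         >>> sections = split_markdown_by_headings("# Introduction")
17 |         >>> list(sections.keys())
18 |         ['Introduction']
19 |         >>> sections = split_markdown_by_headings("Text before any heading")
20 |         >>> list(sections.keys())
21 |         ['INTRO']
22 |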
15 | Args:
16 | markdown_text (str): The markdown document as a single string.
17 |         heading_patterns (list[str], optional): A list of regex patterns representing heading markers
18 | in the markdown document.
19 | Defaults to None.
20 | If None, the default patterns are used.
21 |
22 | Returns:
23 | dict[str, str]: A dictionary where the keys are the section names and the values are the section contents.
24 | """
25 | if heading_patterns is None:
26 | heading_patterns = [
27 | r"^#\s+(.+)$",
28 | r"^##\s+(.+)$",
29 | r"^###\s+(.+)$",
30 | r"^####\s+(.+)$",
31 | r"^\*\*[\d\.]+\.\*\*\s*\*\*(.+)\*\*$",
32 | ]
33 |
34 | sections = defaultdict(str)
35 |
36 | heading_text = "INTRO"
37 | for line in markdown_text.splitlines():
38 | line = line.strip()
39 | if not line:
40 | continue
41 | for pattern in heading_patterns:
42 | match = re.match(pattern, line)
43 | if match:
44 | heading_text = match.group(1)[:100]
45 | break
46 | sections[heading_text] += f"{line}\n"
47 |
48 | return sections
49 |
50 |
51 | @logger.catch(reraise=True)
52 | def document_to_sections_dir(input_file: str, output_dir: str) -> list[str]:
53 | """
54 | Convert a document to a directory of sections.
55 |
56 | Uses [pymupdf4llm](https://pypi.org/project/pymupdf4llm/) to convert input_file to markdown.
57 | Then uses [`split_markdown_by_headings`][structured_qa.preprocessing.split_markdown_by_headings] to split the markdown into sections based on the headers.
58 |
59 | Args:
60 | input_file: Path to the input document.
61 | output_dir: Path to the output directory.
62 | Structure of the output directory:
63 |
64 | ```
65 | output_dir/
66 | section_1.txt
67 | section_2.txt
68 | ...
69 | ```
70 |
71 | Returns:
72 | List of section names.
73 | """
74 |
75 | logger.info(f"Converting {input_file}")
76 | md_text = pymupdf4llm.to_markdown(input_file)
77 |     Path("debug.md").write_text(md_text)  # keep a copy of the converted markdown for debugging/inspection
78 | logger.success("Converted")
79 |
80 | logger.info("Extracting sections")
81 | sections = split_markdown_by_headings(
82 | md_text,
83 | )
84 | logger.success(f"Found {len(sections)} sections")
85 | logger.info(f"Writing sections to {output_dir}")
86 | output_dir = Path(output_dir)
87 | output_dir.mkdir(exist_ok=True, parents=True)
88 |
89 | for section_name, section_content in sections.items():
90 | (output_dir / f"{section_name.replace('/', '_')}.txt").write_text(
91 | section_content
92 | )
93 | logger.success("Done")
94 |
95 |     return list(sections.keys())
96 |
--------------------------------------------------------------------------------
/src/structured_qa/cli.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import yaml
4 | from fire import Fire
5 | from loguru import logger
6 |
7 | from structured_qa.config import Config, ANSWER_PROMPT, FIND_PROMPT
8 | from structured_qa.model_loaders import load_llama_cpp_model
9 | from structured_qa.preprocessing import document_to_sections_dir
10 | from structured_qa.workflow import find_retrieve_answer
11 |
12 |
13 | @logger.catch(reraise=True)
14 | def structured_qa(
15 | question: str | None = None,
16 | input_file: str | None = None,
17 | output_dir: str | None = None,
18 | model: str
19 | | None = "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf",
20 | find_prompt: str = FIND_PROMPT,
21 | answer_prompt: str = ANSWER_PROMPT,
22 | from_config: str | None = None,
23 | ):
24 | """
25 | Structured Question Answering.
26 |
27 | Split the input document into sections and answer the question based on the sections.
28 |
29 | Args:
30 | input_file: Path to the input document.
31 | output_dir: Path to the output directory.
32 | Structure of the output directory:
33 |
34 | ```
35 | output_dir/
36 | section_1.txt
37 | section_2.txt
38 | ...
39 | ```
40 | model: Model identifier formatted as `owner/repo/file`.
41 | Must be hosted at the HuggingFace Hub in GGUF format.
42 | Defaults to bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf.
43 | question: The question to answer.
44 | find_prompt: The prompt for finding the section.
45 |
46 | See [`FIND_PROMPT`][structured_qa.config.FIND_PROMPT].
47 | answer_prompt: The prompt for answering the question.
48 |
49 | See [`ANSWER_PROMPT`][structured_qa.config.ANSWER_PROMPT].
50 | from_config: The path to the config file.
51 |
52 | If provided, all other arguments will be ignored.
53 | """
54 | if from_config:
55 | raw_config = yaml.safe_load(Path(from_config).read_text())
56 | Path(raw_config["output_dir"]).mkdir(exist_ok=True, parents=True)
57 | config = Config.model_validate(raw_config)
58 | else:
59 | Path(output_dir).mkdir(exist_ok=True, parents=True)
60 | config = Config(
61 | question=question,
62 | input_file=input_file,
63 | output_dir=output_dir,
64 | model=model,
65 | find_prompt=find_prompt,
66 | answer_prompt=answer_prompt,
67 | )
68 |
69 | logger.info("Loading and converting to sections")
70 | document_to_sections_dir(config.input_file, config.output_dir)
71 | logger.success("Done")
72 |
73 | logger.info("Loading Model")
74 | model = load_llama_cpp_model(config.model)
75 | logger.success("Done")
76 |
77 | logger.info("Answering")
78 | answer, sections_checked = find_retrieve_answer(
79 | model=model,
80 | sections_dir=config.output_dir,
81 | question=config.question,
82 | find_prompt=config.find_prompt,
83 | answer_prompt=config.answer_prompt,
84 | )
85 | logger.success("Done")
86 |
87 | logger.info("Sections checked:")
88 | logger.info(sections_checked)
89 | logger.info("Answer:")
90 | logger.info(answer)
91 |
92 |
93 | def main():
94 | Fire(structured_qa)
95 |
--------------------------------------------------------------------------------
/demo/app.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 |
4 | import pymupdf
5 | import streamlit as st
6 |
7 | from structured_qa.config import ANSWER_PROMPT, FIND_PROMPT
8 | from structured_qa.model_loaders import load_llama_cpp_model
9 | from structured_qa.preprocessing import document_to_sections_dir
10 | from structured_qa.workflow import find_retrieve_answer
11 |
12 |
13 | @st.cache_resource
14 | def load_model():
15 | return load_llama_cpp_model(
16 | "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
17 | )
18 |
19 |
20 | @st.cache_resource
21 | def convert_to_sections(uploaded_file, output_dir):
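22 |     # Open the uploaded file from an in-memory buffer and split it into per-section text files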
22 | return document_to_sections_dir(
23 | pymupdf.open("type", BytesIO(uploaded_file.read())),
24 | output_dir,
25 | )
26 |
27 |
28 | st.title("Structured QA")
29 |
30 | st.header("Uploading Data")
31 |
32 | uploaded_file = st.file_uploader(
33 | "Choose a file", type=["pdf", "html", "txt", "docx", "md"]
34 | )
35 |
36 | if uploaded_file is not None:
37 | st.divider()
38 | st.header("Loading and converting to sections")
39 | st.markdown("[Docs for this Step]()")
40 | st.divider()
41 |
42 | try:
43 | with st.spinner("Converting document to sections..."):
44 | section_names = convert_to_sections(
45 | uploaded_file, f"example_outputs/{uploaded_file.name}"
46 | )
47 | sections = [
48 | f.stem for f in Path(f"example_outputs/{uploaded_file.name}").iterdir()
49 | ]
50 |
51 | # Provide feedback about segmentation
52 | st.success(
53 | f"Successfully extracted {len(sections)} sections from the document."
54 | )
55 |
56 | # Check for potential segmentation issues
57 | if len(sections) == 1:
58 | st.warning(
59 | "⚠️ Only one section was found. This might indicate that the document structure wasn't properly detected."
60 | )
61 | elif len(sections) == 0:
62 | st.error(
63 | "❌ No sections were found in the document. The document might not have a clear structure or might be in an unsupported format."
64 | )
65 | elif "INTRO" in sections and len(sections) < 3:
66 | st.warning(
67 | "⚠️ Only found default sections. The document structure might not have been properly detected."
68 | )
69 |
70 | # Show sections
71 | st.text("Detected Sections:")
72 | st.json(sections)
73 |
74 | model = load_model()
75 | question = st.text_input("Enter a question:")
76 | if question:
77 | with st.spinner("Answering..."):
78 | answer, sections_checked = find_retrieve_answer(
79 | model=model,
80 | sections_dir=f"example_outputs/{uploaded_file.name}",
81 | question=question,
82 | find_prompt=FIND_PROMPT,
83 | answer_prompt=ANSWER_PROMPT,
84 | )
85 | st.text("Sections checked:")
86 | st.json(sections_checked)
87 | st.text("Answer:")
88 | st.text(answer)
89 | except Exception as e:
90 | st.error(f"❌ Error processing document: {str(e)}")
91 | st.info(
92 | "💡 Try uploading a different document or check if the file is corrupted."
93 | )
94 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/U.S. Regulation.txt:
--------------------------------------------------------------------------------
1 | This section examines AI-related regulations enacted
2 | by American regulatory agencies between 2016 and
3 | 2023. It provides an analysis of the total number of
4 | regulations, as well as their topics, scope, regulatory
5 | intent, and originating agencies. To compile this
6 | data, the AI Index team performed a keyword search
7 | for “artificial intelligence” on the Federal Register, a
8 | comprehensive repository of government documents
9 | from nearly all branches of the American government,
10 | encompassing more than 436 agencies.8
11 | Overview
12 | The number of AI-related regulations has risen
13 | significantly, both in the past year and over the last five
14 | years (Figure 7.4.1). In 2023, there were 25 AI-related
15 | regulations, a stark increase from just one in 2016. Last
16 | year alone, the total number of AI-related regulations
17 | grew by 56.3%.
18 | By Relevance
19 | The AI Index categorized AI-related regulations—
20 | those mentioning AI—into three levels of relevance:
21 | low, medium, and high.9 In 2023, the number of
22 | high and medium relevance AI-related regulations
23 | increased compared to 2022. For instance, a high
24 | relevance AI regulation was the Copyright Office
25 | and Library of Congress’ Copyright Registration
26 | Guidance: Works Containing Material Generated by
27 | Artificial Intelligence. This policy statement clarified
28 | registration practices for works incorporating AI-
29 | generated material. Meanwhile, a medium-relevance
30 | example is the Securities and Exchange Commission’s
31 | Cybersecurity Risk Management Strategy, Governance,
32 | and Incident Disclosure, which established
33 | standardized disclosure practices for public companies
34 | concerning cybersecurity risk management, strategy,
35 | governance, and incidents.
36 | Figure 7.4.2 categorizes AI-related regulations in the
37 | United States based on their relevance to AI. A growing
38 | proportion of these regulations is highly relevant to
39 | AI. Among the 25 AI-related regulations enacted in
40 | 2023, four were identified as being highly relevant, the
41 | greatest amount since tracking began in 2016.
42 | By Agency10
43 | Which agencies are the primary sources of AI
44 | regulations? In 2023, the Executive Office of the
45 | President and the Commerce Department led with
46 | five AI-related regulations each, followed by the
47 | Health and Human Services Department and the
48 | Industry and Security Bureau, with each issuing four
49 | (Figure 7.4.3). Furthermore, the number of agencies
50 | issuing AI regulations increased from 17 in 2022 to 21 in
51 | 2023, indicating a growing need for clarity and concern
52 | regarding AI among a broader array of American
53 | regulatory bodies.
54 | By Approach
55 | The AI Index categorized regulations based on their
56 | approach: whether they expanded or restricted AI
57 | capabilities.11 Over time, the trend in AI regulations
58 | in the United States has shifted significantly toward
59 | restriction (Figure 7.4.4). In 2023, there were 10
60 | restrictive AI regulations compared to just three that
61 | were expansive. Conversely in 2020, there were four
62 | regulations that were expansive and one that was
63 | restrictive.
64 | By Subject Matter
65 | In 2023, American AI regulations were categorized by
66 | primary subject matter. The most prevalent subject
67 | matter in AI-related regulation was foreign trade and
68 | international finance, with three instances. Three
69 | topics tied for second place, with two occurrences
70 | each: health; commerce; and science, technology, and
71 | communications (Figure 7.4.5).
72 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the team at mozilla.ai. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/SECTION I. INTRODUCTION.txt:
--------------------------------------------------------------------------------
1 | UNTIL VERY RECENTLY, AUTHORS WHO
2 | wanted their works to be widely available had little
3 | choice but to submit their works to publishers who
4 | took assignments of the authors’ copyrights and
5 | exercised them according to a proprietary “all rights
6 | reserved” model.1 The advent of global digital networks
7 | now provides authors who write to be read with excit-
8 | ing new options for communicating their ideas broadly.
9 | One of these options is open access.
10 | The basic idea of open access is that it makes
11 | copyrightable works available without all of the access
12 | barriers associated with the “all rights reserved”
13 | model. These can take the form of price barriers and
14 | permission barriers. 2 Open access typically comes in
15 | two forms. What has come to be known as gratis open
16 | access is the practice of making a work available online
17 | free of charge (also called public access). The term
18 | libre open access (also called full open access) refers to
19 | the practice of making a work available online free of
20 | charge and with some additional reuse rights, typically
21 | granted through a Creative Commons license. Gratis
22 | open access removes price barriers, whereas libre open
23 | access additionally removes at least some permission
24 | barriers, allowing users to copy, redistribute, and/or
25 | adapt a work. Open access contrasts with more tradi-
26 | tional models of restricted-access publishing in which
27 | copies of works are made directly available only to
28 | paying customers.
29 | Authors who are interested in increasing access to
30 | their works may want to understand whether elimi-
31 | nating cost and permissions barriers is a good option
32 | for them and, if so, how they might release their
33 | works under open access terms. Other authors may
34 | be required by their employer or funding agency to
35 | comply with an open access policy. Still other authors
36 | may be skeptical about whether open access is compat-
37 | ible with their publication goals—including rigorous
38 | peer review, prestige, or monetary compensation—and
39 | want to learn more.
40 | A note on terminology: Many open access proponents and some
41 | research funders3 do not consider a work truly openly accessible
43 | if it only meets gratis open access requirements. Indeed, only
44 | libre open access is compliant with most major international
45 | statements that define open access.4 For readability, we use the
46 | term open access in this guide to describe the practice of making
47 | a work available to readers free of charge on the Internet, regard-
48 | less of whether subsequent reuse is permitted. The distinction is
49 | important, however, and we try to make clear in our discussion
50 | below whether we are referring to removal of only price, or both
51 | price and permission barriers. Another way to think about open
52 | access is along a continuum that considers variables including
53 | both price and permissions barriers. If you would like to learn
54 | more about the spectrum of open access, we recommend the
55 | guide How Open Is It?.
56 | Authors Alliance is a nonprofit organization that
57 | promotes authorship for the public good by supporting
58 | authors who write to be read.6 Pursuant to this mission,
59 | Authors Alliance created this guide to help authors
60 | understand and evaluate opportunities to make their
61 | works openly accessible. In this way, Authors Alliance
62 | seeks to help authors further their interest in
63 | disseminating knowledge and products of the
64 | imagination broadly and to enhance the public’s
65 | access to and reuse of these works.
66 |
--------------------------------------------------------------------------------
/src/structured_qa/model_loaders.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import time
3 |
4 |
5 | from loguru import logger
6 |
7 |
8 | def gpu_available():
9 | try:
10 | subprocess.check_output("nvidia-smi")
11 | return True
12 | except Exception:
13 | return False
14 |
15 |
16 | class LlamaModel:
17 | def __init__(self, model):
18 | self.model = model
19 |
20 | def get_response(self, messages):
21 | result = self.model.create_chat_completion(messages)
22 | return result["choices"][0]["message"]["content"]
23 |
24 |
25 | def load_llama_cpp_model(model_id: str) -> LlamaModel:
26 | """
27 | Loads the given model_id using Llama.from_pretrained.
28 |
29 | Examples:
30 | >>> model = load_llama_cpp_model("allenai/OLMoE-1B-7B-0924-Instruct-GGUF/olmoe-1b-7b-0924-instruct-q8_0.gguf")
31 |
32 | Args:
33 | model_id (str): The model id to load.
34 | Format is expected to be `{org}/{repo}/{filename}`.
35 |
36 | Returns:
37 |         LlamaModel: The loaded model.
38 | """
39 | from llama_cpp import Llama
40 |
41 | org, repo, filename = model_id.split("/")
42 | model = Llama.from_pretrained(
43 | repo_id=f"{org}/{repo}",
44 | filename=filename,
45 | n_ctx=0, # 0 means that the model limit will be used, instead of the default (512) or other hardcoded value
46 | verbose=False,
47 | n_gpu_layers=-1 if gpu_available() else 0,
48 | )
49 | return LlamaModel(model=model)
50 |
51 |
52 | class UnslothModel:
53 | def __init__(self, model, tokenizer):
54 | self.model = model
55 | self.tokenizer = tokenizer
56 |
57 | def get_response(self, messages):
58 | inputs = self.tokenizer.apply_chat_template(
59 | messages,
60 | tokenize=True,
61 | add_generation_prompt=True,
62 | return_tensors="pt",
63 | ).to("cuda")
64 | outputs = self.model.generate(input_ids=inputs)
65 | response = self.tokenizer.batch_decode(
66 | outputs[:, len(inputs[0]) :], skip_special_tokens=True
67 | )[0]
68 | return response
69 |
70 |
71 | def load_unsloth_model(
72 | model_id: str, chat_template: str, load_in_4bit: bool = True, **kwargs
73 | ) -> UnslothModel:
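74 |     """
75 |     Loads the given model_id using unsloth's FastLanguageModel.
76 |
77 |     Args:
78 |         model_id (str): The model id to load.
79 |         chat_template (str): Name of the chat template to apply to the tokenizer (for example, "chatml").
80 |         load_in_4bit (bool, optional): Whether to load the model in 4-bit precision.
81 |             Defaults to True.
82 |         **kwargs: Additional arguments forwarded to FastLanguageModel.from_pretrained.
83 |
84 |     Returns:
85 |         UnslothModel: The loaded model.
86 |     """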
74 | from unsloth import FastLanguageModel
75 | from unsloth.chat_templates import get_chat_template
76 |
77 | model, tokenizer = FastLanguageModel.from_pretrained(
78 | model_name=model_id,
79 | load_in_4bit=load_in_4bit,
80 | **kwargs,
81 | )
82 | tokenizer = get_chat_template(
83 | tokenizer,
84 | chat_template=chat_template,
85 | )
86 | FastLanguageModel.for_inference(model)
87 | return UnslothModel(model=model, tokenizer=tokenizer)
88 |
89 |
90 | class GeminiModel:
91 | def __init__(self, model):
92 | self.model = model
93 | self.current_calls = 0
94 |
95 | def get_response(self, messages):
96 | logger.info(f"Current calls: {self.current_calls}")
97 | stacked_message = "\n".join(message["content"] for message in messages)
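98 |         # After 9 consecutive calls, pause for a minute before continuing (simple client-side rate limiting)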
98 | if self.current_calls >= 9:
99 | logger.info("Waiting for 60 seconds")
100 | time.sleep(60)
101 | self.current_calls = 0
102 | response = self.model.generate_content(stacked_message)
103 | self.current_calls += 1
104 | return response.text
105 |
106 |
107 | def load_gemini_model(model_id: str, system_prompt: str, **kwargs) -> GeminiModel:
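108 |     """
109 |     Loads the given model_id using google.generativeai.
110 |
111 |     Args:
112 |         model_id (str): The Gemini model name to load (for example, "gemini-1.5-flash").
113 |         system_prompt (str): The system instruction passed to the model.
114 |         **kwargs: Additional arguments forwarded to genai.GenerativeModel.
115 |
116 |     Returns:
117 |         GeminiModel: The loaded model.
118 |     """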
108 | import google.generativeai as genai
109 |
110 | model = genai.GenerativeModel(
111 | model_name=model_id,
112 | system_instruction=system_prompt,
113 | **kwargs,
114 | )
115 | return GeminiModel(model=model)
116 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/1 INTRODUCTION.txt:
--------------------------------------------------------------------------------
1 | Many applications in natural language processing rely on adapt-
2 | ing one large-scale, pre-trained language model to multiple down-
3 | stream applications. Such adaptation is usually done via fine-tuning,
4 | which updates all the parameters of the pre-trained model. The ma-
5 | jor downside of fine-tuning is that the new model contains as many
6 | parameters as in the original model. As larger models are trained
7 | every few months, this changes from a mere “inconvenience” for
8 | GPT-2 (Radford et al., b) or RoBERTa large (Liu et al., 2019) to a
9 | critical deployment challenge for GPT-3 (Brown et al., 2020) with
10 | 175 billion trainable parameters.1
11 | Many sought to mitigate this by adapting only some parameters or
12 | learning external modules for new tasks. This way, we only need
13 | to store and load a small number of task-specific parameters in ad-
14 | dition to the pre-trained model for each task, greatly boosting the
15 | operational efficiency when deployed. However, existing techniques
16 | often introduce inference latency (Houlsby et al., 2019; Rebuffi et al., 2017) by extending model
17 | depth or reduce the model’s usable sequence length (Li & Liang, 2021; Lester et al., 2021; Ham-
18 | bardzumyan et al., 2020; Liu et al., 2021) (Section 3). More importantly, these methods often fail to
19 | match the fine-tuning baselines, posing a trade-off between efficiency and model quality.
20 | We take inspiration from Li et al. (2018a); Aghajanyan et al. (2020) which show that the learned
21 | over-parametrized models in fact reside on a low intrinsic dimension. We hypothesize that the
22 | change in weights during model adaptation also has a low “intrinsic rank”, leading to our proposed
23 | Low-Rank Adaptation (LoRA) approach. LoRA allows us to train some dense layers in a neural
24 | network indirectly by optimizing rank decomposition matrices of the dense layers’ change during
25 | adaptation instead, while keeping the pre-trained weights frozen, as shown in Figure 1. Using GPT-3
26 | 175B as an example, we show that a very low rank (i.e., r in Figure 1 can be one or two) suffices even
27 | when the full rank (i.e., d) is as high as 12,288, making LoRA both storage- and compute-efficient.
28 | LoRA possesses several key advantages.
29 | • A pre-trained model can be shared and used to build many small LoRA modules for dif-
30 | ferent tasks. We can freeze the shared model and efficiently switch tasks by replacing the
31 | matrices A and B in Figure 1, reducing the storage requirement and task-switching over-
32 | head significantly.
33 | • LoRA makes training more efficient and lowers the hardware barrier to entry by up to 3
34 | times when using adaptive optimizers since we do not need to calculate the gradients or
35 | maintain the optimizer states for most parameters. Instead, we only optimize the injected,
36 | much smaller low-rank matrices.
37 | • Our simple linear design allows us to merge the trainable matrices with the frozen weights
38 | when deployed, introducing no inference latency compared to a fully fine-tuned model, by
39 | construction.
40 | • LoRA is orthogonal to many prior methods and can be combined with many of them, such
41 | as prefix-tuning. We provide an example in Appendix E.
42 | Terminologies and Conventions We make frequent references to the Transformer architecture
43 | and use the conventional terminologies for its dimensions. We call the input and output di-
44 | mension size of a Transformer layer dmodel. We use Wq, Wk, Wv, and Wo to refer to the
45 | query/key/value/output projection matrices in the self-attention module. W or W0 refers to a pre-
46 | trained weight matrix and ∆W its accumulated gradient update during adaptation. We use r to
47 | denote the rank of a LoRA module. We follow the conventions set out by (Vaswani et al., 2017;
48 | Brown et al., 2020) and use Adam (Loshchilov & Hutter, 2019; Kingma & Ba, 2017) for model
49 | optimization and use a Transformer MLP feedforward dimension dffn = 4 × dmodel.
50 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 |
163 | .idea/
164 | .vscode/
165 |
166 | example_outputs
167 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
29 |
30 | # Structured-QA: a Blueprint by Mozilla.ai for answering questions about structured documents.
31 |
32 | This Blueprint demonstrates how to use open-source models and a simple LLM workflow to answer questions based on structured documents.
33 |
34 | It is designed to showcase a simpler alternative to more complex and/or resource-demanding approaches, such as RAG systems that rely on vector databases and/or long-context models with large token windows.
35 |
36 |
37 |
38 |
39 | ## Quick-start
40 |
41 | Get started with structured-qa using one of the options below:
42 |
43 | | Google Colab | HuggingFace Spaces | GitHub Codespaces |
44 | | -------------| ------------------- | ----------------- |
45 | | [](https://colab.research.google.com/github/mozilla-ai/structured-qa/blob/main/demo/notebook.ipynb) | [](https://huggingface.co/spaces/mozilla-ai/structured-qa) | [](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=904169776&skip_quickstart=true&machine=standardLinux32gb) |
46 |
47 | You can also install and use the blueprint locally:
48 |
49 |
50 | ### Command Line Interface
51 |
52 | ```bash
53 | pip install structured-qa
54 | ```
55 |
56 | ```bash
57 | structured-qa \
58 | --question "What optimizer was used to train the model?" \
59 | --input_file "example_data/1706.03762v7.pdf" \
60 | --output_dir "example_outputs/1706.03762v7.pdf"
61 | ```
62 |
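63 | You can also provide the arguments through a YAML config file and pass it with `--from_config`. Below is a minimal sketch of such a file (field names mirror the CLI arguments; the prompt fields are assumed to fall back to the package defaults when omitted):
64 |
65 | ```yaml
66 | question: "What optimizer was used to train the model?"
67 | input_file: "example_data/1706.03762v7.pdf"
68 | output_dir: "example_outputs/1706.03762v7.pdf"
69 | model: "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
70 | ```
71 |
72 | ```bash
73 | structured-qa --from_config "config.yaml"
74 | ```
75 |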
63 | ### Graphical Interface App
64 |
65 | ```bash
66 | git clone https://github.com/mozilla-ai/structured-qa.git
67 | cd structured-qa
68 | pip install -e .
69 | ```
70 |
71 | ```bash
72 | python -m streamlit run demo/app.py
73 | ```
74 |
75 |
76 | ## License
77 |
78 | This project is licensed under the Apache 2.0 License. See the [LICENSE](LICENSE) file for details.
79 |
80 | ## Contributing
81 |
82 | Contributions are welcome! To get started, you can check out the [CONTRIBUTING.md](CONTRIBUTING.md) file.
83 |
--------------------------------------------------------------------------------
/docs/step-by-step-guide.md:
--------------------------------------------------------------------------------
1 | # **Step-by-Step Guide: How the Structured-QA Blueprint Works**
2 |
3 | ## **Overview**
4 |
5 | This system has the following core stages:
6 |
7 |
8 | 📑 **1. Document Pre-Processing**
9 | Prepare the input document by extracting the different sections that compose the structure of the document.
10 | Split the sections and save them to separate files.
11 |
12 | 🔎 **2. Find Relevant and Retrieve Section**
13 | Given a list of sections and the input question, use the LLM to identify the section that looks most relevant. Load the individual section to be passed to the next step.
14 |
15 | 📗 **3. Answer Question**
16 | Use the LLM to answer the question based on the information available in the retrieved section.
17 |
18 |
19 | If the LLM can't find an answer in the retrieved section, stages 2 and 3 run in a loop until an answer is found or the maximum number of sections has been checked.
20 |
21 | ---
22 |
23 | ## **Document Pre-Processing**
24 |
25 | The process begins with preparing the input document for AI processing.
26 | The document is first converted to markdown and then split into sections based on the markdown headings.
27 |
28 | **Markdown Conversion**
29 |
30 | - Uses [pymupdf4llm](https://pypi.org/project/pymupdf4llm/)
31 |
32 | **Section Splitting**
33 |
34 | - Uses [split_markdown_by_headings](api.md/#structured_qa.preprocessing.split_markdown_by_headings)
35 |
36 | - Each section is saved to a separate file.
37 |
38 | ### 🔍 **API Example**
39 |
40 | ```py
41 | from structured_qa.preprocessing import document_to_sections_dir
42 |
43 | section_names = document_to_sections_dir(
44 | "example_data/1706.03762v7.pdf", "example_outputs/1706.03762v7"
45 | )
46 | print(section_names)
47 | """
48 | ['attention is all you need', '1 introduction', '2 background', '3 model architecture', '4 why self-attention', '5 training', '6 results', '7 conclusion', 'references', 'attention visualizations']
49 | """
50 | ```
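51 |
52 | By default, sections are detected from standard markdown headings (`#` through `####`, plus bold numbered headings). If your document uses a different structure, you can also call [`split_markdown_by_headings`](api.md/#structured_qa.preprocessing.split_markdown_by_headings) directly with your own regex patterns. A minimal sketch, using a single custom pattern so that only top-level headings start a new section:
53 |
54 | ```py
55 | from structured_qa.preprocessing import split_markdown_by_headings
56 |
57 | markdown_text = """
58 | # 1 Introduction
59 | Some introductory text.
60 | ## 1.1 Background
61 | More text.
62 | """
63 |
64 | # Only treat top-level '#' headings as section boundaries
65 | sections = split_markdown_by_headings(markdown_text, heading_patterns=[r"^#\s+(.+)$"])
66 | print(list(sections.keys()))
67 | """
68 | ['1 Introduction']
69 | """
70 | ```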
51 |
52 | ## **Find, Retrieve, Answer**
53 |
54 | These steps run in a loop until an answer is found or the maximum number of iterations is reached.
55 | An input `model` previously loaded with [`load_llama_cpp_model`](api.md/#structured_qa.model_loaders.load_llama_cpp_model) must
56 | be provided.
57 |
58 | The loop is defined in [`find_retrieve_answer`](api.md/#structured_qa.workflow.find_retrieve_answer)
59 |
60 | **Find**
61 |
62 | - Using the `section_names` from the pre-processing, calls the `model` with the [`FIND_PROMPT`](api.md/#structured_qa.config.FIND_PROMPT)
63 |
64 | **Retrieve**
65 |
66 | - Loads the `section` file picked by the `model`.
67 |
68 | **Answer**
69 |
70 | - Calls the `model` with the [`ANSWER_PROMPT`](api.md/#structured_qa.config.ANSWER_PROMPT)
71 |
72 | ### 🔍 **API Example**
73 |
74 | ```py
75 | from structured_qa.config import ANSWER_PROMPT, FIND_PROMPT
76 | from structured_qa.model_loaders import load_llama_cpp_model
77 | from structured_qa.workflow import find_retrieve_answer
78 |
79 | # Load the model
80 | model = load_llama_cpp_model(
81 | "bartowski/Qwen2.5-3B-Instruct-GGUF/Qwen2.5-3B-Instruct-f16.gguf"
82 | )
83 |
84 | answer, sections_checked = find_retrieve_answer(
85 |     question="What optimizer was used for training?",
86 | model=model,
87 | sections_dir="example_outputs/1706.03762v7",
88 | find_prompt=FIND_PROMPT,
89 | answer_prompt=ANSWER_PROMPT
90 | )
91 | print(answer)
92 | """
93 | The optimizer used during training was Adam, with parameters β1 = 0.9, β2 = 0.98, and ϵ = 10^−9.
94 | """
95 | print(sections_checked)
96 | """
97 | ['5 training']
98 | """
99 | ```
100 |
101 | ## 🎨 **Customizing the Blueprint**
102 |
103 | To better understand how you can tailor this Blueprint to suit your specific needs, please visit the **[Customization Guide](customization.md)**.
104 |
105 | ## 🤝 **Contributing to the Blueprint**
106 |
107 | Want to help improve or extend this Blueprint? Check out the **[Future Features & Contributions Guide](future-features-contributions.md)** to see how you can contribute your ideas, code, or feedback to make this Blueprint even better!
108 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/Accountability and responsibility.txt:
--------------------------------------------------------------------------------
1 | Accountability and responsibility
2 | Ensuring accountability for generative AI means that individuals and organisations can be
3 | held accountable for the AI systems they develop, deploy, or use, and that human oversight
4 | is maintained. To establish accountable practices across the AI lifecycle, you should
5 | consider three key elements.
6 | • Answerability: you should establish a chain of human responsibility across the generative
7 | AI project lifecycle, including responsibility throughout the supply chain. In cases of
8 | harm or errors caused by generative AI, recourse and feedback mechanisms need to be
9 | established for affected individuals. Identifying the specific actors involved in generative AI
10 | systems is vital to answerability. This includes model developers, application developers,
11 | policymakers, regulators, system operators and end-users. The roles and responsibilities
12 | of each must be clearly defined and aligned with legal and ethical standards.
13 | • Auditability: you should demonstrate the responsibility and trustworthiness of
14 | the development and deployment practices by upholding robust reporting and
15 | documentation protocols, and retaining traceability throughout the AI lifecycle. This refers
16 | to the process by which all stages of the generative AI innovation lifecycle from data
17 | collection and base model training to implementation, fine-tuning, system deployment,
18 | updating, and retirement are documented in a way that is accessible to relevant
19 | stakeholders and easily understood.
20 | • Liability: you should make sure that all parties involved in the generative AI project
21 | lifecycle, from vendors and technical teams to system users, are acting lawfully and
22 | understand their respective legal obligations.
23 | As an end-user, being accountable means taking responsibility for a system’s outputs and
24 | generated content and its potential consequences. This includes checking that these are
25 | factual, truthful, non-discriminatory, non-harmful, and do not violate existing legal provisions,
26 | guidelines, policies or the providers’ terms of use. It entails putting the necessary oversight
27 | and human-in-the-loop processes in place to validate output in situations with high impact
28 | or risk. Where these risks are too high, you must consider if generative AI should be used.
29 | Ultimately, responsibility for any output or decision made or supported by an AI system
30 | always rests with the public organisation. Where generative AI is bought commercially,
31 | ensure that vendors understand their responsibilities and liabilities, put the required risk
32 | mitigations in place and share all relevant information. Refer to the Buying generative AI
33 | section for further guidance.
34 | Practical recommendations
35 | Follow existing legal provisions, guidelines and policies as well as the provider’s
36 | terms of use when developing, deploying or using generative AI.
37 | As an end-user, assume responsibility for output produced by generative AI tools
38 | when used to support everyday tasks, such as drafting emails and reports.
39 | Clearly define responsibilities, accountability, and liability across all actors involved
40 | in the AI lifecycle. Where the generative AI is bought commercially, define detailed
41 | responsibilities and liability contractually.
42 | Nominate a Senior Responsible Owner who will be accountable for the use of
43 | generative AI in a specific project.
44 | Where generative AI is used in situations of high impact or risk, establish a
45 | human-in-the-loop to oversee and validate outputs.
46 | Adopt a risk-based approach to the use of AI-generated content and put
47 | strategies in place to minimise the risk of inaccurate or harmful outputs. Where
48 | the potential risks and harmful impacts are too high, consider whether human-in-
49 | the-loop approaches offer sufficient mitigation or if generative AI should be used.
50 | Provide routes for appeal and actionable redress and put feedback
51 | channels into place.
52 | Use assurance techniques to evaluate the performance of generative AI systems.
53 | The CDEI AI assurance guide provides a useful starting point, and the CDEI
54 | portfolio of AI assurance techniques offers real-world examples.
55 |
--------------------------------------------------------------------------------
/src/structured_qa/workflow.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | from loguru import logger
4 | from rapidfuzz import process
5 |
6 | from structured_qa.model_loaders import LlamaModel
7 |
8 |
9 | def get_matching_section(response, section_names):
10 | """
11 | Use string similarity to find the most similar section_name.
12 | """
13 | return process.extractOne(response, section_names)[0]
14 |
15 |
16 | def find_retrieve_answer(
17 | question: str,
18 | model: LlamaModel,
19 | sections_dir: str,
20 | find_prompt: str,
21 | answer_prompt: str,
22 | max_sections_to_check: int | None = None,
23 | ) -> tuple[str, list[str]] | tuple[None, list[str]]:
24 | """
25 | Workflow to find the relevant section, retrieve the information, and answer the question.
26 |
27 | Args:
28 | question (str): The question to answer.
29 | model (LlamaModel): The model to use for generating completions.
30 | sections_dir (str): The directory containing the sections.
31 | See [`document_to_sections_dir`][structured_qa.preprocessing.document_to_sections_dir].
32 | Structure of the sections directory:
33 |
34 | ```
35 | sections_dir/
36 | section_1.txt
37 | section_2.txt
38 | ...
39 | ```
40 | find_prompt (str): The prompt for finding the section.
41 |
42 | See [`FIND_PROMPT`][structured_qa.config.FIND_PROMPT].
43 | answer_prompt (str): The prompt for answering the question.
44 |
45 | See [`ANSWER_PROMPT`][structured_qa.config.ANSWER_PROMPT].
46 | max_sections_to_check (int, optional): The maximum number of sections to check before giving up.
47 | Defaults to None.
48 | If None, it will check up to 20 sections (or all available sections, if fewer) before giving up.
49 |
50 | Returns:
51 | tuple[str, list[str]] | tuple[None, list[str]]:
52 |
53 | If the answer is found, the tuple contains the answer and the sections checked.
54 | If the answer is not found, the tuple contains None and the sections checked.
55 | """
56 | sections_dir = Path(sections_dir)
57 | sections_names = [section.stem for section in sections_dir.glob("*.txt")]
58 | current_info = None
59 | current_section = None
60 |
61 | if max_sections_to_check is None:
62 | max_sections_to_check = min(20, len(sections_names))
63 |
64 | sections_checked = []
65 | while len(sections_checked) <= max_sections_to_check:
66 | logger.debug(f"Current information available: {current_info}")
67 | if not current_info:
68 | logger.debug("Finding section")
69 | finding_section = True
70 | question_part, *options = question.split("?")
71 | messages = [
72 | {
73 | "role": "system",
74 | "content": find_prompt.format(SECTIONS="\n".join(sections_names)),
75 | },
76 | {"role": "user", "content": question_part},
77 | ]
78 | else:
79 | logger.debug("Answering question")
80 | finding_section = False
81 | messages = [
82 | {
83 | "role": "system",
84 | "content": answer_prompt.format(CURRENT_INFO=current_info),
85 | },
86 | {"role": "user", "content": question},
87 | ]
88 |
89 | try:
90 | response = model.get_response(messages)
91 | except Exception as e:
92 | logger.error(f"Failed to generate completion: {e}")
93 | return "Generation Error", sections_checked
94 |
95 | if finding_section:
96 | response = response.strip()
97 | if not sections_names:
98 | return "NOT FOUND", sections_checked
99 | section_name = get_matching_section(response, sections_names)
100 | logger.debug(f"Retrieving section: {section_name}")
101 | section_content = (sections_dir / f"{section_name}.txt").read_text()
102 | current_section = section_name
103 | current_info = section_content
104 | sections_checked.append(section_name)
105 |
106 | else:
107 | if "MORE INFO" in response.upper():
108 | current_info = None
109 | sections_names.remove(current_section)
110 | continue
111 | else:
112 | return response, sections_checked
113 |
114 | return "NOT FOUND", sections_checked
115 |
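116 | # --- Usage sketch (illustrative only) ---------------------------------------
117 | # The calls below mirror the demo notebook; the document path, output directory,
118 | # and model id are placeholders, and the default prompts live in
119 | # `structured_qa.config`.
120 | #
121 | # from structured_qa.config import ANSWER_PROMPT, FIND_PROMPT
122 | # from structured_qa.model_loaders import load_llama_cpp_model
123 | # from structured_qa.preprocessing import document_to_sections_dir
124 | #
125 | # document_to_sections_dir("path/to/document.pdf", "output/sections")
126 | # model = load_llama_cpp_model(
127 | #     "bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf"
128 | # )
129 | # answer, sections_checked = find_retrieve_answer(
130 | #     question="What optimizer was used to train the model?",
131 | #     model=model,
132 | #     sections_dir="output/sections",
133 | #     find_prompt=FIND_PROMPT,
134 | #     answer_prompt=ANSWER_PROMPT,
135 | # )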
--------------------------------------------------------------------------------
/benchmark/perfect_context/5.2. Thread Hierarchy.txt:
--------------------------------------------------------------------------------
1 | For convenience, threadIdx is a 3-component vector, so that threads can be identified using a one-
2 | dimensional, two-dimensional, or three-dimensional thread index, forming a one-dimensional, two-
3 | dimensional, or three-dimensional block of threads, called a thread block. This provides a natural way
4 | to invoke computation across the elements in a domain such as a vector, matrix, or volume.
5 | The index of a thread and its thread ID relate to each other in a straightforward way: For a one-
6 | dimensional block, they are the same; for a two-dimensional block of size (Dx, Dy), the thread ID of
7 | a thread of index (x, y) is (x + y Dx); for a three-dimensional block of size (Dx, Dy, Dz), the thread ID of a
8 | thread of index (x, y, z) is (x + y Dx + z Dx Dy).
9 | As an example, the following code adds two matrices A and B of size NxN and stores the result into
10 | matrix C.
11 | // Kernel definition
12 | __global__ void MatAdd(float A[N][N], float B[N][N],
13 | float C[N][N])
14 | {
15 | int i = threadIdx.x;
16 | int j = threadIdx.y;
17 | C[i][j] = A[i][j] + B[i][j];
18 | }
19 | int main()
20 | {
21 | ...
22 | // Kernel invocation with one block of N * N * 1 threads
23 | int numBlocks = 1;
24 | dim3 threadsPerBlock(N, N);
25 | MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
26 | ...
27 | }
28 | There is a limit to the number of threads per block, since all threads of a block are expected to reside
29 | on the same streaming multiprocessor core and must share the limited memory resources of that
30 | core. On current GPUs, a thread block may contain up to 1024 threads.
31 | However, a kernel can be executed by multiple equally-shaped thread blocks, so that the total number
32 | of threads is equal to the number of threads per block times the number of blocks.
33 | Blocks are organized into a one-dimensional, two-dimensional, or three-dimensional grid of thread
34 | blocks as illustrated by Figure 4. The number of thread blocks in a grid is usually dictated by the size
35 | of the data being processed, which typically exceeds the number of processors in the system.
36 | The number of threads per block and the number of blocks per grid specified in the <<<...>>> syntax
37 | can be of type int or dim3. Two-dimensional blocks or grids can be specified as in the example above.
38 | Each block within the grid can be identified by a one-dimensional, two-dimensional, or three-
39 | dimensional unique index accessible within the kernel through the built-in blockIdx variable. The
40 | dimension of the thread block is accessible within the kernel through the built-in blockDim variable.
41 | Extending the previous MatAdd() example to handle multiple blocks, the code becomes as follows.
42 | // Kernel definition
43 | __global__ void MatAdd(float A[N][N], float B[N][N],
44 | float C[N][N])
45 | {
46 | int i = blockIdx.x * blockDim.x + threadIdx.x;
47 | int j = blockIdx.y * blockDim.y + threadIdx.y;
48 | if (i < N && j < N)
49 | C[i][j] = A[i][j] + B[i][j];
50 | }
51 | int main()
52 | {
53 | ...
54 | // Kernel invocation
55 | dim3 threadsPerBlock(16, 16);
56 | dim3 numBlocks(N / threadsPerBlock.x, N / threadsPerBlock.y);
57 | MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
58 | ...
59 | }
60 | A thread block size of 16x16 (256 threads), although arbitrary in this case, is a common choice. The
61 | grid is created with enough blocks to have one thread per matrix element as before. For simplicity,
62 | this example assumes that the number of threads per grid in each dimension is evenly divisible by the
63 | number of threads per block in that dimension, although that need not be the case.
64 | Thread blocks are required to execute independently. It must be possible to execute blocks in any
65 | order, in parallel or in series. This independence requirement allows thread blocks to be scheduled in
66 | any order and across any number of cores as illustrated by Figure 3, enabling programmers to write
67 | code that scales with the number of cores.
68 | Threads within a block can cooperate by sharing data through some shared memory and by synchroniz-
69 | ing their execution to coordinate memory accesses. More precisely, one can specify synchronization
70 | points in the kernel by calling the __syncthreads() intrinsic function; __syncthreads() acts as a
71 | barrier at which all threads in the block must wait before any is allowed to proceed. Shared Memory
72 | gives an example of using shared memory. In addition to __syncthreads(), the Cooperative Groups
73 | API provides a rich set of thread-synchronization primitives.
74 | For efficient cooperation, shared memory is expected to be a low-latency memory near each processor
75 | core (much like an L1 cache) and __syncthreads() is expected to be lightweight.
76 |
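77 | As a quick illustration of the thread-ID arithmetic described at the start of this
78 | section (a minimal Python sketch for illustration; it is not part of the guide), the
79 | flat thread ID of index (x, y, z) in a block of size (Dx, Dy, Dz) can be computed as:
80 |
81 | def thread_id(x, y, z, Dx, Dy):
82 |     # 1D block: x; 2D block: x + y*Dx; 3D block: x + y*Dx + z*Dx*Dy
83 |     # (Dz does not appear in the formula; it only bounds the range of z.)
84 |     return x + y * Dx + z * Dx * Dy
85 |
86 | # Example: in a 16x16 block, thread (3, 2) has ID 3 + 2*16 = 35.
87 | assert thread_id(3, 2, 0, 16, 16) == 35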
--------------------------------------------------------------------------------
/benchmark/perfect_context/HOW DO YOU CHOOSE AN OPEN ACCESS PUBLISHER?.txt:
--------------------------------------------------------------------------------
1 | In order to select an open access publisher, you will
2 | first need to know the range of open access publish-
3 | ers available to you. For authors of articles, a good
4 | place to start is the Directory of Open Access Journals
5 | (“DOAJ”), an online directory that screens and indexes
6 | over 10,000 peer-reviewed open access journals.41 You
7 | may also consult the Open Access Scholarly Publishers
8 | Association, whose membership includes both article
9 | and monograph open access publishers.42 From there,
10 | you should consider a number of factors to determine
11 | which publisher, if any, best suits your needs,
12 | including:
13 | The Impact (Factor) of the Journal
14 | Regardless of whether a journal is restricted or open
15 | access, authors may find it important to consider the
16 | journal’s “impact factor.” Generally, the impact factor
17 | measures the frequency with which the average article
18 | in a journal is cited over a particular period of time.
19 | Many academics, including tenure committees, use this
20 | metric as a proxy for the prestige, quality of scholar-
21 | ship, and competitiveness of a given journal.
22 | While impact factor comparisons currently favor
23 | well-established, conventional publishers, alternative
24 | metrics (sometimes referred to as “altmetrics”) have
25 | recently emerged as a way to incorporate new data
26 | sources—such as the number of downloads and page
27 | views, media coverage, or even social media dissemina-
28 | tion—to measure the impact of a journal or of a work in
29 | light of recent technological developments.43 Authors
30 | can use these alternative metrics to complement
31 | citation-based metrics as a signal of the wide and
32 | diverse impact of their works.44
33 | Authors may also be able to find an open access journal
34 | associated with a prominent conventional publisher in
35 | their field, allowing them to enjoy the benefits of both a
36 | well-respected brand and open access.
37 | Although the development of alternative metrics
38 | is promising, some authors may not want to put
39 | important employment decisions at risk if their insti-
40 | tutions heavily rely on journals’ impact factors. Authors
41 | with a particular concern about impact factors may
42 | alternatively consider publishing with a high-
43 | impact-factor, conventional journal and negotiating to
44 | retain the right to self-archive, as discussed in
45 | Chapter 6. Some conventional publishers also offer
46 | “hybrid” options whereby articles published in a
47 | subscription journal are also made openly accessible,
48 | typically in exchange for a fee.
49 | The Journal’s Reputation for
50 | Responsible Business Practices
51 | Some journals are better than others at editing man-
52 | uscripts, getting issues to press in a timely manner,
53 | and other aspects of providing service to authors and
54 | readers. Before you commit your work to a journal, you
55 | should be familiar with its recent publishing record. If
56 |
57 | you know other authors who have published there, you
58 | might ask them about their experience.
59 | The Open Access Licensing Terms Available
60 | Through the Publisher
61 | A given open access publisher may have only one type
62 | of license that it automatically applies to all the works
63 | it publishes. Thus, authors wishing to fine-tune the
64 | “openness” of their works (see Chapter 4) should
65 | research the licensing policies of the open access
66 | journals in which they are interested. The Directory of
67 | Open Access Journals (“DOAJ”) allows authors to search
68 | by licensing terms for easy comparison.
69 | The Technical Openness of the Publication
70 | Authors interested in making sure that their works are
71 | more technically open should consider the technical
72 | capabilities of different publishers. (See Chapter 4.)
73 | Whether The Publisher Charges Author-Side Fees
74 | and its Policy Regarding Exemptions
75 | As discussed above, some, but not all, open access pub-
76 | lishers charge publication fees. The DOAJ allows users
77 | to filter search results for journals that do not charge
78 | publication fees.
79 | Even if your desired journal charges a fee, you may
80 | not need to pay out-of-pocket. Consider the following
81 | alternate strategies to cover the fee:
82 | • Ask your institution or funding entity if it has
83 | earmarked funds available to pay for open access
84 | publication fees.46
85 | • Apply for grant funding specifically designated
86 | for open access publishing (for example, from a
87 | government agency, private foundation, or insti-
88 | tutional library).47
89 | • Partake in the journal’s fee assistance program48
90 | or institutional discount.49
91 | • Apply for a publication fee waiver if the journal
92 | offers one.
93 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/2.1 Pre-training Data.txt:
--------------------------------------------------------------------------------
1 | 2 Approach
2 | Our training approach is similar to the methods
3 | described in previous work (Brown et al., 2020;
4 | Chowdhery et al., 2022), and is inspired by the
5 | Chinchilla scaling laws (Hoffmann et al., 2022).
6 | We train large transformers on a large quantity of
7 | textual data using a standard optimizer.
8 | 2.1 Pre-training Data
9 | Our training dataset is a mixture of several sources,
10 | reported in Table 1, that cover a diverse set of do-
11 | mains. For the most part, we reuse data sources
12 | that have been leveraged to train other LLMs, with
13 | the restriction of only using data that is publicly
14 | available, and compatible with open sourcing. This
15 | leads to the following mixture of data and the per-
16 | centage they represent in the training set:
17 | English CommonCrawl [67%]. We preprocess
18 | five CommonCrawl dumps, ranging from 2017
19 | to 2020, with the CCNet pipeline (Wenzek et al.,
20 | 2020). This process deduplicates the data at the
21 | line level, performs language identification with
22 | a fastText linear classifier to remove non-English
23 | pages and filters low quality content with an n-
24 | gram language model. In addition, we trained a
25 | linear model to classify pages used as references
26 | in Wikipedia vs. randomly sampled pages, and
27 | discarded pages not classified as references.
28 | C4 [15%]. During exploratory experiments, we
29 | observed that using diverse pre-processed Com-
30 | monCrawl datasets improves performance. We thus
31 | included the publicly available C4 dataset (Raffel
32 | et al., 2020) in our data. The preprocessing of C4
33 | also contains deduplication and language identifi-
34 | cation steps: the main difference with CCNet is
35 | the quality filtering, which mostly relies on heuris-
36 | tics such as presence of punctuation marks or the
37 | number of words and sentences in a webpage.
38 | Github [4.5%]. We use the public GitHub
39 | dataset available on Google BigQuery. We only
40 | kept projects that are distributed under the Apache,
41 | BSD and MIT licenses. Additionally, we filtered
42 | low quality files with heuristics based on the line
43 | length or proportion of alphanumeric characters,
44 | and removed boilerplate, such as headers, with reg-
45 | ular expressions. Finally, we deduplicate the result-
46 | ing dataset at the file level, with exact matches.
47 | Wikipedia [4.5%]. We add Wikipedia dumps
48 | from the June-August 2022 period, covering 20
49 | languages, which use either the Latin or Cyrillic
50 | scripts: bg, ca, cs, da, de, en, es, fr, hr, hu, it,
51 | nl, pl, pt, ro, ru, sl, sr, sv, uk. We process the
52 | data to remove hyperlinks, comments and other
53 | formatting boilerplate.
54 | Dataset        Sampling prop.  Epochs  Disk size
55 | CommonCrawl    67.0%           1.10    3.3 TB
56 | C4             15.0%           1.06    783 GB
57 | Github         4.5%            0.64    328 GB
58 | Wikipedia      4.5%            2.45    83 GB
59 | Books          4.5%            2.23    85 GB
60 | ArXiv          2.5%            1.06    92 GB
61 | StackExchange  2.0%            1.03    78 GB
62 | Table 1: Pre-training data. Data mixtures used for pre-
63 | training, for each subset we list the sampling propor-
64 | tion, number of epochs performed on the subset when
65 | training on 1.4T tokens, and disk size. The pre-training
66 | runs on 1T tokens have the same sampling proportion.
67 | Gutenberg and Books3 [4.5%]. We include
68 | two book corpora in our training dataset: the Guten-
69 | berg Project, which contains books that are in the
70 | public domain, and the Books3 section of TheP-
71 | ile (Gao et al., 2020), a publicly available dataset
72 | for training large language models. We perform
73 | deduplication at the book level, removing books
74 | with more than 90% content overlap.
75 | ArXiv [2.5%]. We process arXiv Latex files
76 | to add scientific data to our dataset. Following
77 | Lewkowycz et al. (2022), we removed everything
78 | before the first section, as well as the bibliography.
79 | We also removed the comments from the .tex files,
80 | and inline-expanded definitions and macros written
81 | by users to increase consistency across papers.
82 | Stack Exchange [2%]. We include a dump of
83 | Stack Exchange, a website of high quality ques-
84 | tions and answers that covers a diverse set of do-
85 | mains, ranging from computer science to chemistry.
86 | We kept the data from the 28 largest websites, re-
87 | moved the HTML tags from text and sorted the
88 | answers by score (from highest to lowest).
89 | Tokenizer. We tokenize the data with the byte-
90 | pair encoding (BPE) algorithm (Sennrich et al.,
91 | 2015), using the implementation from Sentence-
92 | Piece (Kudo and Richardson, 2018). Notably, we
93 | split all numbers into individual digits, and fallback
94 | to bytes to decompose unknown UTF-8 characters.
95 | Overall, our entire training dataset contains
96 | roughly 1.4T tokens after tokenization. For most of
97 | our training data, each token is used only once dur-
98 | ing training, with the exception of the Wikipedia
99 | and Books domains, over which we perform ap-
100 | proximately two epochs.
101 |
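102 | A rough sketch of the tokenizer configuration described above, using the
103 | SentencePiece Python bindings (the input file, model prefix, and vocabulary size
104 | are placeholders, not the values used for the actual model):
105 |
106 | import sentencepiece as spm
107 |
108 | # Train a BPE model that splits numbers into individual digits and falls back
109 | # to bytes for unknown UTF-8 characters, as described in the Tokenizer paragraph.
110 | spm.SentencePieceTrainer.train(
111 |     input="corpus.txt",
112 |     model_prefix="tokenizer",
113 |     vocab_size=32000,
114 |     model_type="bpe",
115 |     split_digits=True,
116 |     byte_fallback=True,
117 | )
118 |
119 | sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
120 | print(sp.encode("Trained on roughly 1.4T tokens.", out_type=str))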
--------------------------------------------------------------------------------
/benchmark/perfect_context/3 Experimental Results.txt:
--------------------------------------------------------------------------------
1 | First, we make a toy experiment where we fit a neural
2 | network to the equation y = x^2. The neural network has 3
3 | dense layers with 2 filters each, except for last layer which
4 | has 1 filter. The network uses leaky-ReLU activations after
5 | fully connected layers, except for the last layer which has
6 | no post-activation. We have used negative slope of 0.3 for
7 | leaky-ReLU which is the default value in Tensorflow [1].
8 | The network was trained with 5000 (x, y) pairs where x was
9 | regularly sampled from [−2.5, 2.5] interval. Fig. 2 shows
10 | the decision tree corresponding to the neural network. In the
11 | tree, every black rectangle box indicates a rule, left child
12 | from the box means the rule does not hold, and the right
13 | child means the rule holds. For better visualization, the
14 | rules are obtained via converting w^T x + β > 0 to direct
15 | inequalities acting on x. This can be done for the partic-
16 | ular regression y = x^2, since x is a scalar. In every leaf,
17 | the network applies a linear function -indicated by a red
18 | rectangle- based on the decisions so far. We have avoided
19 | writing these functions explicitly due to limited space. At
20 | first glance, the tree representation of a neural network in
21 | this example seems large due to the
22 | 2^(∑_i^(n−2) m_i) = 2^4 = 16
23 | categorizations. However, we notice that a lot of the rules
24 | in the decision tree is redundant, and hence some paths in
25 | the decision tree becomes invalid. An example to redundant
26 | rule is checking x < 0.32 after x < −1.16 rule holds. This
27 | directly creates the invalid left child for this node. Hence,
28 | the tree can be cleaned via removing the left child in this
29 | case, and merging the categorization rule to the stricter one:
30 | x < −1.16 in the particular case. Via cleaning the decision
31 | tree in Fig. 2, we obtain the simpler tree in Fig. 3a, which
32 | only consists of 5 categories instead of 16. The 5 categories
33 | are directly visible also from the model response in Fig. 3b.
34 | The interpretation of the neural network is thus straightfor-
35 | ward: for each region whose boundaries are determined via
36 | the decision tree representation, the network approximates
37 | the non-linear y = x^2 equation by a linear equation. One
38 | can clearly interpret and moreover make deduction from the
39 | decision tree, some of which are as follows. The neural
40 | network is unable to grasp the symmetrical nature of the
41 | regression problem which is evident from the fact that the
42 | decision boundaries are asymmetrical. The region in below
43 | −1.16 and above 1 is unbounded and thus neural decisions
44 | lose accuracy as x goes beyond these boundaries.
45 |
46 |          y = x^2                      Half-Moon
47 |        Param.  Comp.  Mult./Add.   Param.  Comp.  Mult./Add.
48 | Tree   14      2.6    2            39      4.1    8.2
49 | NN     13      4      16           15      5      25
50 | Table 1. Computation and memory analysis of toy problems
51 |
52 | Next, we investigate another toy problem of classifying
53 | half-moons and analyse the decision tree produced by a neu-
54 | ral network. We train a fully connected neural network with
55 | 3 layers with leaky-ReLU activations, except for last layer
56 | which has sigmoid activation. Each layer has 2 filters ex-
57 | cept for the last layer which has 1. The cleaned decision
58 | tree induced by the trained network is shown in Fig. 4. The
59 | decision tree finds many categories whose boundaries are
60 | determined by the rules in the tree, where each category
61 | is assigned a single class. In order to better visualize the
62 | categories, we illustrate them with different colors in Fig.
63 | 5. One can make several deductions from the decision tree
64 | such as some regions are very well-defined, bounded and
65 | the classifications they make are perfectly in line with the
66 | training data, thus these regions are very reliable. There are
67 | unbounded categories which help obtaining accurate classi-
68 | fication boundaries, yet fail to provide a compact represen-
69 | tation of the training data, these may correspond to inaccu-
70 | rate extrapolations made by neural decisions. There are also
71 | some categories that emerged although none of the training
72 | data falls to them.
73 | Besides the interpretability aspect, the decision tree rep-
74 | resentation also provides some computational advantages.
75 | In Table 1, we compare the number of parameters, float-
76 | point comparisons and multiplication or addition operations
77 | of the neural network and the tree induced by it. Note that
78 | the comparisons, multiplications and additions in the tree
79 | representation are given as expected values, since per each
80 | category depth of the tree is different. As the induced tree
81 | is an unfolding of the neural network, it covers all possi-
82 | ble routes and keeps all possible effective filters in mem-
83 | ory. Thus, as expected, the number of parameters in the tree
84 | representation of a neural network is larger than that of the
85 | network. In the induced tree, in every layer i, a maximum
86 | of mi filters are applied directly on the input, whereas in the
87 | neural network always mi filters are applied on the previous
88 | feature, which is usually much larger than the input in the
89 | feature dimension. Thus, computation-wise, the tree repre-
90 | sentation is advantageous compared to the neural network
91 | one.
92 |
--------------------------------------------------------------------------------
/demo/notebook.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Structured Q&A"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "Source code: https://github.com/mozilla-ai/structured-qa"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Docs: https://mozilla-ai.github.io/structured-qa"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "## GPU Check"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "First, you'll need to enable GPUs for the notebook:\n",
36 | "\n",
37 | "- Navigate to `Edit`→`Notebook Settings`\n",
38 | "- Select T4 GPU from the Hardware Accelerator section\n",
39 | "- Click `Save` and accept.\n",
40 | "\n",
41 | "Next, we'll confirm that we can connect to the GPU:"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "import torch\n",
51 | "\n",
52 | "if not torch.cuda.is_available():\n",
53 | " raise RuntimeError(\"GPU not available\")\n",
54 | "else:\n",
55 | " print(\"GPU is available!\")"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Installing dependencies"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": null,
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "%pip install --quiet https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu122/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl\n",
72 | "%pip install --quiet structured-qa"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "## Uploading data"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "from google.colab import files\n",
89 | "\n",
90 | "uploaded = files.upload()"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "## Converting document to a directory of sections"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from pathlib import Path\n",
107 | "from structured_qa.preprocessing import document_to_sections_dir\n",
108 | "\n",
109 | "input_file = list(uploaded.keys())[0]\n",
110 | "sections_dir = f\"output/{Path(input_file).stem}\"\n",
111 | "section_names = document_to_sections_dir(input_file, sections_dir)\n",
112 | "section_names"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "## Loading model"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "from structured_qa.model_loaders import load_llama_cpp_model\n",
129 | "\n",
130 | "model = load_llama_cpp_model(\n",
131 | " \"bartowski/Qwen2.5-7B-Instruct-GGUF/Qwen2.5-7B-Instruct-Q8_0.gguf\"\n",
132 | ")"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## Find, Retrieve, and Answer"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "FIND_PROMPT = \"\"\"\n",
149 | "You are given two pieces of information:\n",
150 | "1. A list of valid section names.\n",
151 | "2. A user question.\n",
152 | "\n",
153 | "Your task is to:\n",
154 | "- Identify exactly one `section_name` from the provided list that seems related to the user question.\n",
155 | "- Return the `section_name` exactly as it appears in the list.\n",
156 | "- Do NOT answer the question.\n",
157 | "- Do NOT return any additional text, explanation, or formatting.\n",
158 | "- Do NOT combine multiple section names into a single response.\n",
159 | "\n",
160 | "Here is the list of valid section names:\n",
161 | "\n",
162 | "```\n",
163 | "{SECTIONS}\n",
164 | "```\n",
165 | "\n",
166 | "Now, based on the following question, return the single most relevant `section_name` from the list.\n",
167 | "\"\"\""
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "ANSWER_PROMPT = \"\"\"\n",
177 | "You are a rigorous assistant answering questions.\n",
178 | "You must only answer based on the current information available which is:\n",
179 | "\n",
180 | "```\n",
181 | "{CURRENT_INFO}\n",
182 | "```\n",
183 | "\n",
184 | "If the current information available not enough to answer the question,\n",
185 | "you must return \"I need more info\" and nothing else.\n",
186 | "\"\"\""
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "QUESTION = \"What optimizer was used to train the model?\""
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "from structured_qa.workflow import find_retrieve_answer\n",
205 | "\n",
206 | "find_retrieve_answer(\n",
207 | " question=QUESTION,\n",
208 | " model=model,\n",
209 | " sections_dir=sections_dir,\n",
210 | " find_prompt=FIND_PROMPT,\n",
211 | " answer_prompt=ANSWER_PROMPT,\n",
212 | ")"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": []
219 | }
220 | ],
221 | "metadata": {
222 | "kernelspec": {
223 | "display_name": ".venv",
224 | "language": "python",
225 | "name": "python3"
226 | },
227 | "language_info": {
228 | "name": "python",
229 | "version": "3.10.12"
230 | }
231 | },
232 | "nbformat": 4,
233 | "nbformat_minor": 2
234 | }
235 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/OVERCOMING RESERVATIONS ABOUT OPEN ACCESS.txt:
--------------------------------------------------------------------------------
1 | Some authors who see the potential of open access may
2 | still have reservations about whether open access is
3 | right for them. Some of these reservations are based
4 | on myths about open access and can be resolved by
5 | clearing up misunderstandings. Other reservations
6 | reflect the current limitations of open access options
7 | available to some authors. Fortunately, open access
8 | opportunities are growing as the open access move-
9 | ment spreads through more author communities.
10 | Existing solutions and encouraging developments are
11 | detailed below to address authors’ common reserva-
12 | tions about open access.
13 | Open access is compatible
14 | with peer review and prestige
15 | Peer review, selective submission standards, and other
16 | attributes of prestigious publication are indepen-
17 | dent of the openness of the publication. Some open
18 | access publications apply the highest standards of
19 | quality control, some proprietary publications publish
20 | low-quality works, and vice versa.
21 |
22 | It is true, however, that some new open access
23 | publications do not yet command the same level of
24 | prestige as the best-established, conventional pub-
25 | lications. The prestige of publishing a work with a
26 | leading conventional publisher may dissuade some
27 | authors from publishing with an open access publisher.
28 | This is particularly true of junior faculty whose tenure
29 | prospects may depend on a publication record in top
30 | proprietary outlets.
31 | We expect this will lessen as open access pub-
32 | lishers establish their reputations and proliferate
33 | across disciplines, as existing publishers adopt more
34 | open practices, as more research funders require open
35 | access to the works they fund, and as senior scholars
36 | signal the value of open access. Encouragingly, an
37 | increasing number of open access journals have already
38 | achieved high regard in their disciplines, as described
39 | in Chapter 5.
40 | In the meantime, conventional publication and
41 | open access are not mutually exclusive. For example,
42 | many conventional publishers allow authors who
43 | publish with them to also upload the authors’ final
44 | versions of their works to open access repositories. In
45 | such cases, authors can benefit from the imprint of
46 | a well-established print publisher while still making
47 | their works openly accessible. (For more information,
48 | please see Chapter 7.)
49 | Authors do not always have to pay to
50 | make their works openly accessible
51 | Some authors have reservations about open access
52 | because they think they will need to pay to make
53 | their works openly accessible. This is not always true.
54 | Although some open access publishers do charge a fee
55 | to cover the cost of publishing a work, many authors
56 | make their works openly accessible without incurring
57 | any costs. In fact, the majority of open access journals
58 | charge no author-side fees at all.7 Even where pub-
59 | lishers charge publication fees, there are many ways
60 | that authors can mitigate those costs, as discussed in
61 | Chapter 5. Moreover, depositing a work in an institu-
62 | tional open access repository is always free for authors.
63 | Open access options are available
64 | for book authors
65 | The traditional practice in book publishing has been
66 | for authors to work with conventional publishers,
67 | typically assigning their copyrights in exchange for
68 | royalty streams from the sales of their books. Publish-
69 | ers may be reluctant to agree to open access because
70 | they believe that it will undermine books sales. Authors
71 | who depend on royalties likely share this concern.
72 | Moreover, this book publishing convention still works
73 | well for many authors.
74 | However, some authors are excited by the
75 | potential of open access book publishing to increase the
76 | audience for their works. Open access book publishing
77 | options are increasing for these authors.
78 | Many publishers are developing programs to
79 | make books openly accessible.8 For example, the
80 | University of California Press recently launched Luminos,
81 | an open access publishing program for monographs.9
82 | Authors who publish with Luminos can make digital
83 | editions of their books openly accessible under the Uni-
84 | versity of California Press imprint. Open Humanities Press
85 | has also launched an open access program for mono-
86 | graphs, making the books it publishes in print available
87 | as full-text digital editions published under open
88 | licenses.10
89 | Additionally, many university presses make
90 | academic books openly available in the Open Access Pub-
91 | lishing in European Networks (“OAPEN”) Library. 14
92 | Authors can also make their self-published books
93 | openly accessible by uploading electronic versions to
94 | open access repositories or personal websites. Institu-
95 | tions that host repositories will sometimes also offer
96 | book-formatting resources for authors who deposit
97 | book-length works in their repositories. For example,
98 | eScholarship, the University of California’s institutional
99 | repository, provides authors tools to create digital
100 | versions of their books and also provides University of
101 | California authors print-on-demand services.12 (For
102 | more information on open access repositories, please
103 | see Chapter 5.)
104 | Additionally, book authors who are interested
105 | in open access may choose to negotiate with conven-
106 | tional publishers to publish their books in print but
107 | also retain the rights to openly license their books,
108 | as described in Chapter 7. Authors who have already
109 | assigned their rights to conventional publishers may be
110 | able to exercise or negotiate for rights reversions that
111 | would allow them to make their books openly accessi-
112 | ble. For more on this possibility, please see the
113 | Authors who make their works openly accessible
114 | can require attribution
115 | Some authors are concerned that open access neces-
116 | sarily means others will be allowed to use their works
117 | without giving them credit. This is not true. Although
118 | some authors opt to allow others to use their openly
119 | accessible work without retaining a legal right to insist
120 | on credit, the vast majority of authors select license
121 | terms that require others to give them credit for their
122 | works. (Please see Chapter 4 to learn more about open
123 | access licensing.) Furthermore, even if unattributed
124 | copying of an open access work does not amount to
125 | copyright infringement, it may still amount to plagia-
126 | rism—thus running afoul of longstanding norms within
127 | scholarly and publishing communities.
128 | Authors who make their works openly accessible
129 | can still preserve the integrity of their works
130 | Some authors are concerned that the integrity of their
131 | works will be compromised if they make their works
132 | openly accessible. An author might worry, for example,
133 | that her work will be modified in a way that distorts
134 | its meaning and discredits her. However, authors can
135 | use license terms to control how others are allowed
136 | to use their works (subject to some limitations, such
137 | as fair use). Open access licenses often include pro-
138 | visions that protect against misuse, prevent loss of
139 | integrity, and protect author reputation. For example,
140 | Creative Commons licenses require attribution, unless
141 | the author does not want to be attributed; include an
142 | obligation to indicate whether an author’s work has
143 | been modified or not, even if those modifications are
144 | trivial; and require users to link back to the original
145 | if a link is provided. In addition, authors who do not
146 | want to permit others to modify their works can select
147 | license terms that allow free access and distribution of
148 | verbatim copies but not adaptations. More information
149 | on open access licenses can be found in Chapter 4.
150 | Finally, scholarly norms for citation and regarding pla-
151 | giarism are not supplanted when authors openly license
152 | their works.
153 |
--------------------------------------------------------------------------------
/benchmark/perfect_context/Prohibited AI Practices.txt:
--------------------------------------------------------------------------------
1 | 1. The following AI practices shall be prohibited:
2 | (a) the placing on the market, the putting into service or the use of an AI system that deploys subliminal techniques beyond
3 | a person’s consciousness or purposefully manipulative or deceptive techniques, with the objective, or the effect of
4 | materially distorting the behaviour of a person or a group of persons by appreciably impairing their ability to make an
5 | informed decision, thereby causing them to take a decision that they would not have otherwise taken in a manner that
6 | causes or is reasonably likely to cause that person, another person or group of persons significant harm;
7 | (b) the placing on the market, the putting into service or the use of an AI system that exploits any of the vulnerabilities of
8 | a natural person or a specific group of persons due to their age, disability or a specific social or economic situation, with
9 | the objective, or the effect, of materially distorting the behaviour of that person or a person belonging to that group in
10 | a manner that causes or is reasonably likely to cause that person or another person significant harm;
11 | (c) the placing on the market, the putting into service or the use of AI systems for the evaluation or classification of natural
12 | persons or groups of persons over a certain period of time based on their social behaviour or known, inferred or
13 | predicted personal or personality characteristics, with the social score leading to either or both of the following:
14 | (i) detrimental or unfavourable treatment of certain natural persons or groups of persons in social contexts that are
15 | unrelated to the contexts in which the data was originally generated or collected;
16 | (ii) detrimental or unfavourable treatment of certain natural persons or groups of persons that is unjustified or
17 | disproportionate to their social behaviour or its gravity;
18 | (d) the placing on the market, the putting into service for this specific purpose, or the use of an AI system for making risk
19 | assessments of natural persons in order to assess or predict the risk of a natural person committing a criminal offence,
20 | based solely on the profiling of a natural person or on assessing their personality traits and characteristics; this
21 | prohibition shall not apply to AI systems used to support the human assessment of the involvement of a person in
22 | a criminal activity, which is already based on objective and verifiable facts directly linked to a criminal activity;
23 | (e) the placing on the market, the putting into service for this specific purpose, or the use of AI systems that create or
24 | expand facial recognition databases through the untargeted scraping of facial images from the internet or CCTV footage;
25 | (f) the placing on the market, the putting into service for this specific purpose, or the use of AI systems to infer emotions
26 | of a natural person in the areas of workplace and education institutions, except where the use of the AI system is
27 | intended to be put in place or into the market for medical or safety reasons;
28 |
29 | (g) the placing on the market, the putting into service for this specific purpose, or the use of biometric categorisation
30 | systems that categorise individually natural persons based on their biometric data to deduce or infer their race, political
31 | opinions, trade union membership, religious or philosophical beliefs, sex life or sexual orientation; this prohibition does
32 | not cover any labelling or filtering of lawfully acquired biometric datasets, such as images, based on biometric data or
33 | categorizing of biometric data in the area of law enforcement;
34 | (h) the use of ‘real-time’ remote biometric identification systems in publicly accessible spaces for the purposes of law
35 | enforcement, unless and in so far as such use is strictly necessary for one of the following objectives:
36 | (i) the targeted search for specific victims of abduction, trafficking in human beings or sexual exploitation of human
37 | beings, as well as the search for missing persons;
38 | (ii) the prevention of a specific, substantial and imminent threat to the life or physical safety of natural persons or
39 | a genuine and present or genuine and foreseeable threat of a terrorist attack;
40 | (iii) the localisation or identification of a person suspected of having committed a criminal offence, for the purpose of
41 | conducting a criminal investigation or prosecution or executing a criminal penalty for offences referred to in
42 | Annex II and punishable in the Member State concerned by a custodial sentence or a detention order for
43 | a maximum period of at least four years.
44 | Point (h) of the first subparagraph is without prejudice to Article 9 of Regulation (EU) 2016/679 for the processing of
45 | biometric data for purposes other than law enforcement.
46 | 2. The use of ‘real-time’ remote biometric identification systems in publicly accessible spaces for the purposes of law
47 | enforcement for any of the objectives referred to in paragraph 1, first subparagraph, point (h), shall be deployed for the
48 | purposes set out in that point only to confirm the identity of the specifically targeted individual, and it shall take into
49 | account the following elements:
50 | (a) the nature of the situation giving rise to the possible use, in particular the seriousness, probability and scale of the harm
51 | that would be caused if the system were not used;
52 | (b) the consequences of the use of the system for the rights and freedoms of all persons concerned, in particular the
53 | seriousness, probability and scale of those consequences.
54 | In addition, the use of ‘real-time’ remote biometric identification systems in publicly accessible spaces for the purposes of
55 | law enforcement for any of the objectives referred to in paragraph 1, first subparagraph, point (h), of this Article shall
56 | comply with necessary and proportionate safeguards and conditions in relation to the use in accordance with the national
57 | law authorising the use thereof, in particular as regards the temporal, geographic and personal limitations. The use of the
58 | ‘real-time’ remote biometric identification system in publicly accessible spaces shall be authorised only if the law
59 | enforcement authority has completed a fundamental rights impact assessment as provided for in Article 27 and has
60 | registered the system in the EU database according to Article 49. However, in duly justified cases of urgency, the use of such
61 | systems may be commenced without the registration in the EU database, provided that such registration is completed
62 | without undue delay.
63 | 3. For the purposes of paragraph 1, first subparagraph, point (h) and paragraph 2, each use for the purposes of law
64 | enforcement of a ‘real-time’ remote biometric identification system in publicly accessible spaces shall be subject to a prior
65 | authorisation granted by a judicial authority or an independent administrative authority whose decision is binding of the
66 | Member State in which the use is to take place, issued upon a reasoned request and in accordance with the detailed rules of
67 | national law referred to in paragraph 5. However, in a duly justified situation of urgency, the use of such system may be
68 | commenced without an authorisation provided that such authorisation is requested without undue delay, at the latest
69 | within 24 hours. If such authorisation is rejected, the use shall be stopped with immediate effect and all the data, as well as
70 | the results and outputs of that use shall be immediately discarded and deleted.
71 | The competent judicial authority or an independent administrative authority whose decision is binding shall grant the
72 | authorisation only where it is satisfied, on the basis of objective evidence or clear indications presented to it, that the use of
73 | the ‘real-time’ remote biometric identification system concerned is necessary for, and proportionate to, achieving one of the
74 |
75 | objectives specified in paragraph 1, first subparagraph, point (h), as identified in the request and, in particular, remains
76 | limited to what is strictly necessary concerning the period of time as well as the geographic and personal scope. In deciding
77 | on the request, that authority shall take into account the elements referred to in paragraph 2. No decision that produces an
78 | adverse legal effect on a person may be taken based solely on the output of the ‘real-time’ remote biometric identification
79 | system.
80 | 4. Without prejudice to paragraph 3, each use of a ‘real-time’ remote biometric identification system in publicly
81 | accessible spaces for law enforcement purposes shall be notified to the relevant market surveillance authority and the
82 | national data protection authority in accordance with the national rules referred to in paragraph 5. The notification shall, as
83 | a minimum, contain the information specified under paragraph 6 and shall not include sensitive operational data.
84 | 5. A Member State may decide to provide for the possibility to fully or partially authorise the use of ‘real-time’ remote
85 | biometric identification systems in publicly accessible spaces for the purposes of law enforcement within the limits and
86 | under the conditions listed in paragraph 1, first subparagraph, point (h), and paragraphs 2 and 3. Member States concerned
87 | shall lay down in their national law the necessary detailed rules for the request, issuance and exercise of, as well as
88 | supervision and reporting relating to, the authorisations referred to in paragraph 3. Those rules shall also specify in respect
89 | of which of the objectives listed in paragraph 1, first subparagraph, point (h), including which of the criminal offences
90 | referred to in point (h)(iii) thereof, the competent authorities may be authorised to use those systems for the purposes of
91 | law enforcement. Member States shall notify those rules to the Commission at the latest 30 days following the adoption
92 | thereof. Member States may introduce, in accordance with Union law, more restrictive laws on the use of remote biometric
93 | identification systems.
94 | 6. National market surveillance authorities and the national data protection authorities of Member States that have been
95 | notified of the use of ‘real-time’ remote biometric identification systems in publicly accessible spaces for law enforcement
96 | purposes pursuant to paragraph 4 shall submit to the Commission annual reports on such use. For that purpose, the
97 | Commission shall provide Member States and national market surveillance and data protection authorities with a template,
98 | including information on the number of the decisions taken by competent judicial authorities or an independent
99 | administrative authority whose decision is binding upon requests for authorisations in accordance with paragraph 3 and
100 | their result.
101 | 7. The Commission shall publish annual reports on the use of real-time remote biometric identification systems in
102 | publicly accessible spaces for law enforcement purposes, based on aggregated data in Member States on the basis of the
103 | annual reports referred to in paragraph 6. Those annual reports shall not include sensitive operational data of the related
104 | law enforcement activities.
105 | 8. This Article shall not affect the prohibitions that apply where an AI practice infringes other Union law.
106 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------