The following passage is reused (in whole or in part) {{ sourcePassage.count.toLocaleString() }} times:
"{{ sourcePassage.source_passage }}"
For performance reasons, only the top 100 are displayed.
--------------------------------------------------------------------------------
/lib/textpair/passage_classifier.py:
--------------------------------------------------------------------------------
1 | """Passage classification for thematic categorization of alignments"""
2 |
3 | import html
4 | import re
5 |
6 | import lz4.frame
7 | import orjson
8 | from tqdm import tqdm
9 | from transformers import pipeline
10 |
11 |
12 | def get_expanded_passage(alignment: dict, context_bytes: int = 1000) -> str:
13 | """
14 | Pulls context around a target passage using byte offsets.
15 |
16 | Args:
17 | alignment: Alignment dict with target_passage, target_filename,
18 | target_start_byte, target_end_byte
19 | context_bytes: Number of bytes to read before and after (default: 1000)
20 |
21 | Returns:
22 | Expanded passage string with context before and after
23 | """
24 | target_passage = alignment.get("target_passage", "")
25 | filename = alignment.get("target_filename", "")
26 | start_byte = alignment.get("target_start_byte", 0)
27 | end_byte = alignment.get("target_end_byte", 0)
28 |
29 | # If we don't have the required fields, just return the original passage
30 | if not filename or not start_byte or not end_byte:
31 | return target_passage
32 |
33 | context_before = ""
34 | context_after = ""
35 |
36 | try:
37 | with open(filename, "rb") as f:
38 | # Get context before
39 | seek_pos_before = max(0, start_byte - context_bytes)
40 | read_len_before = start_byte - seek_pos_before
41 | if read_len_before > 0:
42 | f.seek(seek_pos_before)
43 | bytes_before = f.read(read_len_before)
44 | context_before = bytes_before.decode("utf-8", errors="ignore").strip()
45 | context_before = re.sub(r"\s+", " ", context_before)
46 | context_before = re.sub(r"^\w+>", "", context_before)
47 | context_before = re.sub(r"<.*?>", "", context_before)
48 | context_before = html.unescape(context_before)
49 |
50 | # Get context after
51 | f.seek(end_byte)
52 | bytes_after = f.read(context_bytes)
53 | context_after = bytes_after.decode("utf-8", errors="ignore").strip()
54 | context_after = re.sub(r"\s+", " ", context_after)
55 | context_after = re.sub(r"<[^>]$", "", context_after)
56 | context_after = re.sub(r"<.*?>", "", context_after)
57 | context_after = html.unescape(context_after)
58 | except Exception:
59 | # If file reading fails, fall back to original passage
60 | return target_passage
61 |
62 | # Return the expanded passage
63 | return f"{context_before} {target_passage} {context_after}".strip()
64 |
65 |
66 | async def classify_passages(
67 | input_path: str,
68 | zero_shot_model: str,
69 | classification_classes: dict[str, str],
70 | min_confidence: float = 0.7,
71 | top_k: int = 3,
72 | batch_size: int = 32
73 | ) -> int:
74 | """
75 | Classify passages into thematic categories using zero-shot classification.
76 |
77 | This performs multi-label classification where each passage can receive multiple
78 | category labels based on confidence thresholds.
79 |
80 | Args:
81 | input_path: Path to alignments file (jsonl.lz4 format)
82 | zero_shot_model: Hugging Face model for zero-shot classification
83 | classification_classes: Dict mapping class names to their definitions/criteria
84 | min_confidence: Minimum confidence score (0-1) to assign a label (default: 0.7)
85 | top_k: Maximum number of labels to assign per passage (default: 3)
86 | batch_size: Number of passages to process at once (default: 32)
87 |
88 | Returns:
89 | Total number of passages processed (i.e., alignments written to the output file)
90 | """
91 | if not classification_classes:
92 | print("No classification classes defined. Skipping passage classification.")
93 | return 0
94 |
95 | print(f"Loading passage classifier: {zero_shot_model}")
96 | classifier = pipeline(
97 | "zero-shot-classification",
98 | model=zero_shot_model,
99 | device=0  # run on the first CUDA GPU (no automatic CPU fallback)
100 | )
101 |
102 | # Extract class labels and their descriptions
103 | candidate_labels = list(classification_classes.keys())
104 |
105 | # Prepare output
106 | temp_output_path = input_path.replace(".jsonl.lz4", ".jsonl_temp.lz4")
107 |
108 | # Count lines for progress
109 | with lz4.frame.open(input_path, "rb") as f_count:
110 | num_lines = sum(1 for _ in f_count)
111 |
112 | if num_lines == 0:
113 | print("Input file is empty.")
114 | return 0
115 |
116 | classified_count = 0
117 | with (lz4.frame.open(temp_output_path, "wb") as output_file,
118 | lz4.frame.open(input_path, "rb") as f_in,
119 | tqdm(total=num_lines, desc="Passage classification") as pbar):
120 |
121 | batch = []
122 | batch_alignments = []
123 |
124 | for line_b in f_in:
125 | alignment = orjson.loads(line_b)
126 |
127 | # Expand passage with surrounding context for better classification
128 | expanded_passage = get_expanded_passage(alignment, context_bytes=1000)
129 |
130 | batch.append(expanded_passage)
131 | batch_alignments.append(alignment)
132 |
133 | # Process batch
134 | if len(batch) >= batch_size:
135 | results = classifier(
136 | batch,
137 | candidate_labels,
138 | multi_label=True, # Allow multiple labels per passage
139 | batch_size=batch_size
140 | )
141 |
142 | for alignment, result in zip(batch_alignments, results):
143 | # Filter labels by confidence threshold and take top_k
144 | labels_and_scores = list(zip(result["labels"], result["scores"]))
145 |
146 | # Filter by minimum confidence
147 | filtered = [(label, score) for label, score in labels_and_scores if score >= min_confidence]
148 |
149 | # Take top_k
150 | top_labels = filtered[:top_k]
151 |
152 | # Store results
153 | alignment["passage_categories"] = [label for label, _ in top_labels]
154 | alignment["passage_categories_scores"] = [round(score, 3) for _, score in top_labels]
155 |
156 | if top_labels:
157 | classified_count += 1
158 |
159 | output_file.write(orjson.dumps(alignment) + b"\n")
160 | pbar.update(1)
161 |
162 | batch = []
163 | batch_alignments = []
164 |
165 | # Process remaining batch
166 | if batch:
167 | results = classifier(
168 | batch,
169 | candidate_labels,
170 | multi_label=True,
171 | batch_size=len(batch)
172 | )
173 |
174 | for alignment, result in zip(batch_alignments, results):
175 | labels_and_scores = list(zip(result["labels"], result["scores"]))
176 | filtered = [(label, score) for label, score in labels_and_scores if score >= min_confidence]
177 | top_labels = filtered[:top_k]
178 |
179 | alignment["passage_categories"] = [label for label, _ in top_labels]
180 | alignment["passage_categories_scores"] = [round(score, 3) for _, score in top_labels]
181 |
182 | if top_labels:
183 | classified_count += 1
184 |
185 | output_file.write(orjson.dumps(alignment) + b"\n")
186 | pbar.update(1)
187 |
188 | # Replace original with classified version
189 | import os
190 | os.replace(temp_output_path, input_path)
191 |
192 | print(f"Classification complete: {classified_count}/{num_lines} passages received category labels")
193 | print(f"(Passages with no labels had all scores below {min_confidence} threshold)")
194 |
195 | return num_lines
196 |
197 |
198 | if __name__ == "__main__":
199 | import asyncio
200 | import sys
201 |
202 | if len(sys.argv) < 2:
203 | print("Usage: python passage_classifier.py ")
204 | sys.exit(1)
205 |
206 | file_path = sys.argv[1]
207 |
208 | # Test with example categories
209 | test_classes = {
210 | "Satire & Humor": "Passages using irony, satire, humor, parody, or comical situations",
211 | "Religion & Spirituality": "Speech about faith, God, theology, scripture, church",
212 | "Philosophy": "Speech about morality, ethics, virtue, reason, metaphysics",
213 | }
214 |
215 | total = asyncio.run(classify_passages(
216 | file_path,
217 | "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
218 | test_classes,
219 | min_confidence=0.3,
220 | top_k=3
221 | ))
222 | print(f"Total passages processed: {total}")
223 |
--------------------------------------------------------------------------------
/lib/textpair/vector_space_alignment/structures.py:
--------------------------------------------------------------------------------
1 | """Data structures for vector space alignment"""
2 |
3 | from __future__ import annotations
4 |
5 | import os
6 | import sqlite3
7 | from collections.abc import Iterable
8 | from shutil import rmtree
9 | from typing import Callable
10 |
11 | import dill as pickle
12 | import msgspec
13 | import numpy as np
14 | import torch
15 | from msgspec import field
16 | from text_preprocessing import Tokens
17 |
18 | # Global constants for serialization and path management
19 | TEMP_DIR = os.getcwd()
20 | PHILO_TEXT_OBJECT_LEVELS = {"doc": 1, "div1": 2, "div2": 3, "div3": 4, "para": 5, "sent": 6, "word": 7}
21 |
22 |
23 | class PassageGroup(msgspec.Struct, array_like=True):
24 | """Text passage with all associated properties and vector representation"""
25 |
26 | start_byte: int = 0
27 | end_byte: int = 0
28 | filename: str = ""
29 | metadata: dict = {}
30 |
31 |
32 | class MergedGroup(msgspec.Struct, array_like=True):
33 | """A source and target PassageGroup pair with similarity"""
34 |
35 | source: PassageGroup = field(default_factory=PassageGroup)
36 | target: PassageGroup = field(default_factory=PassageGroup)
37 | similarity: float = 0.0
38 |
39 |
40 | # Msgpack encoders/decoders for serialization
41 | ENCODER = msgspec.msgpack.Encoder()
42 | DECODER = msgspec.msgpack.Decoder(type=MergedGroup)
43 |
44 |
45 | class DocumentChunks:
46 | """A generator with caching"""
47 |
48 | def __init__(self, docs: Iterable[list[str]], save_path: str, transform_function: Callable):
49 | self.docs = docs
50 | self.doc_list: list[list[str]] = []
51 | self.doc_count = 0
52 | self.generator_exhausted = False
53 | self.transform_function = transform_function
54 | self.corpus_type = self.transform_function.__qualname__.split(".")[0]
55 | self.path = os.path.join(TEMP_DIR, "output/chunks/", save_path)
56 | if os.path.exists(self.path):
57 | rmtree(self.path)
58 | os.makedirs(self.path, exist_ok=True)
59 |
60 | def __iter__(self) -> Iterable[str | list[str] | torch.Tensor | np.ndarray]:
61 | if self.generator_exhausted is False:
62 | if self.doc_count == 0:
63 | for doc in self.docs:
64 | doc = self.__format_doc(doc)
65 | self.__save(doc)
66 | self.doc_count += 1
67 | yield doc
68 | else:
69 | for doc_name in range(self.doc_count):
70 | yield self.__load(doc_name)
71 | for doc in self.docs:
72 | doc = self.__format_doc(doc)
73 | self.__save(doc)
74 | self.doc_count += 1
75 | yield doc
76 | self.generator_exhausted = True
77 | else:
78 | for doc_name in range(self.doc_count):  # replay the cached docs from disk
79 | yield self.__load(doc_name)
80 |
81 | def __save(self, doc: list[str] | str):
82 | filename = os.path.join(self.path, str(self.doc_count))
83 | if self.transform_function is None:  # no vectorizer: cache the raw doc only
84 | with open(filename, "wb") as output_file:
85 | pickle.dump(doc, output_file)
86 | elif self.corpus_type == "TransformerCorpus":
87 | transformed_doc = self.transform_function([doc])
88 | torch.save(transformed_doc, f"{filename}.pt")
89 | else:
90 | np.save(f"{filename}.npy", self.transform_function([doc]))
91 |
92 | def __load(self, doc_name) -> list[str] | torch.Tensor | np.ndarray:
93 | filename = os.path.join(self.path, str(doc_name))
94 | if self.transform_function is None:
95 | with open(filename, "rb") as input_file:
96 | doc = pickle.load(input_file)
97 | return doc
98 | elif self.corpus_type == "TransformerCorpus":
99 | return torch.load(f"{filename}.pt")
100 | return np.load(f"{filename}.npy")[0]
101 |
102 | def __get_doc(self, index: int) -> list[str] | torch.Tensor | np.ndarray:
103 | doc = None
104 | while index >= self.doc_count:  # generate and cache docs up to the requested index
105 | try:
106 | doc = next(self.docs)
107 | doc = self.__format_doc(doc)
108 | self.__save(doc)
109 | self.doc_count += 1
110 | except StopIteration as e:
111 | raise IndexError from e
112 | if doc is None:
113 | return self.__load(index)
114 | return doc
115 |
116 | def __getitem__(self, item: int | slice) -> list[str] | str | list[list[str] | str] | np.ndarray | torch.Tensor:
117 | if isinstance(item, slice):
118 | end = item.stop
119 | if item.stop > len(self): # avoid index out of range
120 | end = len(self)
121 | if self.transform_function is None or self.corpus_type == "Word2VecEmbeddingCorpus":
122 | return np.array([self.__get_doc(index) for index in range(item.start, end)])
123 | return torch.cat([self.__get_doc(index) for index in range(item.start, end)]) # type:ignore
124 | return self.__get_doc(item)
125 |
126 | def __format_doc(self, doc: list[str]) -> str:
127 | return " ".join(doc)
128 |
129 | def __len__(self):
130 | if self.generator_exhausted is False:
131 | for _ in self:
132 | pass
133 | return self.doc_count
134 |
135 |
136 | class Matches:
137 | """Matches cached to disk"""
138 |
139 | def __init__(self, matches: Iterable[MergedGroup]):
140 | self.path = os.path.join(TEMP_DIR, "output/results/matches")
141 | os.makedirs(self.path, exist_ok=True)
142 | self.count = 0
143 | if isinstance(matches, list) and matches:
144 | self.matches = matches
145 | self.is_cached = False
146 | self.count = len(self.matches)
147 | else:
148 | self.conn = sqlite3.connect(os.path.join(self.path, "matches.db"))
149 | self.cursor = self.conn.cursor()
150 | self.cursor.execute("DROP TABLE IF EXISTS matches")
151 | self.cursor.execute("CREATE TABLE matches (match_id INTEGER, match blob)")
152 | self.cursor.execute("CREATE INDEX match_id_index ON matches (match_id)")
153 | self.matches = None
154 | self.is_cached = True
155 | self.count = self.__save(matches) # save generator to disk
156 |
157 | def match_generator(self, new_matches):
158 | for match in new_matches:
159 | dump = ENCODER.encode(match)
160 | yield (self.count, dump)
161 | self.count += 1
162 |
163 | def extend(self, new_matches: Iterable[MergedGroup]):
164 | """Add new matches to existing matches"""
165 | encoded_matches = self.match_generator(new_matches)
166 | self.cursor.executemany("INSERT INTO matches VALUES (?, ?)", encoded_matches)
167 |
168 | def __save(self, matches):
169 | count = -1
170 | for count, match in enumerate(matches):
171 | dump = ENCODER.encode(match)
172 | self.cursor.execute("INSERT INTO matches VALUES (?, ?)", (count, dump))  # use the running index as match_id
173 | if count == -1:  # the generator yielded no matches
174 | return 0
175 | self.conn.commit()
176 | return count + 1
177 |
178 | def done(self):
179 | """Commit changes to database"""
180 | self.conn.commit()
181 | self.conn.close()
182 |
183 | @classmethod
184 | def load(cls):
185 | """Load instance of class by reading previously cached matches"""
186 | matches = []
187 | conn = sqlite3.connect(os.path.join(TEMP_DIR, "output/results/matches/matches.db"))
188 | cursor = conn.cursor()
189 | cursor.execute("SELECT match from matches ORDER BY match_id")
190 | for match in cursor:
191 | matches.append(DECODER.decode(match[0]))
192 | conn.close()
193 | return cls(matches)
194 |
195 | def __len__(self):
196 | return self.count
197 |
198 | def __iter__(self):
199 | if self.is_cached is False:
200 | for index in range(self.count):
201 | yield self.matches[index] # type: ignore
202 | else:
203 | self.cursor.execute("SELECT match FROM matches ORDER BY match_id")
204 | for match in self.cursor:
205 | yield DECODER.decode(match[0])
206 |
207 |
208 | # Lightweight, serializable data structure for efficient sentence searching.
209 | class TokenSearchData(msgspec.Struct):
210 | """A lightweight container for token data needed for sentence searching."""
211 | start_bytes: list[int]
212 | end_bytes: list[int]
213 | surface_forms: list[str]
214 | sentence_ids: list[str]
215 |
216 |
217 | def save_tokens(tokens: Tokens, parsed_filename: str):
218 | """
219 | Saves token search data to a cache file using msgpack serialization.
220 | """
221 | start_bytes = [token.ext['start_byte'] for token in tokens.tokens]
222 | end_bytes = [token.ext['end_byte'] for token in tokens.tokens]
223 | surface_forms = [token.surface_form for token in tokens.tokens]
224 | sentence_ids = [get_sentence_id(token) for token in tokens.tokens]
225 |
226 | search_data = TokenSearchData(
227 | start_bytes=start_bytes,
228 | end_bytes=end_bytes,
229 | surface_forms=surface_forms,
230 | sentence_ids=sentence_ids,
231 | )
232 |
233 | # Save the data to the cache file
234 | encoder = msgspec.msgpack.Encoder()
235 | with open(parsed_filename, "wb") as f:
236 | f.write(encoder.encode(search_data))
237 |
238 | def load_token_search_data(parsed_filename: str) -> TokenSearchData:
239 | """
240 | Loads token search data from a cache file previously written by
241 | save_tokens (msgpack-encoded TokenSearchData).
242 | """
243 | decoder = msgspec.msgpack.Decoder(TokenSearchData)
244 |
245 | with open(parsed_filename, "rb") as f:
246 | return decoder.decode(f.read())
247 |
248 |
249 | def find_token_index_by_byte(bytes: list[int], byte_offset: int) -> int:
250 | """
251 | Finds the index of the token at a given byte offset using binary search
252 | on a pre-computed list of start_bytes.
253 | """
254 | import bisect
255 | if not bytes:
256 | return -1
257 |
258 | # bisect_left finds the insertion point for the byte_offset.
259 | index = bisect.bisect_left(bytes, byte_offset)
260 |
261 | # If the offset is exactly a token's start, we found it.
262 | if index < len(bytes) and bytes[index] == byte_offset:
263 | return index
264 |
265 | # If the insertion point is 0, it must be the first token.
266 | if index == 0:
267 | return 0
268 |
269 | # Otherwise, the correct token is the one *before* the insertion point.
270 | return index - 1
271 |
272 |
273 | def get_sentence_id(token) -> str:
274 | """Extracts the sentence ID from a token's position string."""
275 | try:
276 | # The sentence ID is composed of the first 6 integers of the position string.
277 | return " ".join(token.ext['position'].split()[:6])
278 | except (AttributeError, KeyError, IndexError):
279 | return ""
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | "Nous ne faisons que nous entregloser" Montaigne wrote famously in his Essais... Since all we do is glose over what's already been written, we may as well build a tool to detect these intertextual relationships...
2 |
3 | # TextPAIR (Pairwise Alignment for Intertextual Relations)
4 |
5 | TextPAIR is a scalable and high-performance sequence aligner for humanities text analysis designed to identify "similar passages" in large collections of texts. These may include direct quotations, plagiarism and other forms of borrowings, commonplace expressions and the like. It is a complete rewrite and rethink of the original implementation released in 2009.
6 |
7 | While TextPAIR was developed in response to the fairly specific phenomenon of similar passages across literary works, the sequence analysis techniques employed in TextPAIR were developed in widely disparate fields, such as bioinformatics and computer science, with applications ranging from genome sequencing to plagiarism detection. TextPAIR generates a set of overlapping word sequence shingles for every text in a corpus, then stores and indexes that information to be analyzed against shingles from other texts. For example, the opening declaration from Rousseau's Du Contrat Social,
8 |
9 | `"L'homme est né libre, est partout il est dans les fers. Tel se croit le maître des autres, qui ne laisse pas d'être plus esclave qu'eux,"`
10 |
11 | would be rendered in trigram shingles (with lemmatization, accents flattened and function words removed) as:
12 |
13 | `homme_libre_partout, libre_partout_fer, partout_fer_croire, fer_croire_maitre, croire_maitre_laisser, maitre_laisser_esclave`
14 |
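For intuition, the shingling step amounts to sliding a fixed-size window over the normalized token stream and joining each window into a single string. The sketch below is illustrative only (the `shingles` helper and the pre-normalized token list are ours, not TextPAIR's actual API):

```python
def shingles(tokens: list[str], n: int = 3) -> list[str]:
    """Return overlapping n-gram shingles joined with underscores."""
    return ["_".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

# Tokens as they might look after lemmatization, accent flattening,
# and function-word removal (see the Rousseau example above).
tokens = ["homme", "libre", "partout", "fer", "croire", "maitre", "laisser", "esclave"]
print(shingles(tokens))
# ['homme_libre_partout', 'libre_partout_fer', 'partout_fer_croire',
#  'fer_croire_maitre', 'croire_maitre_laisser', 'maitre_laisser_esclave']
```
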
15 | Common shingles across texts indicate many different types of textual borrowings, from direct citations to more ambiguous and unattributed usages of a passage. Using a simple search form, the user can quickly identify similar passages shared between different texts in one database, or even across databases, such as in the example below.
16 |
17 | 
18 |
19 | ## Installation
20 |
21 | The recommended install is to build your own Docker image and run TextPAIR inside a container.
22 |
23 | ### Docker container method
24 |
25 | - Go to the docker folder and build a docker image: `docker build -t textpair .`
26 | - Start a new container: `docker run -td -p 80:80 --name textpair textpair init_textpair_db`
27 | Note that you may want to customize the `run` command according to your needs (e.g. to mount a volume for your data).
28 | You will need to copy your texts to the container, and then follow the normal procedure described below once inside the container.
29 |
30 | If you run into an issue where the web server does not respond, restart it with the following command:
31 | `/var/lib/text-pair/api_server/web_server.sh &`
32 |
33 | ### Manual installation
34 |
35 | If you wish to install TextPAIR on a host machine, note that TextPAIR will only run on 64-bit Linux; see the dependencies and install script below.
36 |
37 | #### Dependencies
38 |
39 | - Python 3.11 and up
40 | - Node and NPM
41 | - PostgreSQL: you will need to create a dedicated database and a user with read/write permissions on that database. You will also need to create the pg_trgm extension on that database by running the following command in the PostgreSQL shell as a superuser: `CREATE EXTENSION pg_trgm;`
42 |
43 | #### Install script
44 |
45 | See Ubuntu install instructions
46 |
47 | - Run the `install.sh` script. This should install all needed components.
48 | - Make sure you include `/etc/text-pair/apache_wsgi.conf` in your main Apache configuration file to enable searching
49 | - Edit `/etc/text-pair/global_settings.ini` to provide your PostgreSQL user, database, and password.
50 |
51 | ## Quick start
52 |
53 | Before running any alignment, make sure you edit your copy of `config.ini`. See [below](#configuring-the-alignment) for details.
54 |
55 | #### NOTE: source designates the source database from which reuses are deemed to originate, and target is the collection borrowing from source. In practice, the number of alignments won't vary significantly if you swap source and target
56 |
57 | The sequence aligner is executed via the `textpair` command. The basic command is:
58 | `textpair --config=/path/to/config [OPTIONS] [database_name]`
59 |
60 | `textpair` takes the following command-line arguments:
61 |
62 | - `--config`: This argument is required. It defines the path to the configuration file where preprocessing, matching, and web application settings are set
63 | - `--is_philo_db`: Defines whether files are from a PhiloLogic database. If set to `True`, metadata will be fetched using the PhiloLogic metadata index. Set to False by default.
64 | - `--output_path`: path to results
65 | - `--debug`: turn on debugging
66 | - `--workers`: Set number of workers/threads to use for parsing, ngram generation, and alignment.
67 | - `--update_db`: update database without rebuilding web_app. Should be used in conjunction with the --file argument
68 | - `--file`: alignment results file to load into database. Only used with the --update_db argument.
69 | - `--source_metadata`: source metadata needed for loading the database. Used only with the --update_db and --file arguments.
70 | - `--target_metadata`: target metadata needed for loading the database. Used only with the --update_db and --file arguments.
71 | - `--only_align`: Run alignment based on preprocessed text data from a previous alignment.
72 | - `--load_only_web_app`: Define whether to load results into a database viewable via a web application. Set to True by default.
73 | - `--skip_web_app`: skip loading results into a database and building the corresponding web app
74 |
75 | ## Configuring the alignment
76 |
77 | When running an alignment, you need to provide a configuration file to the `textpair` command.
78 | You can find a generic copy of the file in `/var/lib/text-pair/config/config.ini`.
79 | You should copy this file to the directory from which you are starting the alignment.
80 | Then you can start editing this file. Note that all parameters have comments explaining their role.
81 |
82 | While most values are reasonable defaults and don't require any edits, here are the most important settings you will want to check:
83 |
84 | #### In the TEXT_SOURCES section
85 |
86 | This is where you should define the paths for your source and target files. Note that if you define no target, files from the source will be compared to one another. In this case, files will be compared only when the source file is older than or from the same year as the target file. This is to avoid treating as a source a document which was written after the target.
87 | To leverage a PhiloLogic database to extract text and relevant metadata, point to the directory of the PhiloLogic DB used. You should then use the `--is_philo_db` flag.
88 | To link your TextPAIR web app to PhiloLogic databases (for source and target), set source_url and target_url.
89 |
90 | #### In the TEXT_PARSING section
91 |
92 | - `parse_source_files` and `parse_target_files`: both of these settings determine whether you want TextPAIR to parse your TEI files or not.
93 | Set to `yes` by default. If you are relying on parsed output from PhiloLogic, you will want to set this to `no` or `false`.
94 | - `source_file_type` and `target_file_type`: defines the type of text file: either TEI or plain text. If using plain text, you will need to supply a metadata file in the TEXT_SOURCES section
95 | - `source_words_to_keep` and `target_words_to_keep`: defines files containing lists of words (separated by a newline) which the parser should keep.
96 | Other words are discarded.
97 |
98 | #### In the Preprocessing section
99 |
100 | - `source_text_object_level` and `target_text_object_level`: Define the individual text object level used as the unit of comparison.
101 | Possible values are `doc`, `div1`, `div2`, `div3`, `para`, `sent`. This is only used when relying on a PhiloLogic database.
102 | - `ngram`: Size of your ngram. The default is 3, which seems to work well in most cases. A lower number tends to produce more uninteresting short matches.
103 | - `language`: This determines the language used by the Porter Stemmer as well as by Spacy (if using more advanced POS filtering features, lemmatization, or NER).
104 | Note that you should use language codes from the Spacy
105 | documentation.
106 | Note that there is a section on Vector Space Alignment preprocessing. These options are for the `vsa` matcher (see next section) only. It is not recommended that you use these at this time.
107 |
108 | #### In the Matching section
109 |
110 | Note that there are two different types of matching algorithms, with different parameters. The current recommended one is `sa` (for sequence alignment). The `vsa` algorithm is HIGHLY experimental, still under heavy development, and is not guaranteed to work.
111 |
112 | ## Run comparison between preprocessed files manually
113 |
114 | It is possible to run a comparison between documents without having to regenerate ngrams. In this case you need to use the
115 | `--only_align` argument with the `textpair` command.
116 |
117 | Example:
118 |
119 | ```console
120 | textpair --config=config.ini --only_align --workers=10 my_database_name
121 | ```
122 |
123 | ## Configuring the Web Application
124 |
125 | The `textpair` script automatically generates a Web Application, and does so by relying on the defaults configured in the `appConfig.json` file which is copied to the directory where the Web Application lives, typically `/var/www/html/text-pair/database_name`.
126 |
127 | #### Note on metadata naming: metadata fields extracted from the text files are prefixed with `source_` for source texts and `target_` for target texts.
128 |
129 | In this file, there are a number of fields that can be configured:
130 |
131 | - `webServer`: should not be changed as only Apache is supported for the foreseeable future.
132 | - `appPath`: this should match the WSGI configuration in `/etc/text-pair/apache_wsgi.conf`. Should not be changed without knowing how to work with `mod_wsgi`.
133 | - `databaseName`: Defines the name of the PostgreSQL database where the data lives.
134 | - `matchingAlgorithm`: DO NOT EDIT: tells the web app which matching method you used, and therefore impacts functionality within the Web UI.
135 | - `databaseLabel`: Title of the database used in the Web Application
136 | - `branding`: Defines links in the header
137 | - `sourcePhiloDBLink` and `targetPhiloDBLink`: Provide URL to PhiloLogic database to contextualize shared passages.
138 | - `sourceLabel` and `targetLabel` are the names of source DB and target DB. This field supports HTML tags.
139 | - `sourceCitation` and `targetCitation` define the bibliography citation in results. `field` defines the metadata field to use, and `style` is for CSS styling (using key/value for CSS rules)
140 | - `metadataFields` defines the fields available for searching in the search form for `source` and `target`.
141 | `label` is the name used in the form and `value` is the actual name of the metadata field as stored in the SQL database.
142 | - `facetFields` works the same way as `metadataFields` but for defining which fields are available in the faceted browser section.
143 | - `timeSeriesIntervals` defines the time intervals available for the time series functionality.
144 | - `banalitiesStored` DO NOT EDIT: defines whether banalities (formulaic passages) have been stored.
145 |
146 | Once you've edited these fields to your liking, you can rebuild the web application by running the `npm run build` command from the directory where the `appConfig.json` file is located.
147 |
148 | Built with support from the Mellon Foundation and the Fondation de la Maison des Sciences de l'Homme.
149 |
150 | ## Post processing alignment results
151 |
152 | TextPAIR produces two (or three if passage filtering is enabled) different files (found in the `output/results/` directory) as a result of each alignment task:
153 |
154 | - The `alignments.jsonl` file: this contains all alignments which were found by TextPAIR. Each line is formatted as an individual JSON string.
155 | - The `duplicate_files.csv` file: this contains a list of potential duplicate files TextPAIR identified between the source and target databases.
156 | - The `filtered_passages` file: lists source passages which were filtered out based on phrase matching. Only generated if a file containing passages to filter has been provided.
157 |
158 | These files are designed to be used for further inspection of the alignments, and potential post processing tasks such as alignment filtering or clustering.
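
As a starting point for such post-processing, each line of `alignments.jsonl` can be parsed independently. Here is a minimal sketch (assuming the uncompressed file; if your pipeline writes an LZ4-compressed variant, open it with `lz4.frame.open` instead, and adjust the field names to whatever your alignment run actually produced):

```python
import json

# Stream the alignment records one JSON object per line and print a short preview.
with open("output/results/alignments.jsonl", encoding="utf-8") as alignments:
    for line in alignments:
        record = json.loads(line)
        source = record.get("source_passage", "")[:80]
        target = record.get("target_passage", "")[:80]
        print(f"{source} ==> {target}")
```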
159 |
--------------------------------------------------------------------------------
/extras/restore_database.py:
--------------------------------------------------------------------------------
1 | """Restores TextPAIR database and web files from a backup tarball, and rebuilds the web application."""
2 |
3 | import json
4 | import os
5 | import shutil
6 | import subprocess
7 | from argparse import ArgumentParser
8 | from configparser import ConfigParser
9 | from pathlib import Path
10 |
11 | import lz4.frame
12 | import psycopg2
13 |
14 | GLOBAL_CONFIG = ConfigParser()
15 | GLOBAL_CONFIG.read("/etc/text-pair/global_settings.ini")
16 |
17 |
18 | def check_database_connection(user, password):
19 | """Test database connection and permissions."""
20 | try:
21 | conn = psycopg2.connect(
22 | database=GLOBAL_CONFIG.get("DATABASE", "database_name"),
23 | user=user,
24 | password=password
25 | )
26 | conn.close()
27 | return True
28 | except psycopg2.OperationalError as e:
29 | print(f"Database connection error: {e}")
30 | return False
31 |
32 |
33 | def update_app_config(web_app_path):
34 | """
35 | Update the appConfig.json file with the API server from global settings
36 | and update PhiloLogic paths to point to the backed up data.
37 | Returns True if successful, False otherwise.
38 | """
39 | try:
40 | config_path = web_app_path / "appConfig.json"
41 | if not config_path.exists():
42 | print(f"Warning: appConfig.json not found at {config_path}")
43 | return False
44 |
45 | # Read the current config
46 | with open(config_path) as f:
47 | config = json.load(f)
48 |
49 | # Update the apiServer value
50 | api_server = GLOBAL_CONFIG.get("WEB_APP", "api_server")
51 | config['apiServer'] = api_server
52 |
53 | # Update PhiloLogic paths to point to the restored data
54 | source_data_path = web_app_path / "source_data"
55 | if source_data_path.exists():
56 | config['sourcePhiloDBPath'] = str(source_data_path.absolute())
57 |
58 | target_data_path = web_app_path / "target_data"
59 | if target_data_path.exists():
60 | config['targetPhiloDBPath'] = str(target_data_path.absolute())
61 | elif 'targetPhiloDBPath' in config:
62 | # If target_data doesn't exist but a target path was set, clear it
63 | config['targetPhiloDBPath'] = ""
64 |
65 | # Write the updated config back
66 | with open(config_path, 'w') as f:
67 | json.dump(config, f, indent=2)
68 |
69 | print(f"Updated appConfig.json:")
70 | print(f" - apiServer: {api_server}")
71 | print(f" - sourcePhiloDBPath: {config['sourcePhiloDBPath']}")
72 | if config.get('targetPhiloDBPath'):
73 | print(f" - targetPhiloDBPath: {config['targetPhiloDBPath']}")
74 |
75 | return True
76 |
77 | except Exception as e:
78 | print(f"Error updating appConfig.json: {e}")
79 | return False
80 |
81 |
82 | def run_npm_build(web_app_path):
83 | """
84 | Run npm install and build in the web app directory.
85 | Returns True if successful, False otherwise.
86 | """
87 | try:
88 | # Change to web app directory
89 | original_dir = os.getcwd()
90 | os.chdir(web_app_path)
91 |
92 | # Run npm install
93 | print("Running npm install...")
94 | subprocess.run(['npm', 'install'], check=True)
95 |
96 | # Run npm build
97 | print("Running npm run build...")
98 | subprocess.run(['npm', 'run', 'build'], check=True)
99 |
100 | return True
101 |
102 | except subprocess.CalledProcessError as e:
103 | print(f"Error during npm build process: {e}")
104 | return False
105 | except Exception as e:
106 | print(f"Unexpected error during build process: {e}")
107 | return False
108 | finally:
109 | # Always return to original directory
110 | os.chdir(original_dir)
111 |
112 |
113 | def check_existing_resources(db_name, db_user, db_password, web_app_dest, backup_dir):
114 | """Check for existing database tables and web app directory."""
115 | existing_resources = []
116 |
117 | # Check for existing tables
118 | sql_files = list(backup_dir.glob("textpair_*.sql"))
119 | for sql_file in sql_files:
120 | table_name = sql_file.stem.replace('textpair_', '')
121 | with psycopg2.connect(database=db_name, user=db_user, password=db_password) as conn:
122 | with conn.cursor() as cursor:
123 | cursor.execute(
124 | "SELECT 1 FROM information_schema.tables WHERE table_name = %s",
125 | (table_name,)
126 | )
127 | if cursor.fetchone() is not None:
128 | existing_resources.append(f"database table '{table_name}'")
129 |
130 | # Check for existing web app directory
131 | web_dirs = [d for d in backup_dir.iterdir() if d.is_dir()]
132 | if web_dirs and (web_app_dest / web_dirs[0].name).exists():
133 | existing_resources.append(f"web application directory '{web_dirs[0].name}'")
134 |
135 | return existing_resources
136 |
137 |
138 | def restore_textpair_database(backup_path, web_app_dest=None, force=False):
139 | """
140 | Restore TextPAIR database and web files from a backup tarball.
141 |
142 | Args:
143 | backup_path: Path to the backup tarball
144 | web_app_dest: Optional destination for web app files. If not provided,
145 | uses the path from global_settings.ini
146 | force: If True, overwrite existing files/tables without prompting
147 | """
148 | print(f"\nStarting TextPAIR restoration from: {backup_path}")
149 |
150 | db_name = GLOBAL_CONFIG.get("DATABASE", "database_name")
151 | db_user = GLOBAL_CONFIG.get("DATABASE", "database_user")
152 | db_password = GLOBAL_CONFIG.get("DATABASE", "database_password")
153 |
154 | # Check database connection before proceeding
155 | print("\nChecking database connection...")
156 | if not check_database_connection(db_user, db_password):
157 | raise Exception("Cannot connect to database. Please check credentials and permissions.")
158 | print("✓ Database connection verified")
159 |
160 | backup_path = Path(backup_path)
161 | if not backup_path.exists():
162 | raise FileNotFoundError(f"Backup file not found: {backup_path}")
163 |
164 | # Create temporary directory for extraction
165 | print("\nPreparing temporary workspace...")
166 | temp_dir = Path("/tmp/textpair_restore_temp")
167 | if temp_dir.exists():
168 | print(" - Cleaning up existing temporary files...")
169 | shutil.rmtree(temp_dir)
170 | temp_dir.mkdir()
171 | print("✓ Workspace prepared")
172 |
173 | restored_web_app_path = None
174 |
175 | try:
176 | # Extract the tarball using lz4 module
177 | print("\nExtracting backup archive...")
178 | print(" - Decompressing with LZ4...")
179 | with open(backup_path, 'rb') as f:
180 | compressed_data = f.read()
181 | decompressed_data = lz4.frame.decompress(compressed_data)
182 | print(" - Extracting files...")
183 | temp_tar = temp_dir / "temp.tar"
184 | with open(temp_tar, 'wb') as f:
185 | f.write(decompressed_data)
186 | os.system(f"tar xf {temp_tar} -C {temp_dir}")
187 | os.remove(temp_tar)
188 | print("✓ Backup extracted successfully")
189 |
190 | backup_contents = list(temp_dir.iterdir())
191 | if not backup_contents:
192 | raise Exception("Backup archive appears to be empty")
193 |
194 | backup_dir = backup_contents[0]
195 | if not backup_dir.is_dir():
196 | raise Exception("Unexpected backup structure")
197 |
198 | # Set up web app destination path
199 | if not web_app_dest:
200 | web_app_dest = Path(GLOBAL_CONFIG.get("WEB_APP", "web_app_path"))
201 | else:
202 | web_app_dest = Path(web_app_dest)
203 |
204 | # Check for existing resources
205 | if not force:
206 | print("\nChecking for existing resources...")
207 | existing = check_existing_resources(db_name, db_user, db_password, web_app_dest, backup_dir)
208 | if existing:
209 | print("\nWARNING: The following resources will be overwritten:")
210 | for resource in existing:
211 | print(f" - {resource}")
212 | response = input("\nDo you want to proceed with the restoration? This will replace all existing resources (y/n): ")
213 | if response.lower() != 'y':
214 | print("Restoration cancelled")
215 | return
216 | print("") # Empty line for better readability
217 |
218 | # Restore database tables
219 | sql_files = list(backup_dir.glob("textpair_*.sql"))
220 | if not sql_files:
221 | raise Exception("No SQL files found in backup")
222 |
223 | print("\nRestoring database tables...")
224 | print(f"Found {len(sql_files)} tables to restore")
225 |
226 | for sql_file in sql_files:
227 | table_name = sql_file.stem.replace('textpair_', '')
228 |
229 | # Drop existing table if it exists
230 | print(f" - Processing {table_name}:")
231 | print(f" • Dropping existing table if present...")
232 | with psycopg2.connect(database=db_name, user=db_user, password=db_password) as conn:
233 | with conn.cursor() as cursor:
234 | cursor.execute(f"DROP TABLE IF EXISTS {table_name} CASCADE")
235 | conn.commit()
236 |
237 | # Restore table
238 | print(f" • Restoring table data...")
239 | os.system(f'PGPASSWORD={db_password} psql -U {db_user} -d {db_name} -f {sql_file}')
240 | print(f" ✓ Table {table_name} restored")
241 |
242 | print("✓ Database restoration complete")
243 |
244 | # Restore web app files
245 | web_dirs = [d for d in backup_dir.iterdir() if d.is_dir()]
246 | if web_dirs:
247 | web_app_dir = web_dirs[0]
248 | web_app_dest = web_app_dest / web_app_dir.name
249 |
250 | if web_app_dest.exists():
251 | print(f"\nRemoving existing web application at {web_app_dest}...")
252 | shutil.rmtree(web_app_dest)
253 |
254 | print(f"Copying web application files...")
255 | shutil.copytree(web_app_dir, web_app_dest)
256 | restored_web_app_path = web_app_dest
257 | print("✓ Web application files restored")
258 |
259 | # Update app configuration and rebuild web application if it was restored
260 | if restored_web_app_path:
261 | print("\nConfiguring web application...")
262 | if not update_app_config(restored_web_app_path):
263 | print("Failed to update web application configuration")
264 | if not force:
265 | raise Exception("Web application configuration update failed")
266 | print("✓ Configuration updated")
267 |
268 | print("\nRebuilding web application...")
269 | print(" - Installing dependencies...")
270 | if run_npm_build(restored_web_app_path):
271 | print("✓ Web application rebuilt successfully")
272 | else:
273 | print("✗ Failed to rebuild web application")
274 | if not force:
275 | raise Exception("Web application build failed")
276 |
277 | print("\n✓ Restore completed successfully!")
278 | db_url = GLOBAL_CONFIG.get("WEB_APP", "api_server").replace("-api", "").rstrip("/") + "/" + web_app_dest.name  # string ops: Path would collapse the "//" in the URL scheme
279 | print(f"The database is viewable at: {db_url}")
280 |
281 | finally:
282 | # Clean up
283 | print("\nCleaning up...")
284 | if temp_dir.exists():
285 | shutil.rmtree(temp_dir)
286 | os.remove(backup_path)
287 | print("✓ Cleanup completed")
288 |
289 |
290 | if __name__ == "__main__":
291 | parser = ArgumentParser()
292 | parser.add_argument("backup_path", type=str, help="Path to the backup tarball file")
293 | parser.add_argument("--web_app_dest", type=str, default="",
294 | help="Optional destination path for web app files")
295 | parser.add_argument("--force", action="store_true",
296 | help="Overwrite existing files/tables without prompting")
297 | args = parser.parse_args()
298 |
299 | restore_textpair_database(args.backup_path, args.web_app_dest, args.force)
--------------------------------------------------------------------------------
/web-app/src/components/passagePair.vue:
--------------------------------------------------------------------------------
(template markup stripped during extraction; recoverable text below)
Frequency by {{ facetResults.facet.split("_")[1] }}
Showing top 100 results
{{ field.field || "N/A" }}: {{ field.count.toLocaleString() }}
--------------------------------------------------------------------------------
/config/config.ini:
--------------------------------------------------------------------------------
1 | ########################
2 | ## CONFIGURATION FILE ##
3 | ########################
4 |
5 | [TEXT_SOURCES]
6 | # Path to source files. This can be a path to TEI or plain text files or a path to a PhiloLogic database.
7 | source_file_path =
8 |
9 | # Path to metadata for plain text source files. Needs to be a CSV or TSV file with at least the filename as metadata
10 | source_metadata =
11 |
12 | # Path to target files. This can be a path to TEI or plain text files or a path to a PhiloLogic database.
13 | target_file_path =
14 |
15 | # Path to metadata for plain text target files. Needs to be a CSV or TSV file with at least the filename as metadata
16 | target_metadata =
17 |
18 | # For backwards compatibility. Will be removed in future versions
19 | source_url =
20 | target_url =
21 |
22 | [TEXT_PARSING]
23 | ##########################################################################
24 | ## If TEI parsing was not done by PhiloLogic, you can parse your source ##
25 | ## and target files directly from TextPAIR ##
26 | ##########################################################################
27 |
28 | # Defines whether to parse source files
29 | parse_source_files = yes
30 |
31 | # Source files type: specify tei for TEI files, and plain_text for plain-text files.
32 | source_file_type = tei
33 |
34 | # Defines path to file containing words to keep (useful for dirty OCR)
35 | # Default is keeping all words
36 | source_words_to_keep = all
37 |
38 | # Defines whether to parse target files
39 | parse_target_files = yes
40 |
41 | # Target files type: specify tei for TEI files, and plain_text for plain-text files.
42 | target_file_type = tei
43 |
44 | # Defines path to file containing words to keep (useful for dirty OCR)
45 | # Default is keeping all words
46 | target_words_to_keep = all
47 |
48 |
49 | [PREPROCESSING]
50 | # Defines what object type to divide each text into
51 | # Useful to break up a single document into smaller text units
52 | source_text_object_type = doc
53 | target_text_object_type = doc
54 |
55 | # Defines how many tokens constitute a ngram
56 | ngram = 3
57 |
58 | # Defines the size of the gap authorized within an ngram. If not 0, this will generate multiple ngrams within a window of size ngram+gap
59 | # Note that you may need to adjust your minimum number of ngrams for matches to avoid short matches as a result.
60 | # USE WITH CAUTION as this will multiply the RAM usage for your alignment
61 | gap = 0
62 |
63 | # The word order must be respected
64 | word_order = yes
65 |
66 | # Language: set the language for various normalization tasks
67 | # such as stemming, lemmatizing, word mapping...etc
68 | language =
69 |
70 | # Language for target corpus: only set if your source and target corpus are in a different language
71 | # USE ONLY with vsa with transformer vectorization using a multilingual model
72 | target_language =
73 |
74 | # Modernize language if modernization is available for your language: currently only French is supported.
75 | modernize = yes
76 |
77 | # Transliterate characters to closest ascii representation.
78 | ascii = no
79 |
80 | # Stem words using the Porter Stemmer
81 | stemmer = yes
82 |
83 | # Lemmatizer: path to lemmatizer file where each line contains the inflected form and
84 | # the corresponding lemma separated by a tab. If set to spacy, make sure to also set spacy_model
85 | lemmatizer =
86 |
87 | # Lowercase words
88 | lowercase = yes
89 |
90 | # Remove numbers
91 | numbers = yes
92 |
93 | # Minimum word length
94 | minimum_word_length = 2
95 |
96 | # Stopwords: path to stopword list
97 | stopwords =
98 |
99 | # Define a language model to use for lemmatization, and POS tagging
100 | # See https://spacy.io/models for available models. Make sure to download the model first
101 | spacy_model =
102 |
103 | # Parts-of-speech to keep: specify which parts of speech to keep. Use Universal POS tag notation. See here for a complete list:
104 | # https://universaldependencies.org/docs/u/pos/
105 | # Separate each pos to keep by a comma
106 | pos_to_keep =
107 |
108 | #######################################################################
109 | ### VECTOR SPACE ALIGNMENT preprocessing options: VERY EXPERIMENTAL ###
110 | #######################################################################
111 |
112 | # If set to n_token, a text object is constituted of n tokens, where n is min_text_object_length.
113 | # if set to text_object, text objects are defined by their level in the OHCO hierarchy as defined in source_text_object_type and
114 | # target_text_object_type.
115 | text_object_definition = n_token
116 |
117 | # Minimum size of text object length to be counted as a chunk
118 | min_text_object_length = 10
119 |
120 | # Defines how many text objects should constitute a text chunk used for similarity comparison.
121 | n_chunk = 3
122 |
123 | # Vectorization method: either tfidf, w2v, or transformer
124 | vectorization = tfidf
125 |
126 | # Minimum frequency of token: expressed as a floating number between 0 and 1
127 | min_freq = 0.05
128 |
129 | # Maximum frequency of token: expressed as a floating number between 0 and 1
130 | max_freq = 0.9
131 |
132 | # Model used for creating a document embedding: required if using w2v or transformer vectorization
133 | # if using w2v vectorization, use a Spacy model
134 | # if using transformer, use a Hugging Face transformer model (supported by sentence-transformers)
135 | # Default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 (small, fast, 50+ languages)
136 | # For better quality, consider language-specific models: https://huggingface.co/models?library=sentence-transformers&sort=downloads
137 | # Examples: CATIE-AQ/camembert-base-embedding (French), intfloat/multilingual-e5-large (high quality multilingual)
138 | embedding_model =
139 |
140 |
141 | [LLM_PARAMS]
142 | ##############################
143 | ## GENERATIVE AI SETTINGS ##
144 | ##############################
145 | # Generative AI model used to re-evaluate the similarity of passage pairs
146 | # Uses Llama-cpp under the hood so requires a local model or a Hugging Face model
147 | llm_model =
148 |
149 | # Context window size for the LLM model. Must be equal to or smaller than the model's context window
150 | # Note that you may also be limited by your GPU VRAM if using a GPU
151 | llm_context_window = 8192
152 |
153 | # llm server port: only change if you have a port conflict
154 | llm_port = 8080
155 |
156 | # Concurrency limit for LLM requests: increase if you have the RAM/VRAM to handle more concurrent requests
157 | llm_concurrency_limit = 8
158 |
159 |
160 | [MATCHING]
161 | ########################
162 | ## PROCESSING OPTIONS ##
163 | ########################
164 |
165 | # Matching algorithm used to find similar passage pairs. Current options are
166 | # sa (for sequence alignment), and vsa (for vector space alignment).
167 | # DON'T USE vsa at this time, it may not work at all.
168 | matching_algorithm = sa
169 |
170 | # Sort files prior to matching. This may be important when wanting to avoid
171 | # comparing a source file with a target file that occurs later in time
172 | sort_by = year
173 |
174 | # Defines in how many batches your source or target corpus will be loaded: useful if your corpus is too big to fit in RAM
175 | # The default of 1 is to process the whole corpus at once.
176 | source_batch = 1
177 | target_batch = 1
178 |
179 | # Size of left and right context in bytes
180 | context_size = 300
181 |
182 | #########################
183 | ## MATCHING PARAMETERS ##
184 | #########################
185 |
186 | # Size of ngram window to be initially evaluated in the sequence aligner
187 | matching_window_size = 30
188 |
189 | # Minimum number of shared ngrams between docs to start a comparison
190 | minimum_matching_ngrams_in_docs = 4
191 |
192 | # Percentage of shared ngrams between 2 docs to consider the target as a duplicate of source
193 | duplicate_threshold = 80
194 |
195 | # Minimum number of matching ngrams in ngram window
196 | minimum_matching_ngrams_in_window = 4
197 |
198 | # Maximum gap authorized between matching ngrams
199 | max_gap = 15
200 |
201 | # Minimum number of matching ngrams to constitute a match
202 | minimum_matching_ngrams = 4
203 |
204 | # Automatically increase max_gap once minimum_matching_ngrams is reached
205 | flex_gap = true
206 |
207 | # ONLY FOR VSA: defines similarity threshold for initial matching. Value between 0 and 1, with values closer to one
208 | # meaning higher similarity.
209 | min_similarity = 0.5
210 |
211 | # ONLY FOR VSA: minimum number of matching words: this is to make sure your match is not based on just a couple of
212 | # highly weighted words
213 | min_matching_words = 5
214 |
215 | # Use LLM to re-evaluate similarity of passage pairs found by the initial matching algorithm
216 | llm_eval = false
217 |
218 | # Similarity threshold for the LLM to keep a passage pair as a match
219 | # if no value is provided, will default to min_similarity used for initial matching
220 | llm_similarity_threshold = 0.75
221 |
222 | # Output the reasoning of the LLM for each evaluation to a debug file
223 | # Debug file will be in output/debug/llm_evaluations.txt
224 | llm_debug = false
225 |
226 | ###################################
227 | ## PASSAGE MERGING AND EXTENDING ##
228 | ###################################
229 |
230 | # Merge passages within n number of bytes: n is determined by the passage length and the passage_distance_multiplier option.
231 | merge_passages_on_byte_distance = true
232 |
233 | # Combine passages which are within (multiplier * length of previous passage) bytes. Needs merge_passages_on_byte_distance set to true
234 | passage_distance_multiplier = 0.5
235 |
236 | # Merge passages within n number of ngrams: the value used is the matching_window_size defaulting to 20
237 | merge_passages_on_ngram_distance = true
238 |
239 | #################################
240 | ## BANALITY DETECTION SETTINGS ##
241 | #################################
242 | # Whether to automatically detect banalities (formulaic expressions)
243 | banality_auto_detection = true
244 |
245 | # Whether to use the LLM to re-evaluate banalities detected by the automatic detection
246 | banality_llm_post_eval = false
247 |
248 | # Percentage of the most frequent ngrams in the corpus to use as the 'common ngrams' set for banality detection.
249 | # Lower values = only the most frequent ngrams are considered common (more selective).
250 | # The value is expressed as a percentage.
251 | most_common_ngram_proportion = 0.1
252 |
253 | # Threshold expressed as a percentage of a passage's ngrams flagged as common:
254 | # a passage is flagged as a banality if this percentage or more of its ngrams
255 | # are from the common ngrams set.
256 | common_ngram_threshold = 50
257 |
258 | # Whether to store or dismiss formulaic expressions. If not stored, these are
259 | # saved in a separate file for further examination
260 | store_banalities = true
261 |
262 | # Path to file containing phrases used to flag banalities and non-interesting matches
263 | # Note that all matches are removed and saved in a separate file
264 | # Also of note, this filter will remove any passage which contains an instance of a phrase
265 | phrase_filter =
266 |
267 |
268 | [PASSAGE_CLASSIFICATION]
269 | ###################################
270 | ## PASSAGE_CLASSIFICATION SETTINGS ##
271 | ###################################
272 | # Whether to classify passages into thematic categories using a zero-shot transformer model
273 | classify_passage = false
274 |
275 | # Zero-shot model to use for classification. Should be a Hugging Face model compatible with the pipeline
276 | # See https://huggingface.co/models?pipeline_tag=zero-shot-classification
277 | zero_shot_model =
278 |
279 | # Define each class and the criteria used to classify matches, following the model below
280 | Satire & Humor = "Passages primarily using irony, satire, humor, parody, or comical situations to critique or entertain. Focus on the context of enunciation: the text must clearly mock or parody an important theme; stylistic choices are aimed at creating comic effects and exaggeration. It is not enough for the text to have a polemical intent; it must also display stylistic qualities that reveal a comic détournement of the original idea, a clear intention to ironize.",
281 | Religion & Spirituality = "Speech about faith, God, theology, scripture, church, sin, redemption, prayer, miracles, saints, religious practice, religious doubt, mysticism. It is not enough for this theme to be merely present — what matters is that it becomes the object of explicit reflection.",
282 | Philosophy = "Speech about morality, ethics, virtue, reason, metaphysics, logic, existence, knowledge, truth, justice (as a concept), free will, nature of humanity, ethical dilemmas. It is not enough for this theme to be merely present — what matters is that it becomes the object of explicit reflection.",
283 | Politics, Law, & Governance = "Speech about power and its nature, the state, specific laws/decrees (their content), rights, citizenship and citizen participation, social order, revolution, political factions, diplomacy, governance, monarchy, republic. It is not enough for this theme to be merely present — what matters is that it becomes the object of explicit reflection.",
284 | History & Memory = "Passages describing battles, military life, strategy, soldiers, heroism, the impacts of war, civil unrest, duels. They could include references to specific historical events or figures, chronicles, discussion of the past, memory, tradition, national identity. Unlike the Social & Cultural Commentary category, it is important that the focus is placed on major events, prominent figures, and the great themes in the history of nations or peoples.",
285 | Social & Cultural Commentary = "Observations or critiques of society, class structure, customs, manners, social norms, inequality, poverty, public behavior, specific social groups. The themes can also be other ones, relating of everyday activities, work, food, clothing, housing, common rituals, non-political/non-religious customs to practices of love or marriage, family and friendship, funeral practices or the sense of time. Even accounts of journeys, voyages, geographical discoveries, descriptions of foreign lands or peoples fall into this category. Unlike the History & Memory category, the focus is more strictly sociological, concerning the lives of individuals, their concrete practices, and their ideas about everyday life.",
286 | Nature & Science = "Any purely descriptions of the natural world (landscapes, animals, weather), natural philosophy, scientific thought, discovery, medicine, technology. Focus on the strictly descriptive aspect: whether it is the description of nature or of scientific practices, it should appear as detached as possible. The presence of appropriate naturalistic or scientific vocabulary is central to assigning the text to this category. Pay close attention to the context of enunciation — the text should clearly present itself as objective, observational, and uninvolved.",
287 | Art & Literature = "Discussions, analysis, commentary, or critique about literature (authors, works, characters, genres, style, rhetoric) or other art forms (visual arts, music, theatre, aesthetics, artists). Do not focus on the literary quality of the text, but only on its metatextual aspect — on the development of commentary and analysis concerning literature and art."
--------------------------------------------------------------------------------