The following passage is reused (in whole or in part) {{ sourcePassage.count.toLocaleString() }} times:
"{{ sourcePassage.source_passage }}"
For performance reasons, only the top 100 are displayed.
--------------------------------------------------------------------------------
/lib/textpair/passage_classifier.py:
--------------------------------------------------------------------------------
1 | """Passage classification for thematic categorization of alignments"""
2 |
3 | import html
4 | import re
5 |
6 | import lz4.frame
7 | import orjson
8 | from tqdm import tqdm
9 | from transformers import pipeline
10 |
11 |
12 | def get_expanded_passage(alignment: dict, context_bytes: int = 1000) -> str:
13 | """
14 | Pulls context around a target passage using byte offsets.
15 |
16 | Args:
17 | alignment: Alignment dict with target_passage, target_filename,
18 | target_start_byte, target_end_byte
19 | context_bytes: Number of bytes to read before and after (default: 1000)
20 |
21 | Returns:
22 | Expanded passage string with context before and after
23 | """
24 | target_passage = alignment.get("target_passage", "")
25 | filename = alignment.get("target_filename", "")
26 | start_byte = alignment.get("target_start_byte", 0)
27 | end_byte = alignment.get("target_end_byte", 0)
28 |
29 | # If we don't have the required fields, just return the original passage
30 | if not filename or not start_byte or not end_byte:
31 | return target_passage
32 |
33 | context_before = ""
34 | context_after = ""
35 |
36 | try:
37 | with open(filename, "rb") as f:
38 | # Get context before
39 | seek_pos_before = max(0, start_byte - context_bytes)
40 | read_len_before = start_byte - seek_pos_before
41 | if read_len_before > 0:
42 | f.seek(seek_pos_before)
43 | bytes_before = f.read(read_len_before)
44 | context_before = bytes_before.decode("utf-8", errors="ignore").strip()
45 | context_before = re.sub(r"\s+", " ", context_before)
46 | context_before = re.sub(r"^\w+>", "", context_before)
47 | context_before = re.sub(r"<.*?>", "", context_before)
48 | context_before = html.unescape(context_before)
49 |
50 | # Get context after
51 | f.seek(end_byte)
52 | bytes_after = f.read(context_bytes)
53 | context_after = bytes_after.decode("utf-8", errors="ignore").strip()
54 | context_after = re.sub(r"\s+", " ", context_after)
55 | context_after = re.sub(r"<[^>]$", "", context_after)
56 | context_after = re.sub(r"<.*?>", "", context_after)
57 | context_after = html.unescape(context_after)
58 | except Exception:
59 | # If file reading fails, fall back to original passage
60 | return target_passage
61 |
62 | # Return the expanded passage
63 | return f"{context_before} {target_passage} {context_after}".strip()
64 |
65 |
66 | async def classify_passages(
67 | input_path: str,
68 | zero_shot_model: str,
69 | classification_classes: dict[str, str],
70 | min_confidence: float = 0.7,
71 | top_k: int = 3,
72 | batch_size: int = 32
73 | ) -> int:
74 | """
75 | Classify passages into thematic categories using zero-shot classification.
76 |
77 | This performs multi-label classification where each passage can receive multiple
78 | category labels based on confidence thresholds.
79 |
80 | Args:
81 | input_path: Path to alignments file (jsonl.lz4 format)
82 | zero_shot_model: Hugging Face model for zero-shot classification
83 | classification_classes: Dict mapping class names to their definitions/criteria
84 | min_confidence: Minimum confidence score (0-1) to assign a label (default: 0.7)
85 | top_k: Maximum number of labels to assign per passage (default: 3)
86 | batch_size: Number of passages to process at once (default: 32)
87 |
88 | Returns:
89 | Total number of passages processed (i.e., alignments written to the output file)
90 | """
91 | if not classification_classes:
92 | print("No classification classes defined. Skipping passage classification.")
93 | return 0
94 |
95 | print(f"Loading passage classifier: {zero_shot_model}")
96 | classifier = pipeline(
97 | "zero-shot-classification",
98 | model=zero_shot_model,
99 | device=0  # run on the first CUDA GPU (no automatic CPU fallback)
100 | )
101 |
102 | # Extract class labels and their descriptions
103 | candidate_labels = list(classification_classes.keys())
104 |
105 | # Prepare output
106 | temp_output_path = input_path.replace(".jsonl.lz4", ".jsonl_temp.lz4")
107 |
108 | # Count lines for progress
109 | with lz4.frame.open(input_path, "rb") as f_count:
110 | num_lines = sum(1 for _ in f_count)
111 |
112 | if num_lines == 0:
113 | print("Input file is empty.")
114 | return 0
115 |
116 | classified_count = 0
117 | with (lz4.frame.open(temp_output_path, "wb") as output_file,
118 | lz4.frame.open(input_path, "rb") as f_in,
119 | tqdm(total=num_lines, desc="Passage classification") as pbar):
120 |
121 | batch = []
122 | batch_alignments = []
123 |
124 | for line_b in f_in:
125 | alignment = orjson.loads(line_b)
126 |
127 | # Expand passage with surrounding context for better classification
128 | expanded_passage = get_expanded_passage(alignment, context_bytes=1000)
129 |
130 | batch.append(expanded_passage)
131 | batch_alignments.append(alignment)
132 |
133 | # Process batch
134 | if len(batch) >= batch_size:
135 | results = classifier(
136 | batch,
137 | candidate_labels,
138 | multi_label=True, # Allow multiple labels per passage
139 | batch_size=batch_size
140 | )
141 |
142 | for alignment, result in zip(batch_alignments, results):
143 | # Filter labels by confidence threshold and take top_k
144 | labels_and_scores = list(zip(result["labels"], result["scores"]))
145 |
146 | # Filter by minimum confidence
147 | filtered = [(label, score) for label, score in labels_and_scores if score >= min_confidence]
148 |
149 | # Take top_k
150 | top_labels = filtered[:top_k]
151 |
152 | # Store results
153 | alignment["passage_categories"] = [label for label, _ in top_labels]
154 | alignment["passage_categories_scores"] = [round(score, 3) for _, score in top_labels]
155 |
156 | if top_labels:
157 | classified_count += 1
158 |
159 | output_file.write(orjson.dumps(alignment) + b"\n")
160 | pbar.update(1)
161 |
162 | batch = []
163 | batch_alignments = []
164 |
165 | # Process remaining batch
166 | if batch:
167 | results = classifier(
168 | batch,
169 | candidate_labels,
170 | multi_label=True,
171 | batch_size=len(batch)
172 | )
173 |
174 | for alignment, result in zip(batch_alignments, results):
175 | labels_and_scores = list(zip(result["labels"], result["scores"]))
176 | filtered = [(label, score) for label, score in labels_and_scores if score >= min_confidence]
177 | top_labels = filtered[:top_k]
178 |
179 | alignment["passage_categories"] = [label for label, _ in top_labels]
180 | alignment["passage_categories_scores"] = [round(score, 3) for _, score in top_labels]
181 |
182 | if top_labels:
183 | classified_count += 1
184 |
185 | output_file.write(orjson.dumps(alignment) + b"\n")
186 | pbar.update(1)
187 |
188 | # Replace original with classified version
189 | import os
190 | os.replace(temp_output_path, input_path)
191 |
192 | print(f"Classification complete: {classified_count}/{num_lines} passages received category labels")
193 | print(f"(Passages with no labels had all scores below {min_confidence} threshold)")
194 |
195 | return num_lines
196 |
197 |
198 | if __name__ == "__main__":
199 | import asyncio
200 | import sys
201 |
202 | if len(sys.argv) < 2:
203 | print("Usage: python passage_classifier.py ")
204 | sys.exit(1)
205 |
206 | file_path = sys.argv[1]
207 |
208 | # Test with example categories
209 | test_classes = {
210 | "Satire & Humor": "Passages using irony, satire, humor, parody, or comical situations",
211 | "Religion & Spirituality": "Speech about faith, God, theology, scripture, church",
212 | "Philosophy": "Speech about morality, ethics, virtue, reason, metaphysics",
213 | }
214 |
215 | total = asyncio.run(classify_passages(
216 | file_path,
217 | "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
218 | test_classes,
219 | min_confidence=0.3,
220 | top_k=3
221 | ))
222 | print(f"Total passages processed: {total}")
223 |
--------------------------------------------------------------------------------
/lib/textpair/vector_space_alignment/structures.py:
--------------------------------------------------------------------------------
1 | """Data structures for vector space alignment"""
2 |
3 | from __future__ import annotations
4 |
5 | import os
6 | import sqlite3
7 | from collections.abc import Iterable
8 | from shutil import rmtree
9 | from typing import Callable
10 |
11 | import dill as pickle
12 | import msgspec
13 | import numpy as np
14 | import torch
15 | from msgspec import field
16 | from text_preprocessing import Tokens
17 |
18 | # Global constants for serialization and path management
19 | TEMP_DIR = os.getcwd()
20 | PHILO_TEXT_OBJECT_LEVELS = {"doc": 1, "div1": 2, "div2": 3, "div3": 4, "para": 5, "sent": 6, "word": 7}
21 |
22 |
23 | class PassageGroup(msgspec.Struct, array_like=True):
24 | """Text passage with all associated properties and vector representation"""
25 |
26 | start_byte: int = 0
27 | end_byte: int = 0
28 | filename: str = ""
29 | metadata: dict = {}
30 |
31 |
32 | class MergedGroup(msgspec.Struct, array_like=True):
33 | """A source and target PassageGroup pair with similarity"""
34 |
35 | source: PassageGroup = field(default_factory=PassageGroup)
36 | target: PassageGroup = field(default_factory=PassageGroup)
37 | similarity: float = 0.0
38 |
39 |
40 | # Msgpack encoders/decoders for serialization
41 | ENCODER = msgspec.msgpack.Encoder()
42 | DECODER = msgspec.msgpack.Decoder(type=MergedGroup)
43 |
44 |
45 | class DocumentChunks:
46 | """A generator with caching"""
47 |
48 | def __init__(self, docs: Iterable[list[str]], save_path: str, transform_function: Callable):
49 | self.docs = docs
50 | self.doc_list: list[list[str]] = []
51 | self.doc_count = 0
52 | self.generator_exhausted = False
53 | self.transform_function = transform_function
54 | self.corpus_type = self.transform_function.__qualname__.split(".")[0]
55 | self.path = os.path.join(TEMP_DIR, "output/chunks/", save_path)
56 | if os.path.exists(self.path):
57 | rmtree(self.path)
58 | os.makedirs(self.path, exist_ok=True)
59 |
60 | def __iter__(self) -> Iterable[str | list[str] | torch.Tensor | np.ndarray]:
61 | if self.generator_exhausted is False:
62 | if self.doc_count == 0:
63 | for doc in self.docs:
64 | doc = self.__format_doc(doc)
65 | self.__save(doc)
66 | self.doc_count += 1
67 | yield doc
68 | else:
69 | for doc_name in range(self.doc_count):
70 | yield self.__load(doc_name)
71 | for doc in self.docs:
72 | doc = self.__format_doc(doc)
73 | self.__save(doc)
74 | self.doc_count += 1
75 | yield doc
76 | self.generator_exhausted = True
77 | else:
78 | for doc_name in range(self.doc_count):  # replay the cached docs from disk
79 | yield self.__load(doc_name)
80 |
81 | def __save(self, doc: list[str] | str):
82 | filename = os.path.join(self.path, str(self.doc_count))
83 | if self.transform_function is None:  # no vectorizer: cache the raw doc only
84 | with open(filename, "wb") as output_file:
85 | pickle.dump(doc, output_file)
86 | elif self.corpus_type == "TransformerCorpus":
87 | transformed_doc = self.transform_function([doc])
88 | torch.save(transformed_doc, f"{filename}.pt")
89 | else:
90 | np.save(f"{filename}.npy", self.transform_function([doc]))
91 |
92 | def __load(self, doc_name) -> list[str] | torch.Tensor | np.ndarray:
93 | filename = os.path.join(self.path, str(doc_name))
94 | if self.transform_function is None:
95 | with open(filename, "rb") as input_file:
96 | doc = pickle.load(input_file)
97 | return doc
98 | elif self.corpus_type == "TransformerCorpus":
99 | return torch.load(f"{filename}.pt")
100 | return np.load(f"{filename}.npy")[0]
101 |
102 | def __get_doc(self, index: int) -> list[str] | torch.Tensor | np.ndarray:
103 | doc = None
104 | while index >= self.doc_count:  # generate and cache docs up to the requested index
105 | try:
106 | doc = next(self.docs)
107 | doc = self.__format_doc(doc)
108 | self.__save(doc)
109 | self.doc_count += 1
110 | except StopIteration as e:
111 | raise IndexError from e
112 | if doc is None:
113 | return self.__load(index)
114 | return doc
115 |
116 | def __getitem__(self, item: int | slice) -> list[str] | str | list[list[str] | str] | np.ndarray | torch.Tensor:
117 | if isinstance(item, slice):
118 | end = item.stop
119 | if item.stop > len(self): # avoid index out of range
120 | end = len(self)
121 | if self.transform_function is None or self.corpus_type == "Word2VecEmbeddingCorpus":
122 | return np.array([self.__get_doc(index) for index in range(item.start, end)])
123 | return torch.cat([self.__get_doc(index) for index in range(item.start, end)]) # type:ignore
124 | return self.__get_doc(item)
125 |
126 | def __format_doc(self, doc: list[str]) -> str:
127 | return " ".join(doc)
128 |
129 | def __len__(self):
130 | if self.generator_exhausted is False:
131 | for _ in self:
132 | pass
133 | return self.doc_count
134 |
135 |
136 | class Matches:
137 | """Matches cached to disk"""
138 |
139 | def __init__(self, matches: Iterable[MergedGroup]):
140 | self.path = os.path.join(TEMP_DIR, "output/results/matches")
141 | os.makedirs(self.path, exist_ok=True)
142 | self.count = 0
143 | if isinstance(matches, list) and matches:
144 | self.matches = matches
145 | self.is_cached = False
146 | self.count = len(self.matches)
147 | else:
148 | self.conn = sqlite3.connect(os.path.join(self.path, "matches.db"))
149 | self.cursor = self.conn.cursor()
150 | self.cursor.execute("DROP TABLE IF EXISTS matches")
151 | self.cursor.execute("CREATE TABLE matches (match_id INTEGER, match blob)")
152 | self.cursor.execute("CREATE INDEX match_id_index ON matches (match_id)")
153 | self.matches = None
154 | self.is_cached = True
155 | self.count = self.__save(matches) # save generator to disk
156 |
157 | def match_generator(self, new_matches):
158 | for match in new_matches:
159 | dump = ENCODER.encode(match)
160 | yield (self.count, dump)
161 | self.count += 1
162 |
163 | def extend(self, new_matches: Iterable[MergedGroup]):
164 | """Add new matches to existing matches"""
165 | encoded_matches = self.match_generator(new_matches)
166 | self.cursor.executemany("INSERT INTO matches VALUES (?, ?)", encoded_matches)
167 |
168 | def __save(self, matches):
169 | count = -1
170 | for count, match in enumerate(matches):
171 | dump = ENCODER.encode(match)
172 | self.cursor.execute("INSERT INTO matches VALUES (?, ?)", (count, dump))  # use the running index as match_id
173 | if count == -1:  # the generator yielded no matches
174 | return 0
175 | self.conn.commit()
176 | return count + 1
177 |
178 | def done(self):
179 | """Commit changes to database"""
180 | self.conn.commit()
181 | self.conn.close()
182 |
183 | @classmethod
184 | def load(cls):
185 | """Load instance of class by reading previously cached matches"""
186 | matches = []
187 | conn = sqlite3.connect(os.path.join(TEMP_DIR, "output/results/matches/matches.db"))
188 | cursor = conn.cursor()
189 | cursor.execute("SELECT match from matches ORDER BY match_id")
190 | for match in cursor:
191 | matches.append(DECODER.decode(match[0]))
192 | conn.close()
193 | return cls(matches)
194 |
195 | def __len__(self):
196 | return self.count
197 |
198 | def __iter__(self):
199 | if self.is_cached is False:
200 | for index in range(self.count):
201 | yield self.matches[index] # type: ignore
202 | else:
203 | self.cursor.execute("SELECT match FROM matches ORDER BY match_id")
204 | for match in self.cursor:
205 | yield DECODER.decode(match[0])
206 |
207 |
208 | # Lightweight, serializable data structure for efficient sentence searching.
209 | class TokenSearchData(msgspec.Struct):
210 | """A lightweight container for token data needed for sentence searching."""
211 | start_bytes: list[int]
212 | end_bytes: list[int]
213 | surface_forms: list[str]
214 | sentence_ids: list[str]
215 |
216 |
217 | def save_tokens(tokens: Tokens, parsed_filename: str):
218 | """
219 | Saves token search data to a cache file using msgpack serialization.
220 | """
221 | start_bytes = [token.ext['start_byte'] for token in tokens.tokens]
222 | end_bytes = [token.ext['end_byte'] for token in tokens.tokens]
223 | surface_forms = [token.surface_form for token in tokens.tokens]
224 | sentence_ids = [get_sentence_id(token) for token in tokens.tokens]
225 |
226 | search_data = TokenSearchData(
227 | start_bytes=start_bytes,
228 | end_bytes=end_bytes,
229 | surface_forms=surface_forms,
230 | sentence_ids=sentence_ids,
231 | )
232 |
233 | # Save the data to the cache file
234 | encoder = msgspec.msgpack.Encoder()
235 | with open(parsed_filename, "wb") as f:
236 | f.write(encoder.encode(search_data))
237 |
238 | def load_token_search_data(parsed_filename: str) -> TokenSearchData:
239 | """
240 | Loads token search data from a cache file previously written by
241 | save_tokens (msgpack-encoded TokenSearchData).
242 | """
243 | decoder = msgspec.msgpack.Decoder(TokenSearchData)
244 |
245 | with open(parsed_filename, "rb") as f:
246 | return decoder.decode(f.read())
247 |
248 |
249 | def find_token_index_by_byte(bytes: list[int], byte_offset: int) -> int:
250 | """
251 | Finds the index of the token at a given byte offset using binary search
252 | on a pre-computed list of start_bytes.
253 | """
254 | import bisect
255 | if not bytes:
256 | return -1
257 |
258 | # bisect_left finds the insertion point for the byte_offset.
259 | index = bisect.bisect_left(bytes, byte_offset)
260 |
261 | # If the offset is exactly a token's start, we found it.
262 | if index < len(bytes) and bytes[index] == byte_offset:
263 | return index
264 |
265 | # If the insertion point is 0, it must be the first token.
266 | if index == 0:
267 | return 0
268 |
269 | # Otherwise, the correct token is the one *before* the insertion point.
270 | return index - 1
271 |
272 |
273 | def get_sentence_id(token) -> str:
274 | """Extracts the sentence ID from a token's position string."""
275 | try:
276 | # The sentence ID is composed of the first 6 integers of the position string.
277 | return " ".join(token.ext['position'].split()[:6])
278 | except (AttributeError, KeyError, IndexError):
279 | return ""
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | "Nous ne faisons que nous entregloser" Montaigne wrote famously in his Essais... Since all we do is glose over what's already been written, we may as well build a tool to detect these intertextual relationships...
2 |
3 | # TextPAIR (Pairwise Alignment for Intertextual Relations)
4 |
5 | TextPAIR is a scalable and high-performance sequence aligner for humanities text analysis designed to identify "similar passages" in large collections of texts. These may include direct quotations, plagiarism and other forms of borrowings, commonplace expressions and the like. It is a complete rewrite and rethink of the original implementation released in 2009.
6 |
7 | While TextPAIR was developed in response to the fairly specific phenomenon of similar passages across literary works, the sequence analysis techniques employed in TextPAIR were developed in widely disparate fields, such as bioinformatics and computer science, with applications ranging from genome sequencing to plagiarism detection. TextPAIR generates a set of overlapping word sequence shingles for every text in a corpus, then stores and indexes that information to be analyzed against shingles from other texts. For example, the opening declaration from Rousseau's Du Contrat Social,
8 |
9 | `"L'homme est né libre, est partout il est dans les fers. Tel se croit le maître des autres, qui ne laisse pas d'être plus esclave qu'eux,"`
10 |
11 | would be rendered in trigram shingles (with lemmatization, accents flattened and function words removed) as:
12 |
13 | `homme_libre_partout, libre_partout_fer, partout_fer_croire, fer_croire_maitre, croire_maitre_laisser, maitre_laisser_esclave`
14 |
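For intuition, the shingling step amounts to sliding a fixed-size window over the normalized token stream and joining each window into a single string. The sketch below is illustrative only (the `shingles` helper and the pre-normalized token list are ours, not TextPAIR's actual API):

```python
def shingles(tokens: list[str], n: int = 3) -> list[str]:
    """Return overlapping n-gram shingles joined with underscores."""
    return ["_".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

# Tokens as they might look after lemmatization, accent flattening,
# and function-word removal (see the Rousseau example above).
tokens = ["homme", "libre", "partout", "fer", "croire", "maitre", "laisser", "esclave"]
print(shingles(tokens))
# ['homme_libre_partout', 'libre_partout_fer', 'partout_fer_croire',
#  'fer_croire_maitre', 'croire_maitre_laisser', 'maitre_laisser_esclave']
```
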
15 | Common shingles across texts indicate many different types of textual borrowings, from direct citations to more ambiguous and unattributed usages of a passage. Using a simple search form, the user can quickly identify similar passages shared between different texts in one database, or even across databases, such as in the example below.
16 |
17 | 
18 |
19 | ## Installation
20 |
21 | The recommended install is to build your own Docker image and run TextPAIR inside a container.
22 |
23 | ### Docker container method
24 |
25 | - Go to the docker folder and build a docker image: `docker build -t textpair .`
26 | - Start a new container: `docker run -td -p 80:80 --name textpair textpair init_textpair_db`
27 | Note that you may want to customize the `run` command according to your needs (e.g. to mount a volume for your data).
28 | You will need to copy your texts to the container, and then follow the normal procedure described below once inside the container.
29 |
30 | If you run into an issue where the web server does not respond, restart it with the following command:
31 | `/var/lib/text-pair/api_server/web_server.sh &`
32 |
33 | ### Manual installation
34 |
35 | If you wish to install TextPAIR on a host machine, note that TextPAIR will only run on 64-bit Linux; see the dependencies and install script below.
36 |
37 | #### Dependencies
38 |
39 | - Python 3.11 and up
40 | - Node and NPM
41 | - PostgreSQL: you will need to create a dedicated database and a user with read/write permissions on that database. You will also need to create the pg_trgm extension on that database by running the following command in the PostgreSQL shell as a superuser: `CREATE EXTENSION pg_trgm;`
42 |
43 | #### Install script
44 |
45 | See Ubuntu install instructions
46 |
47 | - Run the `install.sh` script. This should install all needed components.
48 | - Make sure you include `/etc/text-pair/apache_wsgi.conf` in your main Apache configuration file to enable searching
49 | - Edit `/etc/text-pair/global_settings.ini` to provide your PostgreSQL user, database, and password.
50 |
51 | ## Quick start
52 |
53 | Before running any alignment, make sure you edit your copy of `config.ini`. See [below](#configuring-the-alignment) for details.
54 |
55 | #### NOTE: source designates the source database from which reuses are deemed to originate, and target is the collection borrowing from source. In practice, the number of alignments won't vary significantly if you swap source and target
56 |
57 | The sequence aligner is executed via the `textpair` command. The basic command is:
58 | `textpair --config=/path/to/config [OPTIONS] [database_name]`
59 |
60 | `textpair` takes the following command-line arguments:
61 |
62 | - `--config`: This argument is required. It defines the path to the configuration file where preprocessing, matching, and web application settings are set
63 | - `--is_philo_db`: Defines whether files are from a PhiloLogic database. If set to `True`, metadata will be fetched using the PhiloLogic metadata index. Set to False by default.
64 | - `--output_path`: path to results
65 | - `--debug`: turn on debugging
66 | - `--workers`: Set number of workers/threads to use for parsing, ngram generation, and alignment.
67 | - `--update_db`: update database without rebuilding web_app. Should be used in conjunction with the --file argument
68 | - `--file`: alignment results file to load into database. Only used with the --update_db argument.
69 | - `--source_metadata`: source metadata needed for loading the database. Used only with the --update_db and --file arguments.
70 | - `--target_metadata`: target metadata needed for loading the database. Used only with the --update_db and --file arguments.
71 | - `--only_align`: Run alignment based on preprocessed text data from a previous alignment.
72 | - `--load_only_web_app`: Define whether to load results into a database viewable via a web application. Set to True by default.
73 | - `--skip_web_app`: skip loading results into a database and building the corresponding web app
74 |
75 | ## Configuring the alignment
76 |
77 | When running an alignment, you need to provide a configuration file to the `textpair` command.
78 | You can find a generic copy of the file in `/var/lib/text-pair/config/config.ini`.
79 | You should copy this file to the directory from which you are starting the alignment.
80 | Then you can start editing this file. Note that all parameters have comments explaining their role.
81 |
82 | While most values are reasonable defaults and don't require any edits, here are the most important settings you will want to check:
83 |
84 | #### In the TEXT_SOURCES section
85 |
86 | This is where you should define the paths for your source and target files. Note that if you define no target, files from the source will be compared to one another. In this case, files will be compared only when the source file is older than or from the same year as the target file. This is to avoid treating as a source a document which was written after the target.
87 | To leverage a PhiloLogic database to extract text and relevant metadata, point to the directory of the PhiloLogic DB used. You should then use the `--is_philo_db` flag.
88 | To link your TextPAIR web app to PhiloLogic databases (for source and target), set source_url and target_url.
89 |
90 | #### In the TEXT_PARSING section
91 |
92 | - `parse_source_files` and `parse_target_files`: both of these settings determine whether you want TextPAIR to parse your TEI files or not.
93 | Set to `yes` by default. If you are relying on parsed output from PhiloLogic, you will want to set this to `no` or `false`.
94 | - `source_file_type` and `target_file_type`: defines the type of text file: either TEI or plain text. If using plain text, you will need to supply a metadata file in the TEXT_SOURCES section
95 | - `source_words_to_keep` and `target_words_to_keep`: defines files containing lists of words (separated by a newline) which the parser should keep.
96 | Other words are discarded.
97 |
98 | #### In the Preprocessing section
99 |
100 | - `source_text_object_level` and `target_text_object_level`: Define the individual text object level used as the unit of comparison.
101 | Possible values are `doc`, `div1`, `div2`, `div3`, `para`, `sent`. This is only used when relying on a PhiloLogic database.
102 | - `ngram`: Size of your ngram. The default is 3, which seems to work well in most cases. A lower number tends to produce more uninteresting short matches.
103 | - `language`: This determines the language used by the Porter Stemmer as well as by Spacy (if using more advanced POS filtering features, lemmatization, or NER).
104 | Note that you should use language codes from the Spacy
105 | documentation.
106 | Note that there is a section on Vector Space Alignment preprocessing. These options are for the `vsa` matcher (see next section) only. It is not recommended that you use these at this time.
107 |
108 | #### In the Matching section
109 |
110 | Note that there are two different types of matching algorithms, with different parameters. The current recommended one is `sa` (for sequence alignment). The `vsa` algorithm is HIGHLY experimental, still under heavy development, and is not guaranteed to work.
111 |
112 | ## Run comparison between preprocessed files manually
113 |
114 | It is possible to run a comparison between documents without having to regenerate ngrams. In this case you need to use the
115 | `--only_align` argument with the `textpair` command.
116 |
117 | Example:
118 |
119 | ```console
120 | textpair --config=config.ini --only_align --workers=10 my_database_name
121 | ```
122 |
123 | ## Configuring the Web Application
124 |
125 | The `textpair` script automatically generates a Web Application, and does so by relying on the defaults configured in the `appConfig.json` file which is copied to the directory where the Web Application lives, typically `/var/www/html/text-pair/database_name`.
126 |
127 | #### Note on metadata naming: metadata fields extracted from the text files are prefixed with `source_` for source texts and `target_` for target texts.
128 |
129 | In this file, there are a number of fields that can be configured:
130 |
131 | - `webServer`: should not be changed as only Apache is supported for the foreseeable future.
132 | - `appPath`: this should match the WSGI configuration in `/etc/text-pair/apache_wsgi.conf`. Should not be changed without knowing how to work with `mod_wsgi`.
133 | - `databaseName`: Defines the name of the PostgreSQL database where the data lives.
134 | - `matchingAlgorithm`: DO NOT EDIT: tells the web app which matching method you used, and therefore impacts functionality within the Web UI.
135 | - `databaseLabel`: Title of the database used in the Web Application
136 | - `branding`: Defines links in the header
137 | - `sourcePhiloDBLink` and `targetPhiloDBLink`: Provide URL to PhiloLogic database to contextualize shared passages.
138 | - `sourceLabel` and `targetLabel` are the names of source DB and target DB. This field supports HTML tags.
139 | - `sourceCitation` and `targetCitation` define the bibliography citation in results. `field` defines the metadata field to use, and `style` is for CSS styling (using key/value for CSS rules)
140 | - `metadataFields` defines the fields available for searching in the search form for `source` and `target`.
141 | `label` is the name used in the form and `value` is the actual name of the metadata field as stored in the SQL database.
142 | - `facetFields` works the same way as `metadataFields` but for defining which fields are available in the faceted browser section.
143 | - `timeSeriesIntervals` defines the time intervals available for the time series functionality.
144 | - `banalitiesStored` DO NOT EDIT: defines whether banalities (formulaic passages) have been stored.
145 |
146 | Once you've edited these fields to your liking, you can rebuild the web application by running the `npm run build` command from the directory where the `appConfig.json` file is located.
147 |
148 | Built with support from the Mellon Foundation and the Fondation de la Maison des Sciences de l'Homme.
149 |
150 | ## Post processing alignment results
151 |
152 | TextPAIR produces two (or three if passage filtering is enabled) different files (found in the `output/results/` directory) as a result of each alignment task:
153 |
154 | - The `alignments.jsonl` file: this contains all alignments which were found by TextPAIR. Each line is formatted as an individual JSON string.
155 | - The `duplicate_files.csv` file: this contains a list of potential duplicate files TextPAIR identified between the source and target databases.
156 | - The `filtered_passages` file: lists source passages which were filtered out based on phrase matching. Only generated if a file containing passages to filter has been provided.
157 |
158 | These files are designed to be used for further inspection of the alignments, and potential post processing tasks such as alignment filtering or clustering.
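
As a starting point for such post-processing, each line of `alignments.jsonl` can be parsed independently. Here is a minimal sketch (assuming the uncompressed file; if your pipeline writes an LZ4-compressed variant, open it with `lz4.frame.open` instead, and adjust the field names to whatever your alignment run actually produced):

```python
import json

# Stream the alignment records one JSON object per line and print a short preview.
with open("output/results/alignments.jsonl", encoding="utf-8") as alignments:
    for line in alignments:
        record = json.loads(line)
        source = record.get("source_passage", "")[:80]
        target = record.get("target_passage", "")[:80]
        print(f"{source} ==> {target}")
```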
159 |
--------------------------------------------------------------------------------
/extras/restore_database.py:
--------------------------------------------------------------------------------
1 | """Restores TextPAIR database and web files from a backup tarball, and rebuilds the web application."""
2 |
3 | import json
4 | import os
5 | import shutil
6 | import subprocess
7 | from argparse import ArgumentParser
8 | from configparser import ConfigParser
9 | from pathlib import Path
10 |
11 | import lz4.frame
12 | import psycopg2
13 |
14 | GLOBAL_CONFIG = ConfigParser()
15 | GLOBAL_CONFIG.read("/etc/text-pair/global_settings.ini")
16 |
17 |
18 | def check_database_connection(user, password):
19 | """Test database connection and permissions."""
20 | try:
21 | conn = psycopg2.connect(
22 | database=GLOBAL_CONFIG.get("DATABASE", "database_name"),
23 | user=user,
24 | password=password
25 | )
26 | conn.close()
27 | return True
28 | except psycopg2.OperationalError as e:
29 | print(f"Database connection error: {e}")
30 | return False
31 |
32 |
33 | def update_app_config(web_app_path):
34 | """
35 | Update the appConfig.json file with the API server from global settings
36 | and update PhiloLogic paths to point to the backed up data.
37 | Returns True if successful, False otherwise.
38 | """
39 | try:
40 | config_path = web_app_path / "appConfig.json"
41 | if not config_path.exists():
42 | print(f"Warning: appConfig.json not found at {config_path}")
43 | return False
44 |
45 | # Read the current config
46 | with open(config_path) as f:
47 | config = json.load(f)
48 |
49 | # Update the apiServer value
50 | api_server = GLOBAL_CONFIG.get("WEB_APP", "api_server")
51 | config['apiServer'] = api_server
52 |
53 | # Update PhiloLogic paths to point to the restored data
54 | source_data_path = web_app_path / "source_data"
55 | if source_data_path.exists():
56 | config['sourcePhiloDBPath'] = str(source_data_path.absolute())
57 |
58 | target_data_path = web_app_path / "target_data"
59 | if target_data_path.exists():
60 | config['targetPhiloDBPath'] = str(target_data_path.absolute())
61 | elif 'targetPhiloDBPath' in config:
62 | # If target_data doesn't exist but a target path was set, clear it
63 | config['targetPhiloDBPath'] = ""
64 |
65 | # Write the updated config back
66 | with open(config_path, 'w') as f:
67 | json.dump(config, f, indent=2)
68 |
69 | print(f"Updated appConfig.json:")
70 | print(f" - apiServer: {api_server}")
71 | print(f" - sourcePhiloDBPath: {config['sourcePhiloDBPath']}")
72 | if config.get('targetPhiloDBPath'):
73 | print(f" - targetPhiloDBPath: {config['targetPhiloDBPath']}")
74 |
75 | return True
76 |
77 | except Exception as e:
78 | print(f"Error updating appConfig.json: {e}")
79 | return False
80 |
81 |
82 | def run_npm_build(web_app_path):
83 | """
84 | Run npm install and build in the web app directory.
85 | Returns True if successful, False otherwise.
86 | """
87 | try:
88 | # Change to web app directory
89 | original_dir = os.getcwd()
90 | os.chdir(web_app_path)
91 |
92 | # Run npm install
93 | print("Running npm install...")
94 | subprocess.run(['npm', 'install'], check=True)
95 |
96 | # Run npm build
97 | print("Running npm run build...")
98 | subprocess.run(['npm', 'run', 'build'], check=True)
99 |
100 | return True
101 |
102 | except subprocess.CalledProcessError as e:
103 | print(f"Error during npm build process: {e}")
104 | return False
105 | except Exception as e:
106 | print(f"Unexpected error during build process: {e}")
107 | return False
108 | finally:
109 | # Always return to original directory
110 | os.chdir(original_dir)
111 |
112 |
113 | def check_existing_resources(db_name, db_user, db_password, web_app_dest, backup_dir):
114 | """Check for existing database tables and web app directory."""
115 | existing_resources = []
116 |
117 | # Check for existing tables
118 | sql_files = list(backup_dir.glob("textpair_*.sql"))
119 | for sql_file in sql_files:
120 | table_name = sql_file.stem.replace('textpair_', '')
121 | with psycopg2.connect(database=db_name, user=db_user, password=db_password) as conn:
122 | with conn.cursor() as cursor:
123 | cursor.execute(
124 | "SELECT 1 FROM information_schema.tables WHERE table_name = %s",
125 | (table_name,)
126 | )
127 | if cursor.fetchone() is not None:
128 | existing_resources.append(f"database table '{table_name}'")
129 |
130 | # Check for existing web app directory
131 | web_dirs = [d for d in backup_dir.iterdir() if d.is_dir()]
132 | if web_dirs and (web_app_dest / web_dirs[0].name).exists():
133 | existing_resources.append(f"web application directory '{web_dirs[0].name}'")
134 |
135 | return existing_resources
136 |
137 |
138 | def restore_textpair_database(backup_path, web_app_dest=None, force=False):
139 | """
140 | Restore TextPAIR database and web files from a backup tarball.
141 |
142 | Args:
143 | backup_path: Path to the backup tarball
144 | web_app_dest: Optional destination for web app files. If not provided,
145 | uses the path from global_settings.ini
146 | force: If True, overwrite existing files/tables without prompting
147 | """
148 | print(f"\nStarting TextPAIR restoration from: {backup_path}")
149 |
150 | db_name = GLOBAL_CONFIG.get("DATABASE", "database_name")
151 | db_user = GLOBAL_CONFIG.get("DATABASE", "database_user")
152 | db_password = GLOBAL_CONFIG.get("DATABASE", "database_password")
153 |
154 | # Check database connection before proceeding
155 | print("\nChecking database connection...")
156 | if not check_database_connection(db_user, db_password):
157 | raise Exception("Cannot connect to database. Please check credentials and permissions.")
158 | print("✓ Database connection verified")
159 |
160 | backup_path = Path(backup_path)
161 | if not backup_path.exists():
162 | raise FileNotFoundError(f"Backup file not found: {backup_path}")
163 |
164 | # Create temporary directory for extraction
165 | print("\nPreparing temporary workspace...")
166 | temp_dir = Path("/tmp/textpair_restore_temp")
167 | if temp_dir.exists():
168 | print(" - Cleaning up existing temporary files...")
169 | shutil.rmtree(temp_dir)
170 | temp_dir.mkdir()
171 | print("✓ Workspace prepared")
172 |
173 | restored_web_app_path = None
174 |
175 | try:
176 | # Extract the tarball using lz4 module
177 | print("\nExtracting backup archive...")
178 | print(" - Decompressing with LZ4...")
179 | with open(backup_path, 'rb') as f:
180 | compressed_data = f.read()
181 | decompressed_data = lz4.frame.decompress(compressed_data)
182 | print(" - Extracting files...")
183 | temp_tar = temp_dir / "temp.tar"
184 | with open(temp_tar, 'wb') as f:
185 | f.write(decompressed_data)
186 | os.system(f"tar xf {temp_tar} -C {temp_dir}")
187 | os.remove(temp_tar)
188 | print("✓ Backup extracted successfully")
189 |
190 | backup_contents = list(temp_dir.iterdir())
191 | if not backup_contents:
192 | raise Exception("Backup archive appears to be empty")
193 |
194 | backup_dir = backup_contents[0]
195 | if not backup_dir.is_dir():
196 | raise Exception("Unexpected backup structure")
197 |
198 | # Set up web app destination path
199 | if not web_app_dest:
200 | web_app_dest = Path(GLOBAL_CONFIG.get("WEB_APP", "web_app_path"))
201 | else:
202 | web_app_dest = Path(web_app_dest)
203 |
204 | # Check for existing resources
205 | if not force:
206 | print("\nChecking for existing resources...")
207 | existing = check_existing_resources(db_name, db_user, db_password, web_app_dest, backup_dir)
208 | if existing:
209 | print("\nWARNING: The following resources will be overwritten:")
210 | for resource in existing:
211 | print(f" - {resource}")
212 | response = input("\nDo you want to proceed with the restoration? This will replace all existing resources (y/n): ")
213 | if response.lower() != 'y':
214 | print("Restoration cancelled")
215 | return
216 | print("") # Empty line for better readability
217 |
218 | # Restore database tables
219 | sql_files = list(backup_dir.glob("textpair_*.sql"))
220 | if not sql_files:
221 | raise Exception("No SQL files found in backup")
222 |
223 | print("\nRestoring database tables...")
224 | print(f"Found {len(sql_files)} tables to restore")
225 |
226 | for sql_file in sql_files:
227 | table_name = sql_file.stem.replace('textpair_', '')
228 |
229 | # Drop existing table if it exists
230 | print(f" - Processing {table_name}:")
231 | print(f" • Dropping existing table if present...")
232 | with psycopg2.connect(database=db_name, user=db_user, password=db_password) as conn:
233 | with conn.cursor() as cursor:
234 | cursor.execute(f"DROP TABLE IF EXISTS {table_name} CASCADE")
235 | conn.commit()
236 |
237 | # Restore table
238 | print(f" • Restoring table data...")
239 | os.system(f'PGPASSWORD={db_password} psql -U {db_user} -d {db_name} -f {sql_file}')
240 | print(f" ✓ Table {table_name} restored")
241 |
242 | print("✓ Database restoration complete")
243 |
244 | # Restore web app files
245 | web_dirs = [d for d in backup_dir.iterdir() if d.is_dir()]
246 | if web_dirs:
247 | web_app_dir = web_dirs[0]
248 | web_app_dest = web_app_dest / web_app_dir.name
249 |
250 | if web_app_dest.exists():
251 | print(f"\nRemoving existing web application at {web_app_dest}...")
252 | shutil.rmtree(web_app_dest)
253 |
254 | print(f"Copying web application files...")
255 | shutil.copytree(web_app_dir, web_app_dest)
256 | restored_web_app_path = web_app_dest
257 | print("✓ Web application files restored")
258 |
259 | # Update app configuration and rebuild web application if it was restored
260 | if restored_web_app_path:
261 | print("\nConfiguring web application...")
262 | if not update_app_config(restored_web_app_path):
263 | print("Failed to update web application configuration")
264 | if not force:
265 | raise Exception("Web application configuration update failed")
266 | print("✓ Configuration updated")
267 |
268 | print("\nRebuilding web application...")
269 | print(" - Installing dependencies...")
270 | if run_npm_build(restored_web_app_path):
271 | print("✓ Web application rebuilt successfully")
272 | else:
273 | print("✗ Failed to rebuild web application")
274 | if not force:
275 | raise Exception("Web application build failed")
276 |
277 | print("\n✓ Restore completed successfully!")
278 | db_url = GLOBAL_CONFIG.get("WEB_APP", "api_server").replace("-api", "").rstrip("/") + "/" + web_app_dest.name  # string ops: Path would collapse the "//" in the URL scheme
279 | print(f"The database is viewable at: {db_url}")
280 |
281 | finally:
282 | # Clean up
283 | print("\nCleaning up...")
284 | if temp_dir.exists():
285 | shutil.rmtree(temp_dir)
286 | os.remove(backup_path)
287 | print("✓ Cleanup completed")
288 |
289 |
290 | if __name__ == "__main__":
291 | parser = ArgumentParser()
292 | parser.add_argument("backup_path", type=str, help="Path to the backup tarball file")
293 | parser.add_argument("--web_app_dest", type=str, default="",
294 | help="Optional destination path for web app files")
295 | parser.add_argument("--force", action="store_true",
296 | help="Overwrite existing files/tables without prompting")
297 | args = parser.parse_args()
298 |
299 | restore_textpair_database(args.backup_path, args.web_app_dest, args.force)
--------------------------------------------------------------------------------
/web-app/src/components/passagePair.vue:
--------------------------------------------------------------------------------
(template markup stripped during extraction; recoverable text below)
Frequency by {{ facetResults.facet.split("_")[1] }}
Showing top 100 results
{{ field.field || "N/A" }}: {{ field.count.toLocaleString() }}
--------------------------------------------------------------------------------
/config/config.ini:
--------------------------------------------------------------------------------
1 | ########################
2 | ## CONFIGURATION FILE ##
3 | ########################
4 |
5 | [TEXT_SOURCES]
6 | # Path to source files. This can be a path to TEI or plain text files or a path to a PhiloLogic database.
7 | source_file_path =
8 |
9 | # Path to metadata for plain text source files. Needs to be a CSV or TSV file with at least the filename as metadata
10 | source_metadata =
11 |
12 | # Path to target files. This can be a path to TEI or plain text files or a path to a PhiloLogic database.
13 | target_file_path =
14 |
15 | # Path to metadata for plain text target files. Needs to be a CSV or TSV file with at least the filename as metadata
16 | target_metadata =
17 |
18 | # For backwards compatibility. Will be removed in future versions
19 | source_url =
20 | target_url =
21 |
22 | [TEXT_PARSING]
23 | ##########################################################################
24 | ## If TEI parsing was not done by PhiloLogic, you can parse your source ##
25 | ## and target files directly from TextPAIR ##
26 | ##########################################################################
27 |
28 | # Defines whether to parse source files
29 | parse_source_files = yes
30 |
31 | # Source files type: specify tei for TEI files, and plain_text for plain-text files.
32 | source_file_type = tei
33 |
34 | # Defines path to file containing words to keep (useful for dirty OCR)
35 | # Default is keeping all words
36 | source_words_to_keep = all
37 |
38 | # Defines whether to parse target files
39 | parse_target_files = yes
40 |
41 | # Target files type: specify tei for TEI files, and plain_text for plain-text files.
42 | target_file_type = tei
43 |
44 | # Defines path to file containing words to keep (useful for dirty OCR)
45 | # Default is keeping all words
46 | target_words_to_keep = all
47 |
48 |
49 | [PREPROCESSING]
50 | # Defines what object type to divide each text into
51 | # Useful to break up a single document into smaller text units
52 | source_text_object_type = doc
53 | target_text_object_type = doc
54 |
55 | # Defines how many tokens constitute a ngram
56 | ngram = 3
57 |
58 | # Defines the size of the gap authorized within an ngram. If not 0, this will generate multiple ngrams within a window of size ngram+gap
59 | # Note that you may need to adjust your minimum number of ngrams for matches to avoid short matches as a result.
60 | # USE WITH CAUTION as this will multiply the RAM usage for your alignment
61 | gap = 0
62 |
63 | # The word order must be respected
64 | word_order = yes
65 |
66 | # Language: set the language for various normalization tasks
67 | # such as stemming, lemmatizing, word mapping...etc
68 | language =
69 |
70 | # Language for target corpus: only set if your source and target corpus are in a different language
71 | # USE ONLY with vsa with transformer vectorization using a multilingual model
72 | target_language =
73 |
74 | # Modernize language if modernization is available for your language: currently only French is supported.
75 | modernize = yes
76 |
77 | # Transliterate characters to closest ascii representation.
78 | ascii = no
79 |
80 | # Stem words using the Porter Stemmer
81 | stemmer = yes
82 |
83 | # Lemmatizer: path to lemmatizer file where each line contains the inflected form and
84 | # the corresponding lemma separated by a tab. If set to spacy, make sure to also set spacy_model
85 | lemmatizer =
86 |
87 | # Lowercase words
88 | lowercase = yes
89 |
90 | # Remove numbers
91 | numbers = yes
92 |
93 | # Minimum word length
94 | minimum_word_length = 2
95 |
96 | # Stopwords: path to stopword list
97 | stopwords =
98 |
99 | # Define a language model to use for lemmatization, and POS tagging
100 | # See https://spacy.io/models for available models. Make sure to download the model first
101 | spacy_model =
102 |
103 | # Parts-of-speech to keep: specify which parts of speech to keep. Use Universal POS tag notation. See here for a complete list:
104 | # https://universaldependencies.org/docs/u/pos/
105 | # Separate each pos to keep by a comma
106 | pos_to_keep =
107 |
108 | #######################################################################
109 | ### VECTOR SPACE ALIGNMENT preprocessing options: VERY EXPERIMENTAL ###
110 | #######################################################################
111 |
112 | # If set to n_token, a text object is constituted of n tokens, where n is min_text_object_length.
113 | # if set to text_object, text objects are defined by their level in the OHCO hierarchy as defined in source_text_object_type and
114 | # target_text_object_type.
115 | text_object_definition = n_token
116 |
117 | # Minimum size of text object length to be counted as a chunk
118 | min_text_object_length = 10
119 |
120 | # Defines how many text objects should constitute a text chunk used for similarity comparison.
121 | n_chunk = 3
122 |
123 | # Vectorization method: either tfidf, w2v, or transformer
124 | vectorization = tfidf
125 |
126 | # Minimum frequency of token: expressed as a floating number between 0 and 1
127 | min_freq = 0.05
128 |
129 | # Maximum frequency of token: expressed as a floating number between 0 and 1
130 | max_freq = 0.9
131 |
132 | # Model used for creating a document embedding: required if using w2v or transformer vectorization
133 | # if using w2v vectorization, use a Spacy model
134 | # if using transformer, use a Hugging Face transformer model (supported by sentence-transformers)
135 | # Default: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 (small, fast, 50+ languages)
136 | # For better quality, consider language-specific models: https://huggingface.co/models?library=sentence-transformers&sort=downloads
137 | # Examples: CATIE-AQ/camembert-base-embedding (French), intfloat/multilingual-e5-large (high quality multilingual)
138 | embedding_model =
139 |
140 |
141 | [LLM_PARAMS]
142 | ##############################
143 | ## GENERATIVE AI SETTINGS ##
144 | ##############################
145 | # Generative AI model used to re-evaluate the similarity of passage pairs
146 | # Uses Llama-cpp under the hood so requires a local model or a Hugging Face model
147 | llm_model =
148 |
149 | # Context window size for the LLM model. Must be equal to or smaller than the model's context window
150 | # Note that you may also be limited by your GPU VRAM if using a GPU
151 | llm_context_window = 8192
152 |
153 | # llm server port: only change if you have a port conflict
154 | llm_port = 8080
155 |
156 | # Concurrency limit for LLM requests: increase if you have the RAM/VRAM to handle more concurrent requests
157 | llm_concurrency_limit = 8
158 |
159 |
160 | [MATCHING]
161 | ########################
162 | ## PROCESSING OPTIONS ##
163 | ########################
164 |
165 | # Matching algorithm used to find similar passage pairs. Current options are
166 | # sa (for sequence alignment), and vsa (for vector space alignment).
167 | # DON'T USE vsa at this time, it may not work at all.
168 | matching_algorithm = sa
169 |
170 | # Sort files prior to matching. This may be important when wanting to avoid
171 | # comparing a source file with a target file that occurs later in time
172 | sort_by = year
173 |
174 | # Defines in how many batches your source or target corpus will be loaded: useful if your corpus is too big to fit in RAM
175 | # The default of 1 is to process the whole corpus at once.
176 | source_batch = 1
177 | target_batch = 1
178 |
179 | # Size of left and right context in bytes
180 | context_size = 300
181 |
182 | #########################
183 | ## MATCHING PARAMETERS ##
184 | #########################
185 |
186 | # Size of ngram window to be initially evaluated in the sequence aligner
187 | matching_window_size = 30
188 |
189 | # Minimum number of shared ngrams between docs to start a comparison
190 | minimum_matching_ngrams_in_docs = 4
191 |
192 | # Percentage of shared ngrams between 2 docs to consider the target as a duplicate of source
193 | duplicate_threshold = 80
194 |
195 | # Minimum number of matching ngrams in ngram window
196 | minimum_matching_ngrams_in_window = 4
197 |
198 | # Maximum gap authorized between matching ngrams
199 | max_gap = 15
200 |
201 | # Minimum number of matching ngrams to constitute a match
202 | minimum_matching_ngrams = 4
203 |
204 | # Automatically increase max_gap once minimum_matching_ngrams is reached
205 | flex_gap = true
206 |
207 | # ONLY FOR VSA: defines similarity threshold for initial matching. Value between 0 and 1, with values closer to one
208 | # meaning higher similarity.
209 | min_similarity = 0.5
210 |
211 | # ONLY FOR VSA: minimum number of matching words: this is to make sure your match is not based on just a couple of
212 | # highly weighted words
213 | min_matching_words = 5
214 |
215 | # Use LLM to re-evaluate similarity of passage pairs found by the initial matching algorithm
216 | llm_eval = false
217 |
218 | # Similarity threshold for the LLM to keep a passage pair as a match
219 | # if no value is provided, will default to min_similarity used for initial matching
220 | llm_similarity_threshold = 0.75
221 |
222 | # Output the reasoning of the LLM for each evaluation to a debug file
223 | # Debug file will be in output/debug/llm_evaluations.txt
224 | llm_debug = false
225 |
226 | ###################################
227 | ## PASSAGE MERGING AND EXTENDING ##
228 | ###################################
229 |
230 | # Merge passages within n number of bytes: n is determined by the passage length and the passage_distance_multiplier option.
231 | merge_passages_on_byte_distance = true
232 |
233 | # Combine passages which are within (multiplier * length of previous passage) bytes. Needs merge_passages_on_byte_distance set to true
234 | passage_distance_multiplier = 0.5
235 |
236 | # Merge passages within n number of ngrams: the value used is the matching_window_size defaulting to 20
237 | merge_passages_on_ngram_distance = true
238 |
239 | #################################
240 | ## BANALITY DETECTION SETTINGS ##
241 | #################################
242 | # Whether to automatically detect banalities (formulaic expressions)
243 | banality_auto_detection = true
244 |
245 | # Whether to use the LLM to re-evaluate banalities detected by the automatic detection
246 | banality_llm_post_eval = false
247 |
248 | # Percentage of the most frequent ngrams in the corpus to use as the 'common ngrams' set for banality detection.
249 | # Lower values = only the most frequent ngrams are considered common (more selective).
250 | # The value is expressed as a percentage.
251 | most_common_ngram_proportion = 0.1
252 |
253 | # Threshold expressed as a percentage of a passage's ngrams flagged as common:
254 | # a passage is flagged as a banality if this percentage or more of its ngrams
255 | # are from the common ngrams set.
256 | common_ngram_threshold = 50
257 |
258 | # Whether to store or dismiss formulaic expressions. If not stored, these are
259 | # saved in a separate file for further examination
260 | store_banalities = true
261 |
262 | # Path to file containing phrases used to flag banalities and non-interesting matches
263 | # Note that all matches are removed and saved in a separate file
264 | # Also of note, this filter will remove any passage which contains an instance of a phrase
265 | phrase_filter =
266 |
267 |
268 | [PASSAGE_CLASSIFICATION]
269 | ###################################
270 | ## PASSAGE_CLASSIFICATION SETTINGS ##
271 | ###################################
272 | # Whether to classify passages into thematic categories using a zero-shot transformer model
273 | classify_passage = false
274 |
275 | # Zero-shot model to use for classification. Should be a Hugging Face model compatible with the pipeline
276 | # See https://huggingface.co/models?pipeline_tag=zero-shot-classification
277 | zero_shot_model =
278 |
279 | # Define each class and the criteria used to classify matches, following the model below
280 | Satire & Humor = "Passages primarily using irony, satire, humor, parody, or comical situations to critique or entertain. Focus on the context of enunciation: the text must clearly mock or parody an important theme; stylistic choices are aimed at creating comic effects and exaggeration. It is not enough for the text to have a polemical intent; it must also display stylistic qualities that reveal a comic détournement of the original idea, a clear intention to ironize.",
281 | Religion & Spirituality = "Speech about faith, God, theology, scripture, church, sin, redemption, prayer, miracles, saints, religious practice, religious doubt, mysticism. It is not enough for this theme to be merely present — what matters is that it becomes the object of explicit reflection.",
282 | Philosophy = "Speech about morality, ethics, virtue, reason, metaphysics, logic, existence, knowledge, truth, justice (as a concept), free will, nature of humanity, ethical dilemmas. It is not enough for this theme to be merely present — what matters is that it becomes the object of explicit reflection.",
283 | Politics, Law, & Governance = "Speech about power and its nature, the state, specific laws/decrees (their content), rights, citizenship and citizen participation, social order, revolution, political factions, diplomacy, governance, monarchy, republic. It is not enough for this theme to be merely present — what matters is that it becomes the object of explicit reflection.",
284 | History & Memory = "Passages describing battles, military life, strategy, soldiers, heroism, the impacts of war, civil unrest, duels. They could include references to specific historical events or figures, chronicles, discussion of the past, memory, tradition, national identity. Unlike the Social & Cultural Commentary category, it is important that the focus is placed on major events, prominent figures, and the great themes in the history of nations or peoples.",
285 | Social & Cultural Commentary = "Observations or critiques of society, class structure, customs, manners, social norms, inequality, poverty, public behavior, specific social groups. The themes can also be other ones, relating of everyday activities, work, food, clothing, housing, common rituals, non-political/non-religious customs to practices of love or marriage, family and friendship, funeral practices or the sense of time. Even accounts of journeys, voyages, geographical discoveries, descriptions of foreign lands or peoples fall into this category. Unlike the History & Memory category, the focus is more strictly sociological, concerning the lives of individuals, their concrete practices, and their ideas about everyday life.",
286 | Nature & Science = "Any purely descriptions of the natural world (landscapes, animals, weather), natural philosophy, scientific thought, discovery, medicine, technology. Focus on the strictly descriptive aspect: whether it is the description of nature or of scientific practices, it should appear as detached as possible. The presence of appropriate naturalistic or scientific vocabulary is central to assigning the text to this category. Pay close attention to the context of enunciation — the text should clearly present itself as objective, observational, and uninvolved.",
287 | Art & Literature = "Discussions, analysis, commentary, or critique about literature (authors, works, characters, genres, style, rhetoric) or other art forms (visual arts, music, theatre, aesthetics, artists). Do not focus on the literary quality of the text, but only on its metatextual aspect — on the development of commentary and analysis concerning literature and art."
--------------------------------------------------------------------------------