├── .github
│   └── workflows
│       └── python-publish.yml
├── .gitignore
├── Greek_variety_classification
│   ├── Ancient_greek_or_not.ipynb
│   ├── Greek_variety_classifier.ipynb
│   ├── ancient_greek_filter_dataset.csv
│   ├── greek_classification_dataset.csv
│   ├── models
│   │   ├── Ancient_Gr_classifier_model.zip
│   │   └── Gr_Var_Classifier_model.zip
│   └── preprocessing
│       └── clean_data_with_mask.py
├── README.md
├── dataset_progress.md
├── pipeline
│   ├── LICENSE.md
│   ├── MANIFEST.in
│   ├── README.md
│   ├── pyproject.toml
│   ├── refactoring_todo.md
│   ├── scripts
│   │   ├── concurrent_downloader.py
│   │   ├── sample_for_training.py
│   │   └── test_section_reconstruction.py
│   └── src
│       └── glossapi
│           ├── __init__.py
│           ├── corpus.py
│           ├── gloss_downloader.py
│           ├── gloss_extract.py
│           ├── gloss_section.py
│           ├── gloss_section_classifier.py
│           ├── models
│           │   ├── kmeans_weights.joblib
│           │   └── section_classifier.joblib
│           ├── parquet_schema.py
│           └── sampler.py
├── refactoring_plan.md
├── requirements.txt
└── scraping
    ├── download_and_extract_scripts
    │   ├── __pycache__
    │   │   ├── downloader_app.cpython-310.pyc
    │   │   └── extractor_app.cpython-310.pyc
    │   └── downloader.py
    ├── json_sitemaps
    │   ├── anodos_pdf.json
    │   ├── boithimata-glossas-G-Lyk_pdf.json
    │   ├── cyprus-exams_pdf.json
    │   ├── ebooks_list_pdf.json
    │   ├── greek-language_pdf.json
    │   ├── kallipos_pdf.json
    │   ├── kentra-ekpaideusis-enhlikwn_pdf.json
    │   ├── kodiko_pdf.json
    │   ├── pergamos_list_pdf.json
    │   ├── sitemap_explainer.txt
    │   ├── themata-lyseis-panelladikwn_pdf.json
    │   └── trapeza-thematwn_doc.json
    └── test_script.py
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using GitHub Actions when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-repositories
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | workflow_dispatch:
8 | release:
9 | types: [published]
10 |
11 | jobs:
12 | deploy:
13 | runs-on: ubuntu-latest
14 | permissions:
15 | # IMPORTANT: this permission is mandatory for trusted publishing
16 | id-token: write
17 | contents: read
18 |
19 | steps:
20 | - uses: actions/checkout@v3
21 | - name: Set up Python
22 | uses: actions/setup-python@v4
23 | with:
24 | python-version: '3.x'
25 | - name: Copy README to pipeline directory
26 | run: |
27 | cp README.md pipeline/
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | pip install build
32 | - name: Build package
33 | run: |
34 | cd pipeline
35 | python -m build
36 | - name: Publish package
37 | uses: pypa/gh-action-pypi-publish@release/v1
38 | with:
39 | packages-dir: pipeline/dist/
40 | password: ${{ secrets.PYPI_API_TOKEN }}
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tokenization/preprocessing/preprocessor
2 | tokenization/preprocessing/text
3 | tokenization/preprocessing/text.txt
4 | tokenization/preprocessing/re_text.txt
5 | tokenization/new.bin
6 | tokenization/new.txt
7 | tokenization/text.bin
8 | tokenization/text.txt
9 | tokenization/tokenize
10 | tokenization/tokenizer
11 | .gitignore
12 | tokenization/freqency.txt
13 | tokenization/paper_1.txt
14 | tokenization/cleaned_filtered_extracted_txt
15 | tokenization/cleaned_filtered_extracted_txt_v2/*
16 | gutenberg_books
17 | clean_books
18 | # Python build artifacts
19 | __pycache__/
20 | *.py[cod]
21 | *.class
22 | *.so
23 | .Python
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 |
40 |
--------------------------------------------------------------------------------
/Greek_variety_classification/Ancient_greek_or_not.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "e7063190-3c62-4d2b-9f17-cd4d6697e233",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [],
11 | "source": [
12 | "import os\n",
13 | "os.environ[\"WANDB_DISABLED\"] = \"true\"\n",
14 | "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "id": "3e971008-c8ea-431c-a84b-52a8531cc4b6",
21 | "metadata": {
22 | "tags": []
23 | },
24 | "outputs": [
25 | {
26 | "name": "stderr",
27 | "output_type": "stream",
28 | "text": [
29 | "2024-09-06 07:39:08.755777: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
30 | "2024-09-06 07:39:08.755873: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
31 | "2024-09-06 07:39:08.758162: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
32 | "2024-09-06 07:39:08.773280: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
33 | "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
34 | "2024-09-06 07:39:10.593585: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "import pandas as pd\n",
40 | "import numpy as np\n",
41 | "from sklearn.model_selection import train_test_split\n",
42 | "from transformers import AutoTokenizer, TrainingArguments, Trainer\n",
43 | "import torch\n",
44 | "from torch import nn\n",
45 | "from transformers import AutoModel, AutoConfig\n",
46 | "from sklearn.preprocessing import LabelEncoder\n",
47 | "from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix\n",
48 | "import seaborn as sns\n",
49 | "import matplotlib.pyplot as plt"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "id": "9be316ae-3ace-45c9-84c1-69f06b6a85d7",
56 | "metadata": {
57 | "tags": []
58 | },
59 | "outputs": [],
60 | "source": [
61 | "new_data = pd.read_csv(\"dataset_Sep_3_masked.csv\", sep=\",\", engine=\"python\")"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "id": "f17d1888-6356-4b4e-9a6a-eb173e8cc870",
68 | "metadata": {
69 | "tags": []
70 | },
71 | "outputs": [
72 | {
73 | "data": {
74 | "application/vnd.jupyter.widget-view+json": {
75 | "model_id": "edf0731e88f040fca8d4f9d82d7d4e32",
76 | "version_major": 2,
77 | "version_minor": 0
78 | },
79 | "text/plain": [
80 | "tokenizer_config.json: 0%| | 0.00/2.00 [00:00, ?B/s]"
81 | ]
82 | },
83 | "metadata": {},
84 | "output_type": "display_data"
85 | },
86 | {
87 | "data": {
88 | "application/vnd.jupyter.widget-view+json": {
89 | "model_id": "1e2c98e64b384b44bcb39ff3bbfb7de3",
90 | "version_major": 2,
91 | "version_minor": 0
92 | },
93 | "text/plain": [
94 | "config.json: 0%| | 0.00/459 [00:00, ?B/s]"
95 | ]
96 | },
97 | "metadata": {},
98 | "output_type": "display_data"
99 | },
100 | {
101 | "data": {
102 | "application/vnd.jupyter.widget-view+json": {
103 | "model_id": "20c7e2df037e42d39f8f0372f248458f",
104 | "version_major": 2,
105 | "version_minor": 0
106 | },
107 | "text/plain": [
108 | "vocab.txt: 0%| | 0.00/530k [00:00, ?B/s]"
109 | ]
110 | },
111 | "metadata": {},
112 | "output_type": "display_data"
113 | },
114 | {
115 | "data": {
116 | "application/vnd.jupyter.widget-view+json": {
117 | "model_id": "787d9bcd8bab418388fbd93069adba31",
118 | "version_major": 2,
119 | "version_minor": 0
120 | },
121 | "text/plain": [
122 | "special_tokens_map.json: 0%| | 0.00/112 [00:00, ?B/s]"
123 | ]
124 | },
125 | "metadata": {},
126 | "output_type": "display_data"
127 | }
128 | ],
129 | "source": [
130 | "# Prepare the data\n",
131 | "sentences = new_data['text'].values\n",
132 | "labels = new_data['archaia_or_not'].values\n",
133 | "\n",
134 | "# Encode the labels\n",
135 | "label_encoder = LabelEncoder()\n",
136 | "encoded_labels = label_encoder.fit_transform(labels)\n",
137 | "\n",
138 | "# Split the data\n",
139 | "train_sentences, temp_sentences, train_labels, temp_labels = train_test_split(sentences, encoded_labels, \n",
140 | " test_size=0.3, random_state=42, stratify=encoded_labels)\n",
141 | "val_sentences, dev_sentences, val_labels, dev_labels = train_test_split(temp_sentences, temp_labels,\n",
142 | " test_size=0.5, random_state=42, stratify=temp_labels)\n",
143 | "\n",
144 | "tokenizer = AutoTokenizer.from_pretrained(\"nlpaueb/bert-base-greek-uncased-v1\")"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 5,
150 | "id": "0175c90b-47b5-4a75-9da5-6d2c17c997f2",
151 | "metadata": {
152 | "tags": []
153 | },
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "\n",
160 | "Training set label distribution:\n",
161 | "Label 0: 214\n",
162 | "Label 1: 1144\n",
163 | "\n",
164 | "Validation set label distribution:\n",
165 | "Label 0: 46\n",
166 | "Label 1: 245\n",
167 | "\n",
168 | "Dev set label distribution:\n",
169 | "Label 0: 46\n",
170 | "Label 1: 245\n"
171 | ]
172 | }
173 | ],
174 | "source": [
175 | "# Print label distribution\n",
176 | "def print_label_distribution(labels, name):\n",
177 | " unique, counts = np.unique(labels, return_counts=True)\n",
178 | " print(f\"\\n{name} set label distribution:\")\n",
179 | " for label, count in zip(unique, counts):\n",
180 | " print(f\"Label {label}: {count}\")\n",
181 | "\n",
182 | "print_label_distribution(train_labels, \"Training\")\n",
183 | "print_label_distribution(val_labels, \"Validation\")\n",
184 | "print_label_distribution(dev_labels, \"Dev\")\n",
185 | "\n",
186 | "# Tokenize and prepare the dataset\n",
187 | "max_length = 512\n",
188 | "train_encodings = tokenizer(train_sentences.tolist(), truncation=True, padding=True, max_length=max_length)\n",
189 | "val_encodings = tokenizer(val_sentences.tolist(), truncation=True, padding=True, max_length=max_length)\n",
190 | "dev_encodings = tokenizer(dev_sentences.tolist(), truncation=True, padding=True, max_length=max_length)"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 6,
196 | "id": "b55164e9-125f-4edf-9fe8-28455d7d07b2",
197 | "metadata": {
198 | "tags": []
199 | },
200 | "outputs": [],
201 | "source": [
202 | "# Define the model\n",
203 | "class BertForSequenceClassification(nn.Module):\n",
204 | " def __init__(self, model_name_or_path, num_labels=2):\n",
205 | " super(BertForSequenceClassification, self).__init__()\n",
206 | " self.num_labels = num_labels\n",
207 | " self.config = AutoConfig.from_pretrained(model_name_or_path)\n",
208 | " self.bert = AutoModel.from_pretrained(model_name_or_path, config=self.config)\n",
209 | " self.classifier = nn.Sequential(\n",
210 | " nn.Linear(self.bert.config.hidden_size, 256),\n",
211 | " nn.Dropout(0.1),\n",
212 | " nn.Linear(256, num_labels),\n",
213 | " )\n",
214 | " self.init_weights()\n",
215 | "\n",
216 | " def init_weights(self):\n",
217 | " for module in self.classifier:\n",
218 | " if isinstance(module, nn.Linear):\n",
219 | " nn.init.xavier_uniform_(module.weight)\n",
220 | " if module.bias is not None:\n",
221 | " nn.init.zeros_(module.bias)\n",
222 | "\n",
223 | " def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):\n",
224 | " outputs = self.bert(input_ids, attention_mask=attention_mask)\n",
225 | " pooler_output = outputs.pooler_output\n",
226 | " logits = self.classifier(pooler_output)\n",
227 | "\n",
228 | " loss = None\n",
229 | " if labels is not None:\n",
230 | " loss_fct = nn.CrossEntropyLoss()\n",
231 | " loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))\n",
232 | "\n",
233 | " return (loss, logits) if loss is not None else logits\n",
234 | " \n",
235 | " def save_pretrained(self, save_directory):\n",
236 | " os.makedirs(save_directory, exist_ok=True)\n",
237 | " self.config.save_pretrained(save_directory)\n",
238 | " torch.save(self.state_dict(), os.path.join(save_directory, \"pytorch_model.bin\"))\n",
239 | "\n",
240 | " @classmethod\n",
241 | " def from_pretrained(cls, save_directory, model_name_or_path, num_labels=2):\n",
242 | " config = AutoConfig.from_pretrained(save_directory)\n",
243 | " model = cls(model_name_or_path, num_labels=num_labels)\n",
244 | " state_dict = torch.load(os.path.join(save_directory, \"pytorch_model.bin\"), map_location=torch.device('cpu'))\n",
245 | " model.load_state_dict(state_dict)\n",
246 | " return model"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 7,
252 | "id": "f7199568-57ba-4140-9434-69767c9a4954",
253 | "metadata": {
254 | "tags": []
255 | },
256 | "outputs": [],
257 | "source": [
258 | "# Create dataset class\n",
259 | "class GreekSentencesDataset(torch.utils.data.Dataset):\n",
260 | " def __init__(self, encodings, labels):\n",
261 | " self.encodings = encodings\n",
262 | " self.labels = labels\n",
263 | "\n",
264 | " def __getitem__(self, idx):\n",
265 | " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
266 | " item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)\n",
267 | " return item\n",
268 | "\n",
269 | " def __len__(self):\n",
270 | " return len(self.labels)"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 8,
276 | "id": "c1c0eeac-7156-4137-abeb-096396309973",
277 | "metadata": {
278 | "tags": []
279 | },
280 | "outputs": [
281 | {
282 | "data": {
283 | "application/vnd.jupyter.widget-view+json": {
284 | "model_id": "d599dbc853b148a38412781fc0dfc38b",
285 | "version_major": 2,
286 | "version_minor": 0
287 | },
288 | "text/plain": [
289 | "pytorch_model.bin: 0%| | 0.00/454M [00:00, ?B/s]"
290 | ]
291 | },
292 | "metadata": {},
293 | "output_type": "display_data"
294 | },
295 | {
296 | "name": "stderr",
297 | "output_type": "stream",
298 | "text": [
299 | "Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).\n"
300 | ]
301 | }
302 | ],
303 | "source": [
304 | "train_dataset = GreekSentencesDataset(train_encodings, train_labels)\n",
305 | "val_dataset = GreekSentencesDataset(val_encodings, val_labels)\n",
306 | "dev_dataset = GreekSentencesDataset(dev_encodings, dev_labels)\n",
307 | "\n",
308 | "# Initialize the model\n",
309 | "model_name_or_path = \"nlpaueb/bert-base-greek-uncased-v1\"\n",
310 | "num_labels = len(label_encoder.classes_)\n",
311 | "model = BertForSequenceClassification(model_name_or_path, num_labels)\n",
312 | "\n",
313 | "# Define training arguments and trainer\n",
314 | "training_args = TrainingArguments(\n",
315 | " output_dir='./results',\n",
316 | " num_train_epochs=1,\n",
317 | " per_device_train_batch_size=16,\n",
318 | " per_device_eval_batch_size=16,\n",
319 | " warmup_steps=500,\n",
320 | " weight_decay=0.02,\n",
321 | " logging_dir='./logs',\n",
322 | " logging_steps=10,\n",
323 | " evaluation_strategy=\"epoch\",\n",
324 | " save_strategy=\"epoch\",\n",
325 | " load_best_model_at_end=True,\n",
326 | ")"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 9,
332 | "id": "8f8326f0-258d-4875-85d7-f82cec050c5e",
333 | "metadata": {
334 | "tags": []
335 | },
336 | "outputs": [],
337 | "source": [
338 | "def compute_metrics(pred):\n",
339 | " labels = pred.label_ids\n",
340 | " preds = pred.predictions.argmax(-1)\n",
341 | " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')\n",
342 | " acc = accuracy_score(labels, preds)\n",
343 | " return {\n",
344 | " 'accuracy': acc,\n",
345 | " 'f1': f1,\n",
346 | " 'precision': precision,\n",
347 | " 'recall': recall\n",
348 | " }\n",
349 | "\n",
350 | "trainer = Trainer(\n",
351 | " model=model,\n",
352 | " args=training_args,\n",
353 | " train_dataset=train_dataset,\n",
354 | " eval_dataset=val_dataset,\n",
355 | " compute_metrics=compute_metrics\n",
356 | ")"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 10,
362 | "id": "eec99140-6c03-492b-b573-4a2353f9e5ad",
363 | "metadata": {
364 | "tags": []
365 | },
366 | "outputs": [
367 | {
368 | "data": {
369 | "text/html": [
370 | "\n",
371 | "
\n",
372 | " \n",
373 | "
\n",
374 | " [85/85 02:12, Epoch 1/1]\n",
375 | "
\n",
376 | " \n",
377 | " \n",
378 | " \n",
379 | " Epoch | \n",
380 | " Training Loss | \n",
381 | " Validation Loss | \n",
382 | " Accuracy | \n",
383 | " F1 | \n",
384 | " Precision | \n",
385 | " Recall | \n",
386 | "
\n",
387 | " \n",
388 | " \n",
389 | " \n",
390 | " 1 | \n",
391 | " 0.000900 | \n",
392 | " 0.076329 | \n",
393 | " 0.989691 | \n",
394 | " 0.993915 | \n",
395 | " 0.987903 | \n",
396 | " 1.000000 | \n",
397 | "
\n",
398 | " \n",
399 | "
"
400 | ],
401 | "text/plain": [
402 | ""
403 | ]
404 | },
405 | "metadata": {},
406 | "output_type": "display_data"
407 | },
408 | {
409 | "data": {
410 | "text/plain": [
411 | "TrainOutput(global_step=85, training_loss=0.39309312596040613, metrics={'train_runtime': 134.9228, 'train_samples_per_second': 10.065, 'train_steps_per_second': 0.63, 'total_flos': 0.0, 'train_loss': 0.39309312596040613, 'epoch': 1.0})"
412 | ]
413 | },
414 | "execution_count": 10,
415 | "metadata": {},
416 | "output_type": "execute_result"
417 | }
418 | ],
419 | "source": [
420 | "# Train the model\n",
421 | "trainer.train()"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 11,
427 | "id": "5a874aba-57c9-4f13-9aa1-cc56776a40f1",
428 | "metadata": {
429 | "tags": []
430 | },
431 | "outputs": [
432 | {
433 | "data": {
434 | "text/html": [],
435 | "text/plain": [
436 | ""
437 | ]
438 | },
439 | "metadata": {},
440 | "output_type": "display_data"
441 | }
442 | ],
443 | "source": [
444 | "# Evaluate on dev set\n",
445 | "dev_pred = trainer.predict(dev_dataset)\n",
446 | "dev_preds = dev_pred.predictions.argmax(-1)\n",
447 | "dev_labels = dev_dataset.labels"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": 12,
453 | "id": "a2ccba6e-f2e4-4495-bfed-c66f1f73af1a",
454 | "metadata": {
455 | "tags": []
456 | },
457 | "outputs": [
458 | {
459 | "name": "stdout",
460 | "output_type": "stream",
461 | "text": [
462 | "\n",
463 | "Dev Set Evaluation:\n",
464 | "Accuracy: 0.9897\n",
465 | "Precision: 0.9879\n",
466 | "Recall: 1.0000\n",
467 | "F1 Score: 0.9939\n"
468 | ]
469 | },
470 | {
471 | "data": {
472 | "text/plain": [
473 | "('./binary_classifier_saved_model/tokenizer_config.json',\n",
474 | " './binary_classifier_saved_model/special_tokens_map.json',\n",
475 | " './binary_classifier_saved_model/vocab.txt',\n",
476 | " './binary_classifier_saved_model/added_tokens.json',\n",
477 | " './binary_classifier_saved_model/tokenizer.json')"
478 | ]
479 | },
480 | "execution_count": 12,
481 | "metadata": {},
482 | "output_type": "execute_result"
483 | }
484 | ],
485 | "source": [
486 | "dev_accuracy = accuracy_score(dev_labels, dev_preds)\n",
487 | "dev_precision, dev_recall, dev_f1, _ = precision_recall_fscore_support(dev_labels, dev_preds, average='binary')\n",
488 | "\n",
489 | "print(\"\\nDev Set Evaluation:\")\n",
490 | "print(f\"Accuracy: {dev_accuracy:.4f}\")\n",
491 | "print(f\"Precision: {dev_precision:.4f}\")\n",
492 | "print(f\"Recall: {dev_recall:.4f}\")\n",
493 | "print(f\"F1 Score: {dev_f1:.4f}\")\n",
494 | "\n",
495 | "# Create confusion matrix\n",
496 | "cm = confusion_matrix(dev_labels, dev_preds)\n",
497 | "plt.figure(figsize=(10, 8))\n",
498 | "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')\n",
499 | "plt.title('Confusion Matrix')\n",
500 | "plt.xlabel('Predicted')\n",
501 | "plt.ylabel('True')\n",
502 | "plt.savefig('confusion_matrix.png')\n",
503 | "plt.close()\n",
504 | "\n",
505 | "# Save the model\n",
506 | "model.save_pretrained(\"./binary_classifier_saved_model\")\n",
507 | "tokenizer.save_pretrained(\"./binary_classifier_saved_model\")"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 13,
513 | "id": "173d4aaa-3db0-44e3-a768-4f223616e5ae",
514 | "metadata": {
515 | "tags": []
516 | },
517 | "outputs": [
518 | {
519 | "name": "stdout",
520 | "output_type": "stream",
521 | "text": [
522 | "Using device: cuda\n"
523 | ]
524 | }
525 | ],
526 | "source": [
527 | "import pandas as pd\n",
528 | "import torch\n",
529 | "from transformers import AutoTokenizer\n",
530 | "from torch.utils.data import TensorDataset, DataLoader\n",
531 | "\n",
532 | "# Load the dataset\n",
533 | "df = pd.read_csv(\"../twok_masked.csv\")\n",
534 | "\n",
535 | "# Load the saved model and tokenizer\n",
536 | "loaded_model = BertForSequenceClassification.from_pretrained(\"./binary_classifier_saved_model\", model_name_or_path=\"nlpaueb/bert-base-greek-uncased-v1\")\n",
537 | "loaded_tokenizer = AutoTokenizer.from_pretrained(\"./binary_classifier_saved_model\")\n",
538 | "\n",
539 | "# Check if CUDA is available\n",
540 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
541 | "print(f\"Using device: {device}\")\n",
542 | "\n",
543 | "# Move the model to the appropriate device\n",
544 | "loaded_model.to(device)\n",
545 | "\n",
546 | "# Tokenize all texts\n",
547 | "encodings = loaded_tokenizer(df['text'].tolist(), truncation=True, padding=True, max_length=512)\n",
548 | "dataset = TensorDataset(torch.tensor(encodings['input_ids']), \n",
549 | " torch.tensor(encodings['attention_mask']))\n",
550 | "dataloader = DataLoader(dataset, batch_size=32) # Adjust batch size as needed\n",
551 | "\n",
552 | "# Make predictions\n",
553 | "loaded_model.eval()\n",
554 | "predictions = []"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 14,
560 | "id": "9949ca2b-2788-440e-8bf0-dd817d5a6b4e",
561 | "metadata": {
562 | "tags": []
563 | },
564 | "outputs": [],
565 | "source": [
566 | "with torch.no_grad():\n",
567 | " for batch in dataloader:\n",
568 | " input_ids, attention_mask = [b.to(device) for b in batch]\n",
569 | " outputs = loaded_model(input_ids, attention_mask=attention_mask)\n",
570 | " logits = outputs[0] if isinstance(outputs, tuple) else outputs\n",
571 | " preds = torch.argmax(logits, dim=1)\n",
572 | " predictions.extend(preds.cpu().numpy())"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 16,
578 | "id": "bcc439a5-1044-424b-a219-3e7506df522b",
579 | "metadata": {
580 | "tags": []
581 | },
582 | "outputs": [
583 | {
584 | "name": "stdout",
585 | "output_type": "stream",
586 | "text": [
587 | "Processed 2000 rows.\n",
588 | "Results saved to 'twok_masked_with_predictions.csv'\n",
589 | "\n",
590 | "Distribution of predictions:\n",
591 | "archaia\n",
592 | " 1 0.7825\n",
593 | "-9999 0.2175\n",
594 | "Name: proportion, dtype: float64\n",
595 | "\n",
596 | "Distribution of predictions for masked items:\n",
597 | "archaia\n",
598 | "1 1.0\n",
599 | "Name: proportion, dtype: float64\n"
600 | ]
601 | }
602 | ],
603 | "source": [
604 | "# Add predictions to the dataframe\n",
605 | "df['archaia'] = predictions\n",
606 | "\n",
607 | "# Check 'mask' column and set 'ΚΝΕ' to -9999 if mask is 0\n",
608 | "df.loc[df['mask'] == 0, 'archaia'] = -9999\n",
609 | "\n",
610 | "# Remove columns '1' through '5'\n",
611 | "columns_to_remove = ['1', '2', '3', '4', '5']\n",
612 | "df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])\n",
613 | "\n",
614 | "# Save the results\n",
615 | "df.to_csv(\"twok_masked_with_predictions.csv\", index=False)\n",
616 | "\n",
617 | "print(f\"Processed {len(df)} rows.\")\n",
618 | "print(\"Results saved to 'twok_masked_with_predictions.csv'\")\n",
619 | "\n",
620 | "# Print distribution of predictions\n",
621 | "print(\"\\nDistribution of predictions:\")\n",
622 | "print(df['archaia'].value_counts(normalize=True))\n",
623 | "\n",
624 | "# Print distribution of predictions for masked items only\n",
625 | "masked_df = df[df['mask'] == 1]\n",
626 | "print(\"\\nDistribution of predictions for masked items:\")\n",
627 | "print(masked_df['archaia'].value_counts(normalize=True))"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": null,
633 | "id": "db5e881d-6d8c-4e51-9548-eea2fcdac1d2",
634 | "metadata": {},
635 | "outputs": [],
636 | "source": []
637 | }
638 | ],
639 | "metadata": {
640 | "kernelspec": {
641 | "display_name": "Python 3 (ipykernel)",
642 | "language": "python",
643 | "name": "python3"
644 | },
645 | "language_info": {
646 | "codemirror_mode": {
647 | "name": "ipython",
648 | "version": 3
649 | },
650 | "file_extension": ".py",
651 | "mimetype": "text/x-python",
652 | "name": "python",
653 | "nbconvert_exporter": "python",
654 | "pygments_lexer": "ipython3",
655 | "version": "3.11.7"
656 | }
657 | },
658 | "nbformat": 4,
659 | "nbformat_minor": 5
660 | }
661 |
--------------------------------------------------------------------------------
/Greek_variety_classification/models/Ancient_Gr_classifier_model.zip:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:618bedd41fca77771aa37f966e168a02fb2016978debf7fa0ec041554372e430
3 | size 420182149
4 |
--------------------------------------------------------------------------------
/Greek_variety_classification/models/Gr_Var_Classifier_model.zip:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9a6a615614897bf7160529d80a321c4635e89c4b099b51efce6296fe3f7c5d0b
3 | size 420183135
4 |
--------------------------------------------------------------------------------
/Greek_variety_classification/preprocessing/clean_data_with_mask.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import re
3 | import os
4 | from transformers import AutoTokenizer
5 |
6 | # Set the working directory and filename
7 | working_directory = "/home/fivos/Downloads"
8 | file_name = "dataset_Sep_3.csv"
9 |
10 | os.chdir(working_directory)
11 |
12 | # Load the data
13 | data = pd.read_csv(file_name, sep=",", engine="python")
14 |
15 | # Load the tokenizer
16 | tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
17 |
18 | print(data.columns)
19 |
20 | # Ensure 'text' column contains strings
21 | if 'text' in data.columns:
22 | data["text"] = data["text"].astype(str)
23 | else:
24 | print("Column 'text' not found in the dataset.")
25 |
26 | # Function to check if text is mostly Latin characters
27 | def is_mostly_latin(text, threshold=0.5):
28 | latin_chars = re.findall(r"[a-zA-Z]", text)
29 | return (len(latin_chars) / len(text)) > threshold if len(text) > 0 else False
30 |
31 | # Function to check if text is more than 50% numbers
32 | def is_mostly_numbers(text, threshold=0.5):
33 | num_chars = re.findall(r"[0-9]", text)
34 | return (len(num_chars) / len(text)) > threshold if len(text) > 0 else False
35 |
36 | # Function to check if text has fewer than 4 words
37 | def too_short(text):
38 | return len(text.split()) < 4
39 |
40 | # Function to check if text has more than 512 tokens
41 | def has_more_than_512_tokens(text):
42 | # Fragments should be smaller than 512 tokens for GreekBERT
43 | return len(tokenizer.encode(text)) > 512
44 |
45 | # Function to clean text
46 | def clean_text(text):
47 | # Remove formatting characters
48 | text = re.sub(r"[\n\t]", " ", text)
49 | # Remove leading, trailing, and multiple spaces
50 | text = ' '.join(text.split())
51 | # Remove numerical ordering elements with ()
52 | text = re.sub(r"\(?\d+\)|\d+\.", "", text)
53 | # Remove numerical ordering elements with {}
54 | text = re.sub(r"\{\d+\}", "", text)
55 | # Remove ordering elements with Greek numerals
56 | text = re.sub(
57 | r"(?", "()", "{}", and other similar characters
89 | data["text"] = data["text"].apply(lambda x: re.sub(r'[<>\[\]\(\)\{\}]', '', x))
90 |
91 | # Update mask for empty text cells
92 | data.loc[~data["text"].str.strip().astype(bool), 'mask'] = 0
93 |
94 | # Update mask for mostly Latin text
95 | data.loc[data["text"].apply(is_mostly_latin), 'mask'] = 0
96 |
97 | # Update mask for mostly numbers
98 | data.loc[data["text"].apply(is_mostly_numbers), 'mask'] = 0
99 |
100 | # Update mask for too_short text
101 | data.loc[data["text"].apply(too_short), 'mask'] = 0
102 |
103 | # Update mask for text with more than 512 tokens
104 | data.loc[data["text"].apply(has_more_than_512_tokens), 'mask'] = 0
105 |
106 | # Save the result to a new CSV file
107 | output_file_path = os.path.join(os.getcwd(), os.path.splitext(file_name)[0] + "_masked.csv")
108 | data.to_csv(output_file_path, index=False, quoting=1) # quoting=1 ensures all fields are quoted
109 |
110 | print("Cleaned data with mask saved")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # GlossAPI
2 |
3 | [](https://pypi.org/project/glossapi/)
4 |
5 | A library for processing texts in Greek and other languages, developed by [Open Technologies Alliance (GFOSS)](https://gfoss.eu/).
6 |
7 | ## Features
8 |
9 | - **Document Processing**: Extract text content from academic PDFs, DOCX, HTML, and other formats with structure preservation
10 | - **Document Downloading**: Download documents from URLs with automatic handling of various formats
11 | - **Quality Control**: Filter and cluster documents based on extraction quality
12 | - **Section Extraction**: Identify and extract academic sections from documents
13 | - **Section Classification**: Classify sections using machine learning models
14 | - **Greek Language Support**: Specialized processing for Greek academic texts
15 | - **Metadata Handling**: Process academic texts with accompanying metadata
16 | - **Customizable Annotation**: Map section titles to standardized categories
17 | - **Flexible Pipeline**: Start the processing from any stage in the pipeline
18 |
19 | ## Installation
20 |
21 | ```bash
22 | pip install glossapi
23 | ```
24 |
25 | ## Usage
26 |
27 | The recommended way to use GlossAPI is through the `Corpus` class, which provides a complete pipeline for processing academic documents. You can use the same directory for both input and output:
28 |
29 | ```python
30 | from glossapi import Corpus
31 | import logging
32 |
33 | # Configure logging (optional)
34 | logging.basicConfig(level=logging.INFO)
35 |
36 | # Set the directory path (use the same for input and output)
37 | folder = "/path/to/corpus"  # Use an absolute path
38 |
39 | # Initialize Corpus with input and output directories
40 | corpus = Corpus(
41 | input_dir=folder,
42 | output_dir=folder
43 | # metadata_path="/path/to/metadata.parquet", # Optional
44 | # annotation_mapping={
45 | # 'Κεφάλαιο': 'chapter',
46 | # # Add more mappings as needed
47 | # }
48 | )
49 |
50 | # The pipeline can start from any of these steps:
51 |
52 | # Step 1: Download documents (if URLs are provided)
53 | corpus.download(url_column='a_column_name')  # Specify the column containing URLs; the default column name is 'url'
54 |
55 | # Step 2: Extract documents
56 | corpus.extract()
57 |
58 | # Step 3: Extract sections from filtered documents
59 | corpus.section()
60 |
61 | # Step 4: Classify and annotate sections
62 | corpus.annotate()  # or corpus.annotate(annotation_type="chapter") for texts without a TOC or bibliography
63 | ```
64 |
65 | ## Folder Structure
66 |
67 | After running the pipeline, the following folder structure will be created:
68 |
69 | ```
70 | corpus/ # Your specified folder
71 | ├── download_results/ # Stores the metadata file with annotations from previous processing steps
72 | ├── downloads/ # Downloaded documents (if download() is used)
73 | ├── markdown/ # Extracted text files in markdown format
74 | ├── sections/ # Contains the processed sections in parquet format
75 | │ └── sections_for_annotation.parquet
76 | ├── classified_sections.parquet # Intermediate output of section classification
77 | └── fully_annotated_sections.parquet # Final output with section predictions
78 | ```
79 |
80 | The `fully_annotated_sections.parquet` file is the final output of the pipeline. Its `predicted_sections` column gives the type of each section: 'π' (table of contents), 'β' (bibliography), 'ε.σ.' (introductory note), 'κ' (main text), or 'a' (appendix). For files without a table of contents or bibliography, the annotation will be "άλλο" (other).
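
For a quick look at the results, something like the following should work (a minimal sketch, assuming pandas with a parquet engine such as pyarrow is installed, and using the column name described above):

```python
import pandas as pd

# Load the final output (adjust the path to your corpus folder)
sections = pd.read_parquet("/path/to/corpus/fully_annotated_sections.parquet")

# Distribution of predicted section types ('π', 'β', 'ε.σ.', 'κ', 'a', 'άλλο')
print(sections["predicted_sections"].value_counts())
```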
81 |
82 | ## Note on Starting Points
83 |
84 | **Option 1: Start with Document Download**
85 | Create a corpus folder and add a parquet file with URLs for downloading:
86 | ```
87 | corpus/
88 | └── metadata.parquet (with a column containing document URLs)
89 | ```
90 | Then use `corpus.download(url_column='column_name')` with the URL column name from your parquet file.
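
If you need to create such a parquet file first, here is a minimal sketch using pandas (the URLs and the corpus path below are placeholders):

```python
import pandas as pd

# Build a metadata parquet with a single column of document URLs
metadata = pd.DataFrame({
    "url": [
        "https://example.org/doc1.pdf",
        "https://example.org/doc2.pdf",
    ]
})
metadata.to_parquet("/path/to/corpus/metadata.parquet", index=False)
```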
91 |
92 | **Option 2: Start with Document Extraction**
93 | Alternatively, place documents directly in the corpus folder and skip download:
94 | ```
95 | corpus/
96 | └── document1.pdf, document2.docx, etc.
97 | ```
98 | GlossAPI will automatically create a metadata folder under `downloads/` when starting from the extract step.
99 |
100 | ## License
101 |
102 | This project is licensed under the [European Union Public Licence 1.2 (EUPL 1.2)](https://interoperable-europe.ec.europa.eu/collection/eupl/eupl-text-eupl-12).
103 |
--------------------------------------------------------------------------------
/dataset_progress.md:
--------------------------------------------------------------------------------
1 | The goal of [ΕΕΛΛΑΚ](https://eellak.gr/) is to develop an open-source Greek artificial intelligence (AI) model: its code will be released under the open-source [EUPL](https://eupl.eu/) licence, and its weights and all data will be available under the [Creative Commons BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) licence. **The first goal of glossAPI is to collect, process, and maintain representative sets of Greek texts** so that an AI model can handle the Greek language correctly.
2 |
3 | :rocket: **Creation of cleaned textual data with useful metadata**
4 |
5 | ## Datasets
6 |
7 | ### 95Κ Δείγμα Ελληνικής (95K Greek Sample)
8 | - [✓] Scraped
9 | - [✓] Downloaded
10 | - [✓] Cleaned
11 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/95k_deigma_ellinikis)
12 |
13 | A diverse sample of 95,000 Greek texts, providing a broad representation of modern Greek language usage. Useful for general NLP tasks and language modeling.
14 |
15 | ### Σχολικά Βιβλία (School Books)
16 | - [✓] Scraped
17 | - [✓] Downloaded
18 | - [✓] Cleaned
19 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/Sxolika_vivlia)
20 |
21 | Collection of Greek school textbooks and educational materials. Great resource for educational NLP applications and studying formal Modern Greek.
22 |
23 | ### Δημώδης Λογοτεχνία (Folk Literature)
24 | - [✓] Scraped
25 | - [✓] Downloaded
26 | - [✓] Cleaned
27 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/dimodis_logotexnia)
28 |
29 | Traditional Greek folk literature, including stories, songs, and poems. Valuable for cultural preservation and studying regional Greek variations.
30 |
31 | ### Ελληνικά Κείμενα Project Gutenberg (Project Gutenberg Greek Texts)
32 | - [✓] Scraped
33 | - [✓] Downloaded
34 | - [✓] Cleaned
35 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/Ellinika_Keimena_Project_Gutenberg)
36 |
37 | Public domain Greek texts from Project Gutenberg, spanning various periods and genres. Excellent for literary analysis and historical language studies.
38 |
39 | ### 1000 Πρώτα Χρόνια Ελληνικής (First 1000 Years of Greek)
40 | - [✓] Scraped
41 | - [✓] Downloaded
42 | - [✓] Cleaned
43 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/1000_prwta_xronia_ellhnikhs)
44 |
45 | Texts covering the first millennium of written Greek, crucial for studying the evolution of the Greek language and historical linguistics.
46 |
47 | ### Κλασική Αρχαία Ελληνική Γραμματεία (Classical Ancient Greek Literature)
48 | - [✓] Scraped
49 | - [✓] Downloaded
50 | - [✓] Cleaned
51 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/klasikh_arx_ell_grammateia)
52 |
53 | Core works of Classical Greek literature, including philosophical, historical, and dramatic texts. Essential for classical studies and ancient Greek NLP.
54 |
55 | ### Ελληνικά Κείμενα Wikisource (Wikisource Greek Texts)
56 | - [✓] Scraped
57 | - [✓] Downloaded
58 | - [✓] Cleaned
59 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/Wikisource_Greek_texts)
60 |
61 | ### Πέργαμος (Pergamos)
62 | - [✓] Scraped
63 | - [✓] Downloaded
64 | - [✓] Preprocessed
65 | - [✓] Each article's sections categorized by type (introductory remarks, index etc.)
66 | - [✓] Uploaded with metadata
67 |
68 | Συλλογή κειμένων από την πλατφόρμα Πέργαμος. Collection of texts from the Pergamos university theses archive.
69 |
70 | ### :construction: Υπό επεξεργασία (Work in Progress)
71 |
72 | ### Κάλλιπος (Kallipos)
73 | - [✓] Scraped
74 | - [✓] Downloaded
75 | - [ ] Cleaned
76 | - [ ] Uploaded with metadata
77 |
78 | Ακαδημαϊκά συγγράμματα από την πλατφόρμα Κάλλιπος. Open source academic textbooks from Kallipos.
79 |
80 | ### Έγγραφα ΕΕ (EU Documents)
81 | - [ ] Downloaded
82 | - [ ] Cleaned
83 | - [ ] Uploaded with metadata
84 |
85 | Επίσημα έγγραφα της Ευρωπαϊκής Ένωσης. Official documents of the European Union.
86 |
87 | [γlo'sapi]
88 |
89 | ### glossAPI, το
90 |
91 | A project of ΕΕΛΛΑΚ in the field of digital humanities that draws on freely available sources to assemble an extensive body of high-quality texts provided under a Creative Commons licence. glossAPI covers a wide range of subject areas, from science and literature to legal texts, with data that undergoes careful processing and indexing.
92 |
93 | The goal of glossAPI is to make it easier to process textual data and to train modern language models. All the tools it develops are freely available under the EUPL licence through its GitHub repository.
94 |
95 | glossAPI contributes to the growth of open Greek textual data, encouraging researchers and students to use the tools that have been developed and to extend the code and the data in directions that interest them.
96 |
97 | [ 1: Greeklish < γλωσσάρι (glossary) 2: English gloss < Middle English gloze < Medieval Latin glōsa < Classical Latin glōssa < Ancient Greek γλῶσσα: "tongue, word" + English API: Application Programming Interface ]
98 |
99 | Επικοινωνία/ contact at: glossapi.team@eellak.gr
100 |
--------------------------------------------------------------------------------
/pipeline/LICENSE.md:
--------------------------------------------------------------------------------
1 | ../LICENSE.md
--------------------------------------------------------------------------------
/pipeline/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE.md
3 | recursive-include src/glossapi/models *.joblib
4 |
--------------------------------------------------------------------------------
/pipeline/README.md:
--------------------------------------------------------------------------------
1 | # GlossAPI
2 |
3 | [](https://github.com/eellak/glossAPI/releases)
4 | [](https://pypi.org/project/glossapi/)
5 |
6 | A library for processing academic texts in Greek and other languages, developed by [ΕΕΛΛΑΚ](https://eellak.gr/).
7 |
8 | ## Features
9 |
10 | - **Document Processing**: Extract text content from academic PDFs, DOCX, XML, HTML, and other formats with structure preservation
11 | - **Robust Batch Processing**: Process documents in batches with error isolation and automatic resumption
12 | - **Quality Control**: Filter and cluster documents based on extraction quality
13 | - **Section Extraction**: Identify and extract academic sections from documents
14 | - **Section Classification**: Classify sections using machine learning models
15 | - **Greek Language Support**: Specialized processing for Greek academic texts
16 | - **Metadata Handling**: Process academic texts with accompanying metadata
17 | - **Customizable Annotation**: Map section titles to standardized categories
18 |
19 | ## Installation
20 |
21 | ```bash
22 | pip install glossapi
23 | ```
24 |
25 | ## Usage
26 |
27 | The recommended way to use GlossAPI is through the `Corpus` class, which provides a complete pipeline for processing academic documents:
28 |
29 | ```python
30 | from glossapi import Corpus
31 | import logging
32 |
33 | # Configure logging (optional)
34 | logging.basicConfig(level=logging.INFO)
35 |
36 | # Initialize Corpus with input and output directories
37 | corpus = Corpus(
38 | input_dir="/path/to/documents",
39 | output_dir="/path/to/output"
40 | # metadata_path="/path/to/metadata.parquet", # Optional
41 | # annotation_mapping={
42 | # 'Κεφάλαιο': 'chapter',
43 | # # Add more mappings as needed
44 | # }
45 | )
46 |
47 | # Step 1: Extract documents (quality control)
48 | corpus.extract()
49 |
50 | # Step 2: Extract sections from filtered documents
51 | corpus.section()
52 |
53 | # Step 3: Classify and annotate sections
54 | corpus.annotate()
55 | ```
56 |
57 | ## License
58 |
59 | This project is licensed under the European Union Public Licence 1.2 (EUPL 1.2).
60 |
--------------------------------------------------------------------------------
/pipeline/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "glossapi"
7 | version = "0.0.13"
8 | description = "A library for processing academic texts in Greek and other languages"
9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | license = {text = "European Union Public Licence 1.2 (EUPL 1.2)"}
12 | authors = [
13 | {name = "GlossAPI Team", email = "foivos@example.com"}
14 | ]
15 | classifiers = [
16 | "Programming Language :: Python :: 3",
17 | "License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)",
18 | "Operating System :: OS Independent",
19 | "Development Status :: 3 - Alpha",
20 | ]
21 | dependencies = [
22 | "docling",
23 | "pandas",
24 | "numpy",
25 | "scikit-learn",
26 | "joblib",
27 | "dask",
28 | "pyarrow",
29 | "ftfy",
30 | "tenacity",
31 | "aiohttp",
32 | "aiofiles"
33 | ]
34 |
35 | [tool.setuptools]
36 | package-dir = {"" = "src"}
37 | include-package-data = true
38 |
39 | [tool.setuptools.packages.find]
40 | where = ["src"]
41 |
42 | [tool.setuptools.package-data]
43 | glossapi = ["models/*.joblib"]
44 |
45 | [project.urls]
46 | Repository = "https://github.com/eellak/glossAPI"
47 |
--------------------------------------------------------------------------------
/pipeline/refactoring_todo.md:
--------------------------------------------------------------------------------
1 | # GlossAPI Pipeline Refactoring TODO List
2 |
3 | ## Build and Install
4 |
5 | **IMPORTANT:** After implementing the changes, you need to build and install the package in the virtual environment for the changes to take effect:
6 |
7 | ```bash
8 | # Activate the virtual environment first
9 | source /mnt/data/venv/bin/activate
10 |
11 | # Go to the pipeline directory
12 | cd /mnt/data/glossAPI/pipeline
13 |
14 | # Install the package in development mode
15 | pip install -e .
16 |
17 | # Now you can run the simple_test.py script
18 | python /mnt/data/simple_test.py
19 | ```
20 |
21 | **ALWAYS KEEP IN MIND:** The pipeline must work with the existing interface in simple_test.py using the "corpus.command()" pattern.
22 |
23 | ## ✅ COMPLETED
24 |
25 | ### 1) Modified GlossDownloader
26 |
27 | - Updated `GlossDownloader` class to use a dedicated "downloads" folder:
28 | - Modified the `download_files()` method to use `self.output_dir / "downloads"` instead of `self.input_dir`
29 | - All downloaded files are now saved in this subdirectory
30 | - Updated the `Corpus.download()` method to create and use this downloads folder
31 | - Added validation to check if downloaded files are of the supported types (pdf, docx, xml, html, pptx, csv, md)
32 |
33 | ### 2) Updated GlossExtract
34 |
35 | - Modified the `extract()` method in `Corpus` class to:
36 | - Look for files in the "downloads" directory first
37 | - If "downloads" directory doesn't exist, check for supported file types in the input folder and move them to a new "downloads" folder
38 | - Continue processing from the "downloads" folder
39 | - Updated the file location handling across the pipeline to reflect this change
40 |
41 | ### 3) Created a Standardized Parquet Class
42 |
43 | - Created a new file called `parquet_schema.py` with a standardized schema class:
44 | - Defined required metadata fields for processing
45 | - Implemented standard schemas for different pipeline stages
46 | - Defined standard columns (id, row_id, filename, title, section, predicted_section)
47 | - Added methods for reading/writing with standard schema validation
48 |
49 | ### 4) Improved Bad File Filtering in Sectioning
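As a rough illustration of the idea (not the actual contents of `parquet_schema.py`; the field types and helper name below are assumptions), a standardized schema with a validating writer could look like this:

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical sketch: the real parquet_schema.py may use different types and helpers
SECTIONS_SCHEMA = pa.schema([
    ("id", pa.string()),
    ("row_id", pa.int64()),
    ("filename", pa.string()),
    ("title", pa.string()),
    ("section", pa.string()),
    ("predicted_section", pa.string()),
])

def write_sections(table: pa.Table, path: str) -> None:
    """Check required columns against the standard schema before writing."""
    missing = set(SECTIONS_SCHEMA.names) - set(table.schema.names)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")
    pq.write_table(table, path)
```
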
50 |
51 | - Made `filenames_to_process` a required parameter in section.py
52 | - Enhanced filtering to ensure only good files (based on extraction quality in parquet) are processed
53 | - Added detailed logging for processed and skipped files
54 | - Verified that section.py correctly handles all sectioning scenarios:
55 | - Text between two headers
56 | - Text before the first header
57 | - Text after the last header
58 | - Documents with no headers at all
59 | - Fixed indentation issues in corpus.py that were causing execution problems
60 |
61 | ## TODO
62 |
63 | ### 1) Finish Removing Redundant Code
64 |
65 | - Remove the remaining redundant code related to good/bad folders:
66 | - The `extract_quality` method in corpus.py still deals with good/bad folders
67 | - Remove all code related to copying files to good/bad directories
68 | - Remove references to `good_markdown_dir` since we're using extraction quality markers in parquet
69 | - Update all methods to use the simplified directory structure
70 |
71 | ### 2) Complete Two-Parquet Pipeline Implementation
72 |
73 | **Progress**: We've successfully implemented the first parquet (downloader parquet with extraction quality) but need to consolidate the section-related parquets.
74 |
75 | - Currently we still have 3 section parquet files that need to be consolidated:
76 | - `sections_for_annotation.parquet`
77 | - `classified_sections.parquet`
78 | - `fully_annotated_sections.parquet`
79 |
80 | - Implementation tasks:
81 | - Consolidate the 3 section-related parquet files into a single sections parquet
82 | - Update all methods to work with the consolidated parquet structure
83 | - Ensure all metadata columns are preserved during consolidation
84 | - Add metadata column "processing_stage" to track progress through pipeline
85 | - Update the verification method to check for required columns rather than specific filenames
86 | - Throw clear error messages when required columns are missing
87 |
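For illustration only (the file name, column values, and stage names are assumptions), the consolidated parquet could track progress roughly like this:

```python
import pandas as pd

# Hypothetical single sections parquet with a processing_stage column
sections = pd.read_parquet("sections.parquet")

# e.g. "sectioned" -> "classified" -> "annotated"
sections["processing_stage"] = "classified"
sections.to_parquet("sections.parquet", index=False)
```
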
88 | ### 3) Make Split_Bad an Explicit Pipeline Step
89 |
90 | - Extract the split_bad functionality from internal GlossExtract methods
91 | - Create a dedicated method in Corpus class
92 | - Make it explicitly update extraction quality in the downloader parquet
93 | - Update the processing_stage column to include extraction as a completed stage
94 |
95 | ### 4) Remove All Fallback Options
96 |
97 | - **Critical**: Remove any remaining code that silently falls back to processing all files:
98 | - Some of these fallbacks have been removed, but others may still exist
99 | - Remove any code that ignores extraction quality filter failures
100 | - Flag fallbacks as explicit errors rather than silent recovery
101 | - Ensure section() and other methods require good quality files and don't have hidden fallbacks
102 |
103 | ### 5) Add More Robust Error Messages
104 |
105 | - Add clear error messages when filtering operations fail instead of using defaults
106 | - For example: "No good quality files found. Pipeline stopped." instead of using all files
107 | - Document all pipeline decision points in code comments
108 | - Specify where the pipeline can branch and under what conditions
109 | - Explain the rationale for each decision point
110 |
111 | ### 6) Testing and Documentation
112 |
113 | - Test the refactored pipeline using the examples in /mnt/data/eu_test
114 | - Ensure the extraction_test_bad_file.py script correctly filters bad files
115 | - Add detailed logging for all pipeline stages
116 | - Document the new two-parquet approach in comments and docstrings
117 | - Update the parquet schema documentation to reflect the new approach
118 |
--------------------------------------------------------------------------------
/pipeline/scripts/concurrent_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Concurrent Downloader
5 |
6 | A versatile concurrent downloader that uses asyncio and aiohttp to efficiently download
7 | files from URLs. It accepts parquet files with URLs and metadata columns, downloads the files
8 | concurrently, and creates unique filenames with a structured naming pattern.
9 |
10 | Features:
11 | - Parquet file integration for metadata handling
12 | - Unique filename generation with the pattern AAA_000, AAA_001, etc.
13 | - Configurable concurrency
14 | - Retry mechanism for failed downloads
15 | - Download status tracking
16 | - Works with any file type
17 | """
18 |
19 | import aiohttp
20 | import asyncio
21 | import os
22 | import argparse
23 | import time
24 | import random
25 | import logging
26 | import re
27 | import string
28 | import aiofiles
29 | import pandas as pd
30 | from urllib.parse import urlparse
31 | from collections import deque
32 | from typing import Dict, List, Tuple, Set, Optional, Any, Iterator
33 | import mimetypes
34 | import string
35 | from tenacity import retry, stop_after_attempt, stop_after_delay, wait_exponential, retry_if_exception_type, retry_if_result, before_sleep_log
36 | import json
37 |
38 | # Configure logging
39 | logging.basicConfig(
40 | level=logging.INFO,
41 | format='%(asctime)s - %(levelname)s - %(message)s',
42 | handlers=[
43 | logging.StreamHandler(),
44 | logging.FileHandler("concurrent_download.log")
45 | ]
46 | )
47 | logger = logging.getLogger(__name__)
48 |
49 | # Configure tenacity logger
50 | tenacity_logger = logging.getLogger('tenacity')
51 | tenacity_logger.setLevel(logging.INFO)
52 |
53 | # Add specific loggers for libraries that can be noisy
54 | logging.getLogger('aiohttp').setLevel(logging.WARNING)
55 | logging.getLogger('asyncio').setLevel(logging.WARNING)
56 |
57 | # Rate limiter class for API limits
58 | class RateLimiter:
59 | """Rate limiter to enforce API rate limits"""
60 |
61 | def __init__(self, rate_limit: int, time_period: int = 60):
62 | """
63 | Initialize rate limiter
64 |
65 | Args:
66 | rate_limit: Maximum number of requests allowed in time_period
67 | time_period: Time period in seconds (default: 60 seconds = 1 minute)
68 | """
69 | self.rate_limit = rate_limit
70 | self.time_period = time_period
71 | self.request_timestamps = deque(maxlen=rate_limit)
72 | self.lock = asyncio.Lock()
73 |
74 |     async def acquire(self):
75 |         """
76 |         Acquire permission to make a request, waiting if necessary
77 |         """
78 |         while True:
79 |             async with self.lock:
80 |                 current_time = time.time()
81 |
82 |                 # Drop timestamps that have fallen outside the time window
83 |                 while self.request_timestamps and current_time - self.request_timestamps[0] >= self.time_period:
84 |                     self.request_timestamps.popleft()
85 |
86 |                 # If we haven't reached the limit yet, record the request and proceed
87 |                 if len(self.request_timestamps) < self.rate_limit:
88 |                     self.request_timestamps.append(current_time)
89 |                     return
90 |
91 |                 # Otherwise compute how long until the oldest request leaves the window
92 |                 wait_time = self.time_period - (current_time - self.request_timestamps[0])
93 |
94 |             # Sleep outside the lock so other tasks are not blocked while we wait,
95 |             # then loop and re-check; recursing while holding the lock would
96 |             # deadlock because asyncio.Lock is not re-entrant
97 |             logger.debug(f"Rate limit reached. Waiting {wait_time:.2f} seconds")
98 |             await asyncio.sleep(wait_time)
99 |
100 |
101 | # Constants for filename generation
102 | LETTERS = string.ascii_uppercase
103 | DIGITS = string.digits
104 |
105 |
106 | def generate_filename(index: int, file_ext: str = None) -> str:
107 | """
108 | Generate a filename in the format AAA_000, AAA_001, etc.
109 |
110 | Args:
111 | index: Sequential number to convert to the AAA_000 format
112 | file_ext: Optional file extension (with dot)
113 |
114 | Returns:
115 | str: Unique filename
116 | """
117 | # Calculate letter part (AAA, AAB, etc.)
118 | letter_base = ord('A') # ASCII code for 'A'
119 | first_letter = chr(letter_base + (index // (26*26)) % 26)
120 | second_letter = chr(letter_base + (index // 26) % 26)
121 | third_letter = chr(letter_base + index % 26)
122 |
123 | # Calculate number part (000, 001, etc.)
124 | number_part = f"{(index % 1000):03d}"
125 |
126 | letters = f"{first_letter}{second_letter}{third_letter}"
127 | digits = number_part
128 |
129 | if file_ext:
130 | return f"{letters}_{digits}.{file_ext}"
131 | else:
132 | return f"{letters}_{digits}"
133 |
134 |
135 | def get_file_extension_from_url(url: str) -> str:
136 | """
137 | Extract file extension from URL or guess based on content type
138 |
139 | Args:
140 | url: URL to extract extension from
141 |
142 | Returns:
143 | str: File extension (without dot)
144 | """
145 | # First try to get extension from URL path
146 | path = urlparse(url).path
147 | ext = os.path.splitext(path)[1].lower()
148 |
149 | if ext and ext.startswith('.'):
150 | return ext[1:] # Remove the leading dot
151 |
152 | # If no extension found, return a default
153 | return "bin"
154 |
155 |
156 | def get_mime_type(url: str) -> str:
157 | """
158 | Get MIME type from URL
159 |
160 | Args:
161 | url: URL to get MIME type for
162 |
163 | Returns:
164 | str: MIME type
165 | """
166 | mime_type, _ = mimetypes.guess_type(url)
167 | return mime_type if mime_type else "application/octet-stream"
168 |
169 |
170 | async def get_base_url(url: str) -> str:
171 | """
172 | Extract base URL from a full URL
173 |
174 | Args:
175 | url: Full URL
176 |
177 | Returns:
178 | str: Base URL (scheme + netloc)
179 | """
180 | if not url.startswith(("http://", "https://")):
181 | url = f"https://{url}"
182 | parsed_url = urlparse(url)
183 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
184 | return base_url
185 |
186 |
187 | async def setup_session(session: aiohttp.ClientSession, url: str, headers: Dict[str, str]) -> Dict[str, str]:
188 | """
189 | Initialize the session with base headers
190 |
191 | Args:
192 | session: aiohttp ClientSession
193 | url: URL to access
194 | headers: Headers to use
195 |
196 | Returns:
197 | Dict[str, str]: Updated headers
198 | """
199 | base_url = await get_base_url(url)
200 | initial_url = base_url
201 | try:
202 | async with session.get(initial_url, headers=headers, timeout=10) as response:
203 | await response.text()
204 | except Exception as e:
205 | logger.warning(f"Failed to setup session for {base_url}: {e}")
206 | return headers
207 |
208 |
209 | async def write_file(filename: str, content: bytes, output_path: str = "./") -> str:
210 | """
211 | Write downloaded content to a file
212 |
213 | Args:
214 | filename: Name of the file
215 | content: Binary content to write
216 | output_path: Directory to write to
217 |
218 | Returns:
219 | str: Path to the written file
220 | """
221 | path_to_file = os.path.join(output_path, filename)
222 | async with aiofiles.open(path_to_file, 'wb') as file:
223 | await file.write(content)
224 | return path_to_file
225 |
226 |
227 | def user_agent_generator() -> Iterator[str]:
228 | """
229 | Generate random user-agents to avoid bot detection
230 |
231 | Yields:
232 | str: Random user agent string
233 | """
234 | templates = [
235 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36",
236 | "Mozilla/5.0 ({os}) Gecko/20100101 {browser}/{version}",
237 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{version} Safari/537.36 {browser}/{alt_version}"
238 | ]
239 | operating_systems = [
240 | "Windows NT 10.0; Win64; x64",
241 | "Macintosh; Intel Mac OS X 10_15_7",
242 | "X11; Linux x86_64",
243 | "Windows NT 6.1; Win64; x64",
244 | "Android 12; Mobile"
245 | ]
246 | browsers = [
247 | ("Chrome", random.randint(90, 110), "Chrome"),
248 | ("Firefox", random.randint(90, 110), "Firefox"),
249 | ("Edge", random.randint(90, 110), "Edg"),
250 | ("Safari", random.randint(600, 610), "Safari")
251 | ]
252 | while True:
253 | template = random.choice(templates)
254 | os_name = random.choice(operating_systems)
255 | browser, version, alt_browser = random.choice(browsers)
256 | full_version = f"{version}.0.{random.randint(1000, 9999)}"
257 | alt_version = f"{random.randint(90, 110)}.0.{random.randint(1000, 9999)}"
258 | user_agent = template.format(os=os_name, browser=browser, version=full_version, alt_browser=alt_browser, alt_version=alt_version)
259 | yield user_agent
260 |
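# Minimal usage sketch for user_agent_generator: it is an infinite generator,
# so callers draw values with next() rather than iterating it to exhaustion.
#
#     ua_gen = user_agent_generator()
#     headers = {"User-Agent": next(ua_gen)}  # fresh randomised UA per request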
261 |
262 | @retry(stop=(stop_after_attempt(3) | stop_after_delay(30)),
263 | wait=wait_exponential(multiplier=1, min=2, max=10),
264 | retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)),
265 | reraise=True,
266 | before_sleep=before_sleep_log(tenacity_logger, logging.INFO))
267 | async def make_request(session, requester, url, headers, timeout):
268 | """Make a request with tenacity retry logic"""
269 | async with requester(
270 | url,
271 | headers=headers,
272 | allow_redirects=True,
273 | max_redirects=10,
274 | verify_ssl=False,
275 | timeout=timeout
276 | ) as response:
277 | content = None
278 | if response.status == 200:
279 | content = await response.read()
280 | return response.status, content
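# Retry semantics of make_request, as configured above: tenacity stops after
# 3 attempts or once 30 seconds have elapsed (whichever comes first), waits
# 2-10 seconds with exponential backoff between attempts, and only retries on
# aiohttp.ClientError / asyncio.TimeoutError. With reraise=True the last
# exception is re-raised after exhaustion instead of a tenacity.RetryError.
# A standalone sketch of the same policy (illustrative only, not used below):
#
#     @retry(stop=stop_after_attempt(3) | stop_after_delay(30),
#            wait=wait_exponential(multiplier=1, min=2, max=10),
#            retry=retry_if_exception_type(aiohttp.ClientError),
#            reraise=True)
#     async def fetch_bytes(session, url):
#         async with session.get(url) as resp:
#             return await resp.read()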
281 |
282 | async def download_file(row_index: int, url: str, semaphore: asyncio.Semaphore,
283 | args: argparse.Namespace, user_agent: str, rate_limiter: RateLimiter,
284 | retry_count: int = 0) -> Tuple[bool, str, str, int]:
285 | """
286 | Download a file from a URL
287 |
288 | Args:
289 | row_index: Index in the dataframe
290 | url: URL to download
291 | semaphore: Semaphore for concurrency control
292 | args: Command-line arguments
293 |         user_agent: User agent to use
        rate_limiter: Shared RateLimiter instance used to throttle outgoing requests
294 |         retry_count: Current retry count
295 |
296 | Returns:
297 | Tuple[bool, str, str, int]: (success, filename, error_message, retry_count)
298 | """
299 | # Skip empty URLs
300 | if pd.isna(url) or not url:
301 | return (False, "", "Empty URL", retry_count + 1)
302 |
303 | # Get base URL for referer
304 | base_url = await get_base_url(url)
305 | parsed_url = urlparse(url)
306 | domain = parsed_url.netloc
307 |
308 | # Ensure URL has scheme
309 | if not url.startswith(("http://", "https://")):
310 | url = f"https://{url}"
311 |
312 | # Get file extension from URL
313 | file_ext = get_file_extension_from_url(url)
314 |
315 | # Generate unique filename
316 | filename = generate_filename(row_index, file_ext)
317 |
318 | # Enhanced headers with common browser-like attributes to bypass 403 errors
319 | headers = {
320 | 'User-Agent': user_agent,
321 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
322 | 'Accept-Language': 'en-US,en;q=0.5',
323 | 'Accept-Encoding': 'gzip, deflate, br',
324 | 'Connection': 'keep-alive',
325 | 'Upgrade-Insecure-Requests': '1',
326 | 'Sec-Fetch-Dest': 'document',
327 | 'Sec-Fetch-Mode': 'navigate',
328 | 'Sec-Fetch-Site': 'cross-site',
329 | 'Pragma': 'no-cache',
330 | 'Cache-Control': 'no-cache',
331 | 'TE': 'trailers',
332 | 'Referer': f"https://www.google.com/search?q={domain}",
333 | 'Origin': base_url,
334 | 'DNT': '1'
335 | }
336 |
337 | # Add cookie handling if needed for specific domains
338 | cookies = {}
339 | if 'europarl.europa.eu' in url or 'data.europarl.europa.eu' in url:
340 | cookies = {
341 | 'cookie_consent': 'accepted',
342 | 'ec_cookiepopin': 'NjY1ODJjNDg5NDc1ODlkNzYwZDA0OTU5NzJkYWI2ZTc',
343 | 'JSESSIONID': f"session-id-{random.randint(100000000, 999999999)}",
344 | 'loadedEP': 'true',
345 | 'GUEST_LANGUAGE_ID': 'en_US'
346 | }
347 |
348 | async with semaphore:
349 | # Implement exponential backoff
350 | sleep_time = args.sleep * (2 ** retry_count)
351 | await asyncio.sleep(random.uniform(sleep_time, sleep_time * 1.5))
352 |
353 | # Set up timeout with exponential backoff
354 | timeout = aiohttp.ClientTimeout(total=60 + (retry_count * 15))
355 |
356 | try:
357 | # Acquire permission from rate limiter before making request
358 | await rate_limiter.acquire()
359 |
360 | # Create session with proper connection pooling
361 | conn = aiohttp.TCPConnector(
362 | ssl=False,
363 | limit_per_host=2, # Limit concurrent connections per host
364 | force_close=False, # Keep connections open for reuse
365 | enable_cleanup_closed=True
366 | )
367 |
368 | async with aiohttp.ClientSession(
369 | connector=conn,
370 | timeout=timeout,
371 | trust_env=True, # Use environment for proxy information
372 | cookies=cookies # Use our cookies
373 | ) as session:
374 | # Try to access the base domain first to establish cookies
375 | if retry_count == 0: # Only do this on first attempt
376 | try:
377 | # Get permission from rate limiter for the base URL request
378 | await rate_limiter.acquire()
379 |
380 | async with session.get(
381 | base_url,
382 | headers=headers,
383 | allow_redirects=True,
384 | timeout=aiohttp.ClientTimeout(total=15)
385 | ) as response:
386 | await response.read()
387 | await asyncio.sleep(random.uniform(1.0, 2.0))
388 | except Exception as e:
389 | logger.debug(f"Initial base URL visit failed: {str(e)}")
390 |
391 | # Determine request method (get or post)
392 | request_method = args.request_method.lower()
393 | requester = getattr(session, request_method)
394 |
395 | # Attempt the download with tenacity-powered retry logic
396 | try:
397 | # Use tenacity retry wrapper for the actual request
398 | status, content = await asyncio.wait_for(
399 | make_request(session, requester, url, headers, timeout),
400 | timeout=args.request_timeout # Overall timeout for the whole operation
401 | )
402 |
403 | if status == 200 and content:
404 | await write_file(filename, content, args.output_dir)
405 | logger.info(f"Successfully downloaded {filename} from {url}")
406 | return (True, filename, "", retry_count)
407 | elif status in [403, 429]:
408 | # Special handling for 403/429 (Forbidden/Too Many Requests)
409 | await asyncio.sleep(random.uniform(3.0, 5.0)) # Longer wait
410 | logger.warning(f"Received {status} for {url}")
411 | error_msg = f"HTTP {status}"
412 | return (False, filename, error_msg, retry_count + 1)
413 | else:
414 | error_msg = f"HTTP {status}"
415 | logger.error(f"Failed to download {url}: {error_msg}")
416 | return (False, filename, error_msg, retry_count + 1)
417 |
418 | except asyncio.TimeoutError:
419 | logger.error(f"Overall timeout exceeded for {url}")
420 | return (False, filename, "Request timed out", retry_count + 1)
421 | except Exception as e:
422 | logger.error(f"Error downloading {url}: {str(e)}")
423 | return (False, filename, f"Download error: {str(e)}", retry_count + 1)
424 |
425 | except aiohttp.ClientError as e:
426 | error_msg = f"Client error: {str(e)}"
427 | logger.error(f"ClientError while downloading {url}: {error_msg}")
428 | except asyncio.TimeoutError:
429 | error_msg = "Timeout error"
430 | logger.error(f"Timeout while downloading {url}")
431 | except Exception as e:
432 | error_msg = f"Unexpected error: {str(e)}"
433 | logger.error(f"Error while downloading {url}: {error_msg}")
434 |
435 | return (False, filename, error_msg, retry_count + 1)
436 |
437 |
438 | async def download_files(df: pd.DataFrame, url_column: str, semaphore: asyncio.Semaphore,
439 | args: argparse.Namespace, rate_limiter: RateLimiter,
440 | max_retries: int = 3) -> pd.DataFrame:
441 | """
442 | Download files from URLs in a DataFrame using internal batching for memory efficiency
443 |
444 | Args:
445 | df: DataFrame with URLs
446 | url_column: Name of the column containing URLs
447 | semaphore: Semaphore for concurrency control
448 |         args: Command-line arguments
        rate_limiter: Shared RateLimiter instance used to throttle outgoing requests
449 |         max_retries: Maximum number of retries per URL
450 |
451 | Returns:
452 | pd.DataFrame: Updated DataFrame with download results
453 | """
454 | # Add columns for filenames and download status if they don't exist
455 | if 'filename' not in df.columns:
456 | df['filename'] = None
457 | if 'download_success' not in df.columns:
458 | df['download_success'] = False
459 | if 'error_message' not in df.columns:
460 | df['error_message'] = ""
461 |
462 | # Create a user agent generator
463 | user_agent_gen = user_agent_generator()
464 |
465 | # Calculate output parquet path (needed for periodic saves)
466 | output_parquet = os.path.join(args.output_dir, os.path.basename(args.input_parquet))
467 | if args.output_parquet:
468 | output_parquet = args.output_parquet
469 |
470 | # Get total number of unprocessed rows
471 | unprocessed_mask = pd.isna(df['download_success']) | ~df['download_success']
472 | unprocessed_indices = df[unprocessed_mask].index.tolist()
473 | total_unprocessed = len(unprocessed_indices)
474 |
475 | logger.info(f"Found {total_unprocessed} unprocessed rows out of {len(df)} total")
476 |
477 | internal_batch_size = args.internal_batch_size
478 | successful_downloads = 0
479 | periodic_save_count = args.save_every
480 |
481 | # Process in batches to save memory
482 | for batch_start in range(0, total_unprocessed, internal_batch_size):
483 | batch_end = min(batch_start + internal_batch_size, total_unprocessed)
484 | current_batch_indices = unprocessed_indices[batch_start:batch_end]
485 |
486 | logger.info(f"Processing batch {batch_start//internal_batch_size + 1} of {(total_unprocessed + internal_batch_size - 1)//internal_batch_size}: rows {batch_start} to {batch_end-1}")
487 |
488 | # Create tasks for current batch
489 | tasks = []
490 | for row_idx in current_batch_indices:
491 | url = df.at[row_idx, url_column]
492 | # Get the retry count from the dataframe if it exists
493 | retry_count = int(df.at[row_idx, 'retry_count']) if 'retry_count' in df.columns and pd.notna(df.at[row_idx, 'retry_count']) else 0
494 |
495 | # Skip URLs that have failed too many times
496 | if args.skip_failed_after > 0 and retry_count >= args.skip_failed_after:
497 | logger.info(f"Skipping URL at row {row_idx} - too many failures: {retry_count}")
498 | continue
499 |
500 | if pd.notna(url):
501 | task = asyncio.create_task(
502 | download_file(
503 | row_idx, url, semaphore, args,
504 | next(user_agent_gen), rate_limiter, retry_count
505 | )
506 | )
507 | tasks.append((row_idx, task))
508 |
509 | # Process tasks in current batch
510 | for row_idx, task in tasks:
511 | try:
512 | success, filename, error_msg, new_retry_count = await task
513 | df.at[row_idx, 'filename'] = filename
514 | df.at[row_idx, 'download_success'] = success
515 | df.at[row_idx, 'error_message'] = error_msg
516 | df.at[row_idx, 'retry_count'] = new_retry_count
517 |
518 | # Count successful downloads and save periodically
519 | if success:
520 | successful_downloads += 1
521 | if successful_downloads % periodic_save_count == 0:
522 | logger.info(f"Periodic save: Completed {successful_downloads} downloads. Saving progress to {output_parquet}")
523 | df.to_parquet(output_parquet, index=False)
524 |
525 | except Exception as e:
526 | logger.error(f"Error processing task for row {row_idx}: {e}")
527 | df.at[row_idx, 'download_success'] = False
528 | df.at[row_idx, 'error_message'] = f"Task error: {str(e)}"
529 |
530 | # Save after each batch
531 | logger.info(f"Batch complete. Saving progress to {output_parquet}")
532 | df.to_parquet(output_parquet, index=False)
533 |
534 | return df
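# Worked example of the batching above (hypothetical numbers): with 250
# unprocessed rows and --internal-batch-size 100, the loop processes index
# slices [0:100], [100:200] and [200:250]. The parquet file is rewritten after
# every completed batch and additionally after every --save-every successful
# downloads, so an interrupted run can pick up where it left off via --resume.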
535 |
536 |
537 | async def run(args: argparse.Namespace) -> None:
538 | """
539 | Main function to run the downloader
540 |
541 | Args:
542 | args: Command-line arguments
543 | """
544 | # Ensure output directory exists
545 | os.makedirs(args.output_dir, exist_ok=True)
546 |
547 | # Determine output parquet path
548 | output_parquet = os.path.join(args.output_dir, os.path.basename(args.input_parquet))
549 | if args.output_parquet:
550 | output_parquet = args.output_parquet
551 |
552 | # Check if we're resuming from a previous run
553 | resuming = False
554 | if os.path.exists(output_parquet) and args.resume:
555 | try:
556 | logger.info(f"Found existing output parquet file at {output_parquet}. Attempting to resume.")
557 | df = pd.read_parquet(output_parquet)
558 | resuming = True
559 |
560 | # Count successful downloads for statistics
561 | existing_success_count = df['download_success'].sum() if 'download_success' in df.columns else 0
562 | logger.info(f"Resuming from previous run with {existing_success_count} already completed downloads")
563 |
564 | except Exception as e:
565 | logger.warning(f"Failed to read existing parquet for resuming: {e}. Starting fresh.")
566 | resuming = False
567 |
568 | # If not resuming, read the input parquet
569 | if not resuming:
570 | logger.info(f"Reading input parquet file: {args.input_parquet}")
571 | df = pd.read_parquet(args.input_parquet)
572 |
573 | original_count = len(df)
574 | logger.info(f"Loaded {original_count} rows from parquet file")
575 |
576 | # Check if URL column exists
577 | if args.url_column not in df.columns:
578 | raise ValueError(f"URL column '{args.url_column}' not found in parquet file. Available columns: {', '.join(df.columns)}")
579 |
580 | # Create semaphore for concurrency control
581 | semaphore = asyncio.Semaphore(args.concurrency)
582 |
583 |     # Create the rate limiter from the configured --rate-limit / --rate-period values
584 | rate_limiter = RateLimiter(args.rate_limit, args.rate_period)
585 | logger.info(f"Using rate limit of {args.rate_limit} requests per {args.rate_period} seconds")
586 |
587 | # Process downloads
588 | logger.info(f"Starting downloads with concurrency: {args.concurrency}")
589 | updated_df = await download_files(df, args.url_column, semaphore, args, rate_limiter, args.max_retries)
590 |
591 | # Save updated DataFrame to parquet
592 | logger.info(f"Saving updated parquet file to: {output_parquet}")
593 | updated_df.to_parquet(output_parquet, index=False)
594 |
595 | # Report statistics
596 | success_count = updated_df['download_success'].sum() if 'download_success' in updated_df.columns else 0
597 | logger.info(f"Download summary:")
598 | logger.info(f" Total URLs: {original_count}")
599 | logger.info(f" Successfully downloaded: {success_count}")
600 | logger.info(f" Failed: {original_count - success_count}")
601 | logger.info(f"Updated parquet file saved to: {output_parquet}")
602 |
603 |
604 | def parse_args() -> argparse.Namespace:
605 | """
606 | Parse command-line arguments
607 |
608 | Returns:
609 | argparse.Namespace: Parsed arguments
610 | """
611 | parser = argparse.ArgumentParser(
612 | description="Concurrent downloader for files from a parquet file",
613 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
614 | )
615 |
616 | parser.add_argument('--input-parquet', required=True,
617 | help='Path to the input parquet file')
618 | parser.add_argument('--url-column', required=True,
619 | help='Column name containing URLs in the parquet file')
620 | parser.add_argument('--output-dir', default='./downloads',
621 | help='Directory to save downloaded files')
622 | parser.add_argument('--output-parquet',
623 | help='Path to save the updated parquet file')
624 | parser.add_argument('--internal-batch-size', type=int, default=100,
625 | help='Number of files to process in one internal batch (for memory efficiency)')
626 | parser.add_argument('--save-every', type=int, default=50,
627 | help='Save progress to parquet file every N successful downloads')
628 | parser.add_argument('--concurrency', type=int, default=5,
629 | help='Number of concurrent downloads')
630 | parser.add_argument('--max-retries', type=int, default=3,
631 | help='Maximum retry attempts for failed downloads')
632 | parser.add_argument('--sleep', type=float, default=0.5,
633 | help='Base sleep time between requests in seconds')
634 | parser.add_argument('--request-method', choices=['get', 'post'], default='get',
635 | help='HTTP request method to use')
636 | parser.add_argument('--resume', action='store_true',
637 | help='Resume downloading from a previously saved checkpoint')
638 | parser.add_argument('--debug', action='store_true',
639 | help='Enable debug logging')
640 | parser.add_argument('--retry-interval', type=float, default=5.0,
641 | help='Time to wait between retries for 403/429 errors (seconds)')
642 | parser.add_argument('--rate-limit', type=int, default=100,
643 | help='Maximum number of requests per time period (rate limiting)')
644 | parser.add_argument('--rate-period', type=int, default=60,
645 | help='Time period in seconds for rate limiting')
646 | parser.add_argument('--request-timeout', type=int, default=45,
647 | help='Overall timeout in seconds for each request')
648 | parser.add_argument('--skip-failed-after', type=int, default=3,
649 | help='Skip URLs that failed more than this many times')
650 |
651 | return parser.parse_args()
652 |
653 |
654 | async def main() -> None:
655 | """
656 | Main entry point
657 | """
658 | args = parse_args()
659 | try:
660 | await run(args)
661 | except Exception as e:
662 | logger.error(f"Error in main: {e}")
663 | raise
664 |
665 |
666 | if __name__ == "__main__":
667 | try:
668 | asyncio.run(main())
669 | except KeyboardInterrupt:
670 | logger.info("Process interrupted by user")
671 | except Exception as e:
672 | logger.error(f"Unhandled exception: {e}")
673 |
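# Example invocation (paths and column name are hypothetical):
#
#     python concurrent_downloader.py \
#         --input-parquet urls.parquet \
#         --url-column pdf_url \
#         --output-dir ./downloads \
#         --concurrency 5 \
#         --rate-limit 100 --rate-period 60 \
#         --resume
#
# The updated parquet (with filename, download_success, error_message and
# retry_count columns) is written into --output-dir next to the downloaded
# files unless --output-parquet points somewhere else.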
--------------------------------------------------------------------------------
/pipeline/scripts/sample_for_training.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Script to sample data from the kallipos processed data.
4 |
5 | This script performs the following:
6 | 1. Creates samples from the 'Κεφάλαιο' document type, split into 2 parts
7 | 2. Creates samples from all document types except 'Κεφάλαιο', split into 2 parts
8 | 3. Converts all samples to text format for analysis
9 | """
10 |
11 | import os
12 | import logging
13 | from pathlib import Path
14 | from glossapi.sampler import Sampler
15 |
16 | # Set up logging
17 | logging.basicConfig(
18 | level=logging.INFO,
19 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
20 | )
21 | logger = logging.getLogger(__name__)
22 |
23 | # Base directory with processed data
24 | WORKING_DIR = "downloads/"
25 |
26 | def main():
27 | logger.info("Creating sampler instance...")
28 | sampler = Sampler(WORKING_DIR)
29 |
30 |     # Sample from the Κεφάλαιο document type (split into 2 parts)
31 | logger.info("Sampling from Κεφάλαιο document type...")
32 | kefalaia_parts = sampler.sample(
33 | sample_from={'document_type': 'Κεφάλαιο','header' : 'regex(Βλάχοι)'},
34 | n=5,
35 | parts=2,
36 | output_name="kefalaia_samples"
37 | )
38 |
39 |     # Sample from all document types except Κεφάλαιο (split into 2 parts)
40 | logger.info("Sampling from all document types except Κεφάλαιο...")
41 | non_kefalaia_parts = sampler.sample(
42 | sample_from_all_except={'document_type': 'Κεφάλαιο','header' : 'regex(Ανάλυση)'},
43 | n=2,
44 | parts=2,
45 | output_name="non_kefalaia_samples"
46 | )
47 |
48 | # Convert each part to text with custom folder names
49 | logger.info("Converting kefalaia part 1 to text...")
50 | sampler.to_text(kefalaia_parts[0], folder_name="kefalaia_chapter_1")
51 |
52 | logger.info("Converting kefalaia part 2 to text...")
53 | sampler.to_text(kefalaia_parts[1], folder_name="kefalaia_chapter_2")
54 |
55 | logger.info("Converting non-kefalaia part 1 to text...")
56 | sampler.to_text(non_kefalaia_parts[0], folder_name="non_kefalaia_1")
57 |
58 | logger.info("Converting non-kefalaia part 2 to text...")
59 | sampler.to_text(non_kefalaia_parts[1], folder_name="non_kefalaia_2")
60 |
61 | # Print summary of samples
62 | logger.info("\nSampling summary:")
63 | logger.info(f"Kefalaia part 1: {len(kefalaia_parts[0])} rows from {len(kefalaia_parts[0]['filename'].unique())} unique files")
64 | logger.info(f"Kefalaia part 2: {len(kefalaia_parts[1])} rows from {len(kefalaia_parts[1]['filename'].unique())} unique files")
65 | logger.info(f"Non-kefalaia part 1: {len(non_kefalaia_parts[0])} rows from {len(non_kefalaia_parts[0]['filename'].unique())} unique files")
66 | logger.info(f"Non-kefalaia part 2: {len(non_kefalaia_parts[1])} rows from {len(non_kefalaia_parts[1]['filename'].unique())} unique files")
67 |
68 | # Print output information
69 | logger.info("\nOutput locations:")
70 | logger.info(f"CSV files: {sampler.datasets_dir}")
71 | logger.info(f"Text files: {sampler.text_dir}")
72 |
73 | if __name__ == "__main__":
74 | main()
75 |
--------------------------------------------------------------------------------
/pipeline/scripts/test_section_reconstruction.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import json
4 | import pandas as pd
5 |
6 | # Adjust path to import from the parent directory's src
7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src')))
8 |
9 | from glossapi.gloss_section import GlossSection, Section
10 |
11 | def reconstruct_section(processed_content: list) -> str:
12 | """
13 | Reconstructs the original raw text from the processed section content.
14 |
15 | Args:
16 | processed_content: The list of dictionaries representing the section's
17 | categorized content (e.g., [{'text': '...'}, {'list': '...'}]).
18 |
19 | Returns:
20 | The reconstructed raw text as a single string.
21 | """
22 | reconstructed_lines = []
23 | for item in processed_content:
24 | # The structure is {type: value}
25 | content_type, content_value = list(item.items())[0]
26 | # The value itself contains the original line breaks
27 | reconstructed_lines.append(content_value)
28 |
29 | # Join the content blocks with newlines to form the full raw text
30 | return "\n".join(reconstructed_lines)
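# Illustrative round trip, derived from the joining logic above:
#
#     reconstruct_section([{"text": "First paragraph."}, {"list": "- item 1\n  continued"}])
#     # -> "First paragraph.\n- item 1\n  continued"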
31 |
32 |
33 |
34 | def test_reconstruction_from_parquet(parquet_path: str = "/mnt/data/pipeline_refactor/output/sections/sections_for_annotation.parquet", test_all: bool = True):
35 | """
36 | Tests reconstruction by reading data from the pipeline's output Parquet file.
37 |
38 | Args:
39 | parquet_path: Path to the sections Parquet file.
40 |         test_all: Unused; every section in the Parquet file is tested.
41 | """
42 | print(f"\n--- Running Reconstruction Test from Parquet ({parquet_path}) ---")
43 |
44 | if not os.path.exists(parquet_path):
45 | print(f"❌ ERROR: Parquet file not found at {parquet_path}")
46 | return False
47 |
48 | try:
49 | df = pd.read_parquet(parquet_path)
50 | print(f"Loaded {len(df)} sections from Parquet.")
51 | except Exception as e:
52 | print(f"❌ ERROR: Failed to load Parquet file: {e}")
53 | return False
54 |
55 | if len(df) == 0:
56 | print("⚠️ WARN: Parquet file is empty. No sections to test.")
57 | return True # Technically passed as no failures
58 |
59 | # Test all sections
60 | sample_df = df
61 | print(f"Testing reconstruction for all {len(sample_df)} sections...")
62 |
63 | all_passed = True
64 | failures = []
65 |
66 | for index, row in sample_df.iterrows():
67 | raw_content = row['section'] # This column contains the raw text
68 | section_json_str = row['json_section'] # This column contains the JSON representation
69 | filename = row['filename']
70 | header = row['header']
71 |
72 | try:
73 | processed_content = json.loads(section_json_str)
74 | except json.JSONDecodeError as e:
75 | print(f"❌ FAILED: Section {index} (File: {filename}, Header: '{header}') - Invalid JSON: {e}")
76 | failures.append(f"Index {index} (File: {filename}, Header: '{header}') - JSON Decode Error")
77 | all_passed = False
78 | continue
79 |
80 | reconstructed_text = reconstruct_section(processed_content)
81 |
82 | if raw_content != reconstructed_text:
83 | all_passed = False
84 | failures.append(f"Index {index} (File: {filename}, Header: '{header}') - Content Mismatch")
85 | print(f"❌ FAILED: Section {index} (File: {filename}, Header: '{header}') - Mismatch detected!")
86 | # You could add detailed diff printing here if needed for debugging
87 | # print(f" Original:\n```\n{raw_content}\n```")
88 | # print(f" Reconstructed:\n```\n{reconstructed_text}\n```")
89 | # else:
90 | # Optional: Print pass messages for verbosity
91 | # print(f"✅ PASSED: Section {index} (File: {filename}, Header: '{header}')")
92 |
93 | print("\n--- Parquet Test Summary ---")
94 | if all_passed:
95 | print(f"✅ All {len(sample_df)} sampled sections reconstructed successfully from Parquet!")
96 | else:
97 | print(f"❌ Reconstruction failed for {len(failures)}/{len(sample_df)} sampled sections:")
98 | for failure in failures:
99 | print(f" - {failure}")
100 |
101 | return all_passed
102 |
103 | if __name__ == "__main__":
104 | # Run the test using the real Parquet data
105 | test_passed = test_reconstruction_from_parquet()
106 |
107 | print("\n--- Overall Test Results ---")
108 | if test_passed:
109 | print("✅✅ All sections reconstructed successfully! ✅✅")
110 | sys.exit(0)
111 | else:
112 | print("❌❌ Some sections failed reconstruction. ❌❌")
113 | sys.exit(1)
114 |
--------------------------------------------------------------------------------
/pipeline/src/glossapi/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | GlossAPI Library
3 |
4 | A library for processing academic texts in Greek and other languages:
5 | - Extracting content from PDFs and other formats with Docling
6 | - Robust batch processing with error isolation and automatic resumption
7 | - Clustering documents based on extraction quality
8 | - Extracting and cleaning academic sections
9 | - Classifying sections using machine learning
10 |
11 | This is an open source project that provides tools for linguistic annotations
12 | and text processing, with a special focus on the Greek language.
13 | """
14 |
15 | from .gloss_extract import GlossExtract
16 | from .gloss_section_classifier import GlossSectionClassifier
17 | from .corpus import Corpus
18 | from .sampler import Sampler
19 | from .gloss_section import Section, GlossSection
20 | from .gloss_downloader import GlossDownloader
21 |
22 | __all__ = [
23 | 'GlossExtract',
24 | 'GlossSection',
25 | 'GlossSectionClassifier',
26 | 'Corpus',
27 | 'Sampler',
28 | 'Section',
30 | 'GlossDownloader'
31 | ]
32 |
33 | __version__ = '0.0.10'
34 |
--------------------------------------------------------------------------------
/pipeline/src/glossapi/gloss_section.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import json
4 | from typing import List, Tuple, Dict, Any
5 | import pandas as pd
6 | import pyarrow as pa
7 | import pyarrow.parquet as pq
8 |
9 |
10 | class Section:
11 | """
12 | A data structure representing a section in an academic document.
13 |
14 | Attributes:
15 | title (str): The section title
16 | start_line (int): The starting line number in the original document
17 | end_line (int): The ending line number in the original document
18 | content (List[Dict]): List of content elements. Each element is a dict with one of these keys:
19 | - 'text': Regular text content including empty lines
20 | - 'table': Table content in markdown format
21 | - 'list': List items with their continuation lines
22 | - 'other': Standalone references, image placeholders, etc.
23 | raw_content (str): Raw text content of the section (unprocessed)
24 | has_table (bool): Flag indicating if section contains tables
25 | has_list (bool): Flag indicating if section contains lists
26 | has_text (bool): Flag indicating if section contains regular text
27 | has_other (bool): Flag indicating if section contains other content (refs, images, etc)
28 | """
29 | def __init__(self, title: str = "", start_line: int = 0):
30 | self.title = title
31 | self.start_line = start_line
32 | self.end_line = start_line
33 | self.content = []
34 | self.raw_content = ""
35 | self.has_table = False
36 | self.has_list = False
37 | self.has_other = False
38 | self.has_text = False
39 |
40 | def add_content(self, content_type: str, content_value: str):
41 | """Add a content element to this section"""
42 | # Create a dictionary with the content type as the key
43 | content_dict = {content_type: content_value}
44 | self.content.append(content_dict)
45 |
46 | # Update flags based on content type
47 | if content_type == "table":
48 | self.has_table = True
49 | elif content_type == "list":
50 | self.has_list = True
51 | elif content_type == "other":
52 | self.has_other = True
53 | elif content_type == "text":
54 | self.has_text = True
55 |
56 |
57 | class GlossSection:
58 | """
59 | A class for sectioning, processing, and exporting academic document sections to Parquet format.
60 | Handles parsing markdown documents, identifying structural elements like headers, tables,
61 | lists, and footnotes, and processes them for further analysis.
62 | """
63 |
64 | def _is_list_bullet_line(self, line: str) -> bool:
65 | """
66 | Check if a line indicates a bullet item.
67 | Examples:
68 | - 1. text
69 | - text
70 | - text
71 | 1. text
72 | etc.
73 |
74 | We'll unify them with a small regex set.
75 | """
76 | test = line.strip()
77 | if not test:
78 | return False
79 |
80 | # This pattern matches lines that begin with:
81 | # - optional dash, then optional digits, then optional bullet symbols
82 | # - final check for '.' or ' ' => bullet indicator
83 | # e.g. "- 1. ", "- ", "- ", "1. ", "2."
84 | bullet_pat = re.compile(r'''
85 | ^
86 | (
87 | -\s*\d*\.?\s*[\u2022\u2023\u25E6\u00BB\u2023]* # dash + optional digits + period + bullet char
88 | |\d+\.\s+
89 | |-\s*
90 | )
91 | .*
92 | ''', re.VERBOSE)
93 | return bool(bullet_pat.match(test))
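    # Illustrative results, checked against the regex above (not exhaustive):
    #     "- 1. text"        -> True
    #     "- text"           -> True
    #     "1. text"          -> True
    #     "1.text"           -> False  (numbered form requires a space after the dot)
    #     "plain prose line" -> False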
94 |
95 | def _looks_like_list_paragraph(self, para: str) -> bool:
96 | """
97 | Check if a paragraph is marked as a bullet block by our sentinel.
98 | """
99 | return para.startswith("<<__LIST_ITEM__>>")
100 |
101 | ###############################################################################
102 | # 1) Other Utility Functions
103 | ###############################################################################
104 | def _wrap_text(self, text: str, width: int) -> List[str]:
105 | """Wrap text to a specified width while preserving words."""
106 | words = text.split()
107 | lines = []
108 | current_line = []
109 | current_length = 0
110 |
111 | for word in words:
112 | # +1 for space if not first in line
113 | if current_length + len(word) + (1 if current_line else 0) <= width:
114 | current_line.append(word)
115 | current_length += len(word) + (1 if current_line else 0)
116 | else:
117 | if current_line:
118 | lines.append(" ".join(current_line))
119 | current_line = [word]
120 | current_length = len(word)
121 |
122 | if current_line:
123 | lines.append(" ".join(current_line))
124 |
125 | return lines
126 |
127 | def _is_standalone_reference(self, para: str, min_text_length: int = 10) -> bool:
128 | """
129 | Determine if a paragraph appears to be a standalone reference/footnote marker.
130 | Only very short paragraphs (fewer than a threshold number of characters) that
131 | consist solely of reference-style tokens are flagged.
132 |
133 | Parameters:
134 | - para: The paragraph (as a string) to check.
135 | - min_text_length: A lower bound (in characters) below which the paragraph
136 | is considered too short to be meaningful text.
137 |
138 | Returns:
139 | True if the paragraph appears to be just a reference marker.
140 | """
141 | trimmed = para.strip()
142 |
143 | # Empty lines should never be considered footnotes
144 | if len(trimmed) == 0:
145 | return False
146 |
147 | # Only match if the entire paragraph exactly equals one of the expected reference tokens.
148 | reference_patterns = [
149 | #r'^\d+$', # Only digits (e.g., "12")
150 | #r'^\d+[\-–]\d+$', # A simple digit range (e.g., "12-14")
151 | r'^(Ibid|op\.cit\.?|loc\.cit\.?|et\.al\.?|cf\.)$', # Common citation markers
152 | r'^(βλ\.|πρβλ\.|σσ\.|σελ\.|ό\.π\.)$', # Greek shorthand markers
153 | ]
154 |
155 | # Try each pattern; if one matches the entire trimmed paragraph, flag it.
156 | for pattern in reference_patterns:
157 | if re.match(pattern, trimmed, re.IGNORECASE):
158 | return True
159 |
160 | # Otherwise, we do not consider it a standalone reference.
161 | return False
162 |
163 | def _detect_other_lines(self, paragraphs: List[str], max_length: int = 20, min_text_length: int = 20) -> List[Dict]:
164 | """
165 | Identify short paragraphs that should be categorized as "other" content rather than regular text.
166 |
167 | This function simply categorizes very short lines as "other" and everything else as "text".
168 |
169 | Parameters:
170 | - paragraphs: List of paragraph strings.
171 | - max_length: Maximum length (in characters) a paragraph can have
172 | to be considered for the "other" category.
173 | - min_text_length: Not currently used.
174 |
175 | Returns:
176 | A list of dictionaries with the content type as key and content as value.
177 | """
178 | categorized = []
179 | for para in paragraphs:
180 | trimmed = para.strip()
181 |
182 | # Simply categorize short lines as "other"
183 | if len(trimmed) > 0 and len(trimmed) < max_length:
184 | categorized.append({"other": para})
185 | else:
186 | # Regular text content or empty lines
187 | categorized.append({"text": para})
188 | return categorized
189 |
190 | def _should_merge_paragraphs(self, para1: str, para2: str) -> bool:
191 | """
192 | Decide if para1 and para2 likely form a single continued sentence.
193 | """
194 | if not para1 or not para2:
195 | return False
196 |
197 | p1_end = para1.rstrip()
198 | p2_start = para2.lstrip()
199 |
200 | # Hyphen or open parenthesis
201 | if p1_end.endswith('-') or p1_end.endswith('('):
202 | return True
203 |
204 | end_char_1 = p1_end[-1] if p1_end else ''
205 | start_char_2 = p2_start[0] if p2_start else ''
206 |
207 | # e.g. ends with lower, next starts with lower => likely a single sentence
208 | if end_char_1.islower() and start_char_2.islower():
209 | return True
210 | # ends with punctuation and next starts with lower
211 | if end_char_1 in ',:·' and start_char_2.islower():
212 | return True
213 | # ends with digit, next starts with '°'
214 | if end_char_1.isdigit() and start_char_2 == '°':
215 | return True
216 |
217 | return False
218 |
219 | def _is_table_line(self, line: str) -> bool:
220 | """Check if the line (stripped) starts & ends with '|' => table line."""
221 | ls = line.strip()
222 |         return bool(ls) and ls.startswith("|") and ls.endswith("|")
223 |
224 | def _looks_like_table_block(self, paragraph: str) -> bool:
225 | """
226 | If every non-blank line in paragraph starts & ends with '|', treat as a table block.
227 | """
228 | lines = paragraph.splitlines()
229 | for ln in lines:
230 | ln_str = ln.strip()
231 | if ln_str and (not (ln_str.startswith("|") and ln_str.endswith("|"))):
232 | return False
233 | return True
234 |
235 | def _is_header(self, line: str) -> bool:
236 | """Check if line is a markdown header (#...)."""
237 | return line.strip().startswith('#')
238 |
239 | def _extract_section_level(self, line: str) -> Tuple[int, str]:
240 | """Extract header level and title from a markdown header line."""
241 | match = re.match(r'^(#+)\s*(.+)$', line.strip())
242 | if match:
243 | level = len(match.group(1)) # Count the number of # symbols
244 | title = match.group(2)
245 | return level, title
246 | return 0, line
247 |
248 | def _process_sections(self, lines: List[str]) -> List[Section]:
249 | """
250 | Process text to identify sections based on headers.
251 | Text between two headers becomes a section with the preceding header as title.
252 | This only divides the document into sections based on headers - content
253 | categorization happens in _process_section_content.
254 |
255 | Enhanced to handle:
256 | 1. Documents that start with content before the first header
257 | 2. Documents with no headers at all
258 | 3. Content after the last header
259 |
260 | Parameters:
261 | - lines: List of text lines from the document
262 |
263 | Returns:
264 | List of Section objects representing the document structure
265 | """
266 | sections = []
267 | current_section = None
268 | n = len(lines)
269 | found_any_headers = False
270 |
271 | # Store raw lines between headers
272 | raw_section_lines = []
273 |
274 | # Handle case 1: Document starts with content before any header
275 | # Create an initial section if the first line is not a header
276 | if n > 0 and not self._is_header(lines[0].strip()):
277 | # Use first line as title if it's not empty, otherwise use "Document"
278 | first_line = lines[0].strip() if lines[0].strip() else "Document"
279 | current_section = Section(title=first_line, start_line=0)
280 |
281 | i = 0
282 | while i < n:
283 | raw_line = lines[i].rstrip('\n')
284 |
285 | # Markdown heading - start of a new section
286 | if self._is_header(raw_line.strip()):
287 | found_any_headers = True
288 |
289 | # If we had a previous section, finalize it
290 | if current_section is not None:
291 | current_section.end_line = i - 1
292 |
293 | # Store raw section content
294 | current_section.raw_content = "\n".join(raw_section_lines)
295 | raw_section_lines = []
296 |
297 | sections.append(current_section)
298 |
299 | # Create a new section based on the header
300 | _, title = self._extract_section_level(raw_line)
301 | current_section = Section(title=title, start_line=i)
302 | i += 1
303 | continue
304 |
305 | # Just store the raw line - content processing happens later
306 | if current_section is not None:
307 | raw_section_lines.append(raw_line)
308 | else:
309 | # This should generally not happen since we create an initial section if needed,
310 | # but in case first_line logic changes, keep this safety check
311 | raw_section_lines.append(raw_line)
312 |
313 | i += 1
314 |
315 | # Handle case 2 & 3: Document has no headers or content after the last header
316 | # Finalize the last section if there is one
317 | if current_section:
318 | current_section.end_line = n - 1
319 | current_section.raw_content = "\n".join(raw_section_lines)
320 | sections.append(current_section)
321 | elif raw_section_lines: # Handle case where no section was created but we collected content
322 | first_line = raw_section_lines[0].strip() if raw_section_lines and raw_section_lines[0].strip() else "Document"
323 | default_section = Section(title=first_line, start_line=0)
324 | default_section.end_line = n - 1
325 | default_section.raw_content = "\n".join(raw_section_lines[1:] if len(raw_section_lines) > 1 else raw_section_lines)
326 | sections.append(default_section)
327 |
328 | # Handle case 2: If no headers were found and we have no sections yet, create a default section
329 | if not found_any_headers and not sections and n > 0:
330 | title = lines[0].strip() if lines[0].strip() else "Document"
331 | content = "\n".join(lines[1:] if len(lines) > 1 else lines)
332 | default_section = Section(title=title, start_line=0)
333 | default_section.end_line = n - 1
334 | default_section.raw_content = content
335 | sections.append(default_section)
336 |
337 | return sections
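    # Illustrative outcome of _process_sections: for the input lines
    # ["# Intro", "text a", "## Methods", "text b"] it returns two Section
    # objects titled "Intro" and "Methods", whose raw_content is "text a" and
    # "text b" respectively (the '#' markers are stripped by _extract_section_level).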
338 |
339 | def _process_section_content(self, sections: List[Section]):
340 | """
341 | Process the raw content of each section to categorize it into appropriate content types:
342 | 1. Tables: Identified by markdown table formatting (|)
343 | 2. Lists: Identified by bullet points or numbered items
344 | 3. Other: Standalone references, image placeholders, etc.
345 | 4. Text: All remaining content, including empty lines
346 |
347 | The original structure with line breaks is preserved within each content block.
348 |
349 | Parameters:
350 | - sections: List of Section objects to process
351 | """
352 | for section in sections:
353 | # Clear existing content and start fresh from raw content
354 | section.content = []
355 |
356 | # Split raw content into lines for processing
357 | if not section.raw_content:
358 | continue
359 |
360 | raw_lines = section.raw_content.split('\n')
361 | i = 0
362 | n = len(raw_lines)
363 |
364 | # Buffer to collect text content including empty lines
365 | text_buffer = []
366 |
367 | while i < n:
368 | line = raw_lines[i]
369 |
370 | # 1. Check for tables (lines with | at start and end)
371 | if self._is_table_line(line):
372 | # Flush any text buffer first
373 | if text_buffer:
374 | section.add_content("text", "\n".join(text_buffer))
375 | text_buffer = []
376 |
377 | # Collect all table lines
378 | table_lines = [line]
379 | i += 1
380 | while i < n and self._is_table_line(raw_lines[i]):
381 | table_lines.append(raw_lines[i])
382 | i += 1
383 | # Add as table content
384 | section.add_content("table", "\n".join(table_lines))
385 | continue
386 |
387 | # 2. Check for list items
388 | elif self._is_list_bullet_line(line):
389 | # Flush any text buffer first
390 | if text_buffer:
391 | section.add_content("text", "\n".join(text_buffer))
392 | text_buffer = []
393 |
394 | # Collect the list item and any continuation lines
395 | list_item = [line]
396 | i += 1
397 | while i < n:
398 | next_line = raw_lines[i]
399 | if (not next_line.strip() or
400 | self._is_list_bullet_line(next_line) or
401 | self._is_table_line(next_line) or
402 | self._is_header(next_line)):
403 | break
404 | # Add continuation line preserving its formatting
405 | list_item.append(next_line)
406 | i += 1
407 | # Add as list content, preserving line breaks
408 | section.add_content("list", "\n".join(list_item))
409 | continue
410 |
411 | # 3. Check for 'other' content (standalone refs, image placeholders, etc)
412 | elif self._detect_other_lines([line])[0].get('other'):
413 | # Flush any text buffer first
414 | if text_buffer:
415 | section.add_content("text", "\n".join(text_buffer))
416 | text_buffer = []
417 |
418 | section.add_content("other", line)
419 | i += 1
420 | continue
421 |
422 | # 4. Regular text content and empty lines - add to buffer
423 | else:
424 | # Add to text buffer (preserves empty lines and formatting)
425 | text_buffer.append(line)
426 | i += 1
427 |
428 | # Don't forget to add any remaining text in buffer
429 | if text_buffer:
430 | section.add_content("text", "\n".join(text_buffer))
431 |
432 | # Update section flags based on content
433 | section.has_table = any("table" in item for item in section.content)
434 | section.has_list = any("list" in item for item in section.content)
435 | section.has_text = any("text" in item for item in section.content)
436 | section.has_other = any("other" in item for item in section.content)
437 |
438 |
439 |
440 | def _format_academic_document(self, text: str, filename: str) -> List[Dict[str, Any]]:
441 | """
442 | Process a document and format it into structured data for output.
443 |
444 | Parameters:
445 | - text: The text content of the document
446 | - filename: The filename of the document
447 |
448 | Returns:
449 | A list of dictionaries with structured section data for Parquet output
450 | """
451 | lines = text.splitlines()
452 |
453 | # 1) Identify sections in the document based on markdown headers
454 | sections = self._process_sections(lines)
455 |
456 | # 2) Process section content - categorize each line appropriately
457 | self._process_section_content(sections)
458 |
459 | # 3) Format the data for output
460 | rows = []
461 | for section in sections:
462 | # Calculate section position as fraction of total document
463 | start_frac = section.start_line / max(1, len(lines))
464 | end_frac = section.end_line / max(1, len(lines))
465 | place_str = f"{start_frac:.2f}-{end_frac:.2f}"
466 |
467 | # Create a list of dictionaries for JSON serialization
468 | json_items = []
469 | for item in section.content:
470 | # Each item is a dict with a single key (content type) and value
471 | content_type = list(item.keys())[0]
472 | content_value = item[content_type]
473 |
474 | # Create proper dictionary object for JSON serialization
475 | json_items.append({content_type: content_value})
476 |
477 | # Use Python's json module for proper JSON serialization with all escaping handled
478 | json_content = json.dumps(json_items, ensure_ascii=False, indent=2)
479 |
480 | row = {
481 | "id": len(rows),
482 | "filename": filename,
483 | "has_table": section.has_table,
484 | "has_list": section.has_list,
485 | "has_other": section.has_other,
486 | "has_text": section.has_text,
487 | "header": section.title.strip(),
488 | "place": place_str,
489 | "section": section.raw_content, # Store the original unprocessed section text
490 | "json_section": json_content # Store the formatted JSON content
491 | }
492 | rows.append(row)
493 |
494 | return rows
495 |
496 | def to_parquet(self, input_dir, output_dir, filenames_to_process):
497 | """
498 | Process Markdown files from input_dir and write structured data to a Parquet file.
499 |
500 | Args:
501 | input_dir (str): Directory containing Markdown files to process
502 | output_dir (str): Directory where the output Parquet file will be written
503 | filenames_to_process (list): List of filenames (without extensions) to process.
504 | Only files matching these names will be processed.
505 | This should be a list of base filenames without extensions.
506 |
507 | The output Parquet file will contain structured data about sections from all documents,
508 | including information about tables, lists, footnotes, and regular text.
509 | """
510 | os.makedirs(output_dir, exist_ok=True)
511 |
512 | parquet_path = os.path.join(output_dir, "sections_for_annotation.parquet")
513 | schema = pa.schema([
514 | pa.field("id", pa.int64()),
515 | pa.field("row_id", pa.string()),
516 | pa.field("filename", pa.string()),
517 | pa.field("has_table", pa.bool_()),
518 | pa.field("has_list", pa.bool_()),
519 | pa.field("has_other", pa.bool_()),
520 | pa.field("has_text", pa.bool_()),
521 | pa.field("header", pa.string()),
522 | pa.field("place", pa.string()),
523 | pa.field("section", pa.string()), # Raw section text
524 | pa.field("json_section", pa.string()), # Formatted JSON content
525 | pa.field("section_length", pa.int64()), # Number of non-empty lines in section
526 | pa.field("section_propo", pa.int64()), # Proportion of document (0-1000)
527 | ])
528 |
529 | writer = pq.ParquetWriter(parquet_path, schema=schema)
530 | row_counter = 1 # global row id counter
531 |
532 | # Process each Markdown file individually to keep memory usage low.
533 | processed_files_count = 0
534 | skipped_files = []
535 | print(f"\n===== SECTIONING PHASE =====")
536 | print(f"Input directory: {input_dir}")
537 | print(f"Output directory: {output_dir}")
538 | print(f"Good files list (length {len(filenames_to_process)}): {filenames_to_process}")
539 | print(f"Available files in directory:")
540 | md_files = [f for f in os.listdir(input_dir) if f.endswith(".md")]
541 | for i, md_file in enumerate(md_files):
542 | print(f" {i+1}. {md_file} (basename: {os.path.splitext(md_file)[0]})")
543 |
544 | for filename in os.listdir(input_dir):
545 | if filename.endswith(".md"):
546 | # Get the base name without extension for filtering
547 | base_name = os.path.splitext(filename)[0]
548 |
549 | # Only process files that are in our whitelist
550 | if base_name not in filenames_to_process:
551 | skipped_files.append(base_name)
552 | print(f"⚠️ SKIPPED: {base_name} - not in the good files list")
553 | continue # Skip this file as it's not in our list of good files
554 |
555 | processed_files_count += 1
556 | print(f"✅ PROCESSING: {base_name} - in good files list")
557 | input_path = os.path.join(input_dir, filename)
558 | with open(input_path, 'r', encoding='utf-8') as f:
559 | text = f.read()
560 |
561 | short_name = os.path.splitext(filename)[0]
562 | doc_rows = self._format_academic_document(text, short_name)
563 |
564 | # Calculate section_length for each row (number of non-empty lines)
565 | for row in doc_rows:
566 | section_lines = row.get("section", "").splitlines()
567 | section_length = sum(1 for line in section_lines if line.strip())
568 | row['section_length'] = section_length
569 |
570 | # Calculate the total article length (sum of all section lengths)
571 | article_length = sum(row.get("section_length", 0) for row in doc_rows)
572 |
573 | # Calculate section_propo for each row (proportion * 1000, rounded)
574 | for row in doc_rows:
575 | if article_length > 0:
576 | section_propo = round((row.get("section_length", 0) / article_length) * 1000)
577 | else:
578 | section_propo = 0
579 | row['section_propo'] = section_propo
580 |
581 | # Add id and row_id to each row
582 | for row in doc_rows:
583 | row['id'] = row_counter
584 | row['row_id'] = f'row_{row_counter}'
585 | row_counter += 1
586 |
587 | if doc_rows:
588 | df = pd.DataFrame(doc_rows)
589 | table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
590 | writer.write_table(table)
591 |
592 | writer.close()
593 |
594 | # More informative logging
595 | print(f"\nSection processing summary:")
596 | print(f" - Good files list contained {len(filenames_to_process)} files: {filenames_to_process}")
597 | print(f" - Found {processed_files_count} markdown files matching good files list")
598 | if skipped_files:
599 | print(f" - Skipped {len(skipped_files)} files that weren't in good list: {skipped_files}")
600 | print(f" - Saved {row_counter - 1} total sections to {parquet_path}")
601 |
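    # Reading the emitted parquet back (illustrative; adjust the path to your output_dir):
    #
    #     import json
    #     import pandas as pd
    #     df = pd.read_parquet("<output_dir>/sections_for_annotation.parquet")
    #     first_section = json.loads(df.loc[0, "json_section"])  # list of {type: content} dicts
    #     df[["filename", "header", "place", "section_length", "section_propo"]].head()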
--------------------------------------------------------------------------------
/pipeline/src/glossapi/models/kmeans_weights.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/pipeline/src/glossapi/models/kmeans_weights.joblib
--------------------------------------------------------------------------------
/pipeline/src/glossapi/models/section_classifier.joblib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/pipeline/src/glossapi/models/section_classifier.joblib
--------------------------------------------------------------------------------
/pipeline/src/glossapi/parquet_schema.py:
--------------------------------------------------------------------------------
1 | """
2 | Standardized Parquet Schema definitions for GlossAPI pipeline.
3 |
4 | This module defines standard schemas for parquet files used throughout the GlossAPI
5 | pipeline, ensuring consistency between different pipeline stages.
6 | """
7 |
8 | import os
9 | import pandas as pd
10 | import pyarrow as pa
11 | import pyarrow.parquet as pq
12 | from pathlib import Path
13 | from typing import List, Dict, Any, Optional, Union, Tuple
14 |
15 |
16 | class ParquetSchema:
17 | """
18 | Defines standardized schema for parquet files in the GlossAPI pipeline.
19 |
20 | This class provides methods to validate, read, and write parquet files
21 | with consistent schemas for different pipeline stages.
22 |
23 | The pipeline uses two distinct types of parquet files:
24 |
25 | 1. Metadata Parquet:
26 | - Each row represents a file (one-to-one relationship with files)
27 | - Essential columns: filename, URL column (configurable), extraction quality
28 | - Used by: downloader, extractor, and filter stages
29 | - Example: download_results.parquet
30 | - Typical location: {output_dir}/download_results/
31 | - Schema: METADATA_SCHEMA, DOWNLOAD_SCHEMA
32 |
33 | 2. Sections Parquet:
34 | - Each row represents a section from a file (many-to-one relationship with files)
35 | - Essential columns: filename, title, content, section, predicted_section
36 | - Used by: section and annotation stages
37 | - Examples: sections_for_annotation.parquet, classified_sections.parquet
38 | - Typical location: {output_dir}/sections/
39 | - Schema: SECTION_SCHEMA, CLASSIFIED_SCHEMA
40 |
41 | When the pipeline runs, it first creates and populates a metadata parquet,
42 | then uses it to filter files, and finally creates section parquets from the
43 | filtered files.
44 | """
45 |
46 | def __init__(self, pipeline_config: Optional[Dict[str, Any]] = None):
47 | """
48 | Initialize the ParquetSchema with optional pipeline configuration.
49 |
50 | Args:
51 | pipeline_config: Configuration dictionary with settings such as
52 | url_column, which will be used throughout the pipeline
53 | """
54 |         # TODO: Add more robust configuration options for each parquet type, from the input metadata and downloader, to sections, and the two annotation phases.
55 | # TODO: Add support for consolidated sections parquet handling
56 | # TODO: Add methods to find the latest sections parquet in a pipeline
57 | self.config = pipeline_config or {}
58 | self.url_column = self.config.get('url_column', 'url')
59 |
60 | # Basic schema with common fields used across all parquet files
61 | COMMON_SCHEMA = pa.schema([
62 | ('id', pa.string()),
63 | ('row_id', pa.int64()),
64 | ('filename', pa.string()),
65 | ])
66 |
67 | # Metadata schema for files used by downloader and quality assessment
68 | METADATA_SCHEMA = pa.schema([
69 | ('filename', pa.string()),
70 | ('url', pa.string()), # Can be customized with url_column parameter
71 | ('download_success', pa.bool_()),
72 | ('download_error', pa.string()),
73 | ('extraction_quality', pa.string()), # Values: "good", "bad", "unknown"
74 | ('processing_stage', pa.string()), # Tracks progress through pipeline
75 | ])
76 |
77 | # Additional schemas for specific pipeline stages
78 | DOWNLOAD_SCHEMA = pa.schema([
79 | ('url', pa.string()), # Will be replaced with the actual url_column
80 | ('download_success', pa.bool_()),
81 | ('download_error', pa.string()),
82 | ('download_retry_count', pa.int32()),
83 | ('filename', pa.string()),
84 | ])
85 |
86 | SECTION_SCHEMA = pa.schema([
87 | ('id', pa.string()),
88 | ('row_id', pa.int64()),
89 | ('filename', pa.string()),
90 | ('title', pa.string()),
91 | ('content', pa.string()),
92 | ('section', pa.string()),
93 | ])
94 |
95 | CLASSIFIED_SCHEMA = pa.schema([
96 | ('id', pa.string()),
97 | ('row_id', pa.int64()),
98 | ('filename', pa.string()),
99 | ('title', pa.string()),
100 | ('content', pa.string()),
101 | ('section', pa.string()),
102 | ('predicted_section', pa.string()),
103 | ('probability', pa.float64()),
104 | ])
105 |
106 | def get_required_metadata(self) -> Dict[str, str]:
107 | """
108 | Get required metadata fields for GlossAPI parquet files.
109 |
110 | Returns:
111 | Dict[str, str]: Dictionary of required metadata fields and their descriptions
112 | """
113 | return {
114 | 'pipeline_version': 'GlossAPI pipeline version',
115 | 'created_at': 'ISO format timestamp when the file was created',
116 | 'source_file': 'Original source file that generated this parquet',
117 | 'processing_stage': 'Pipeline processing stage (download, extract, section, etc)'
118 | }
119 |
120 | def validate_schema(self, df: pd.DataFrame, schema_type: str = 'common') -> Tuple[bool, List[str]]:
121 | """
122 | Validate that a DataFrame conforms to the specified schema.
123 |
124 | Args:
125 | df: DataFrame to validate
126 | schema_type: Type of schema to validate against ('common', 'download', 'section', 'classified', 'metadata')
127 |
128 | Returns:
129 | Tuple[bool, List[str]]: (is_valid, missing_columns)
130 | """
131 | if schema_type.lower() == 'download':
132 | required_columns = [field.name for field in self.DOWNLOAD_SCHEMA]
133 | # Make sure to use the configured url_column
134 | if self.url_column != 'url' and 'url' in required_columns:
135 | required_columns.remove('url')
136 | required_columns.append(self.url_column)
137 | elif schema_type.lower() == 'section':
138 | required_columns = [field.name for field in self.SECTION_SCHEMA]
139 | elif schema_type.lower() == 'classified':
140 | required_columns = [field.name for field in self.CLASSIFIED_SCHEMA]
141 | elif schema_type.lower() == 'metadata':
142 | required_columns = ['filename']
143 | # Make sure to use the configured url_column
144 | required_columns.append(self.url_column)
145 | else: # Default to common schema
146 | required_columns = [field.name for field in self.COMMON_SCHEMA]
147 |
148 | # Check for missing columns
149 | missing_columns = [col for col in required_columns if col not in df.columns]
150 |
151 | return len(missing_columns) == 0, missing_columns
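    # Illustrative use of validate_schema (DataFrame and url column name are hypothetical):
    #
    #     schema = ParquetSchema({'url_column': 'pdf_url'})
    #     ok, missing = schema.validate_schema(df, schema_type='metadata')
    #     # 'metadata' only requires 'filename' plus the configured url column,
    #     # so ok is True whenever both columns are present in df.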
152 |
153 | def add_metadata(self, table: pa.Table, metadata: Dict[str, str]) -> pa.Table:
154 | """
155 | Add metadata to a PyArrow Table.
156 |
157 | Args:
158 | table: PyArrow Table to add metadata to
159 | metadata: Dictionary of metadata to add
160 |
161 | Returns:
162 | pa.Table: Table with added metadata
163 | """
164 | # Add pipeline configuration to metadata
165 | if self.config:
166 | for key, value in self.config.items():
167 | if key not in metadata:
168 | metadata[f'config_{key}'] = str(value)
169 | # Encode all metadata keys and values as bytes (parquet schema metadata must be bytes)
170 | metadata_bytes = {k.encode(): str(v).encode() for k, v in metadata.items()}
171 |
172 | # Add required metadata if missing
173 | required_metadata = self.get_required_metadata()
174 | for key in required_metadata:
175 | if key not in metadata:
176 | metadata_bytes[key.encode()] = f"MISSING: {required_metadata[key]}".encode()
177 |
178 | return table.replace_schema_metadata(metadata_bytes)
179 |
180 | def read_parquet(self, file_path: Union[str, Path], validate: bool = True, schema_type: str = 'common') -> pd.DataFrame:
181 | """
182 | Read a parquet file with validation.
183 |
184 | Args:
185 | file_path: Path to parquet file
186 | validate: Whether to validate the schema
187 | schema_type: Type of schema to validate against
188 |
189 | Returns:
190 | pd.DataFrame: DataFrame from parquet file
191 | """
192 | df = pd.read_parquet(file_path)
193 |
194 | if validate:
195 | is_valid, missing_columns = self.validate_schema(df, schema_type)
196 | if not is_valid:
197 | print(f"Warning: Parquet file {file_path} is missing required columns: {missing_columns}")
198 |
199 | # Add missing columns with default values
200 | for col in missing_columns:
201 | if col in ['id', 'filename', 'title', 'section', 'predicted_section', 'download_error']:
202 | df[col] = ''
203 | elif col in ['row_id', 'download_retry_count']:
204 | df[col] = 0
205 | elif col == 'download_success':
206 | df[col] = False
207 | elif col == 'probability':
208 | df[col] = 0.0
209 |
210 | return df
211 |
212 | def find_metadata_parquet(self, directory: Union[str, Path], require_url_column: bool = False) -> Optional[Path]:
213 | """
214 | Find the first valid metadata parquet file in a directory.
215 |
216 | Looks for parquet files that don't have section-specific columns
217 | (such as 'title', 'header', or 'section'), and prefers files that contain the url_column.
218 |
219 | Args:
220 | directory: Directory to search for parquet files
221 | require_url_column: If True, require the URL column to be present; if False, only require filename column
222 |
223 | Returns:
224 | Optional[Path]: Path to the first valid metadata parquet, or None if not found
225 | """
226 | import logging
227 | logger = logging.getLogger(__name__)
228 |
229 | directory = Path(directory)
230 | if not directory.exists():
231 | logger.debug(f"Directory {directory} does not exist")
232 | return None
233 |
234 | # Get all parquet files in the directory
235 | parquet_files = list(directory.glob('**/*.parquet'))
236 | if not parquet_files:
237 | logger.debug(f"No parquet files found in {directory}")
238 | return None
239 |
240 | # Check for download_results files first
241 | download_files = [f for f in parquet_files if 'download_results' in str(f)]
242 | if download_files:
243 | logger.debug(f"Found {len(download_files)} download_results files")
244 |
245 | # Examine all files
246 | for file_path in parquet_files:
247 | try:
248 | df = pd.read_parquet(file_path)
249 | columns = df.columns.tolist()
250 |
251 | # Skip section parquets - they have title/header columns
252 | if 'title' in columns or 'header' in columns or 'section' in columns:
253 | logger.debug(f"Skipping sections parquet: {file_path}")
254 | continue
255 |
256 | # For metadata parquets - they don't have title/header but have filename
257 | if 'filename' in columns:
258 | if require_url_column:
259 | # Check if required URL column exists
260 | if self.url_column in columns:
261 | logger.info(f"Found metadata parquet with filename and {self.url_column}: {file_path}")
262 | return file_path
263 | else:
264 | # Missing URL column
265 | logger.warning(f"Found parquet with filename column but no {self.url_column} column: {file_path}")
266 | logger.debug(f"Available columns: {columns}")
267 | else:
268 | # URL not required, filename is enough
269 | logger.info(f"Found metadata parquet with filename (URL not required): {file_path}")
270 | return file_path
271 | else:
272 | logger.debug(f"Found parquet without filename column: {file_path}")
273 | except Exception as e:
274 | logger.debug(f"Error reading parquet {file_path}: {e}")
275 | continue
276 |
277 | logger.warning(f"No suitable metadata parquet found in {directory}")
278 | return None
279 |
280 | def is_valid_metadata_parquet(self, filepath: Union[str, Path]) -> bool:
281 | """
282 | Check if a parquet file conforms to the metadata schema used by downloader.
283 |
284 | Args:
285 | filepath: Path to the parquet file to check
286 |
287 | Returns:
288 | bool: True if the file has the required metadata fields
289 | """
290 | try:
291 | schema = pq.read_schema(filepath)
292 | # Check for url_column (which might be custom) and filename
293 | required_fields = [self.url_column, 'filename']
294 | return all(field in schema.names for field in required_fields)
295 | except Exception:
296 | return False
297 |
298 | def create_basic_metadata_parquet(self, markdown_dir: Union[str, Path], output_dir: Union[str, Path]) -> Union[Path, None]:
299 | """
300 | Create a simple metadata parquet file from a directory of markdown files.
301 | This is used when there is no existing parquet file to update.
302 |
303 | Args:
304 | markdown_dir: Directory containing markdown files
305 | output_dir: Directory where to create the parquet file
306 |
307 | Returns:
308 | Path: Path to the created parquet file, or None if creation failed
309 | """
310 | try:
311 | markdown_dir = Path(markdown_dir)
312 | output_dir = Path(output_dir)
313 |
314 | # Create output directory if it doesn't exist
315 | download_results_dir = output_dir / "download_results"
316 | os.makedirs(download_results_dir, exist_ok=True)
317 |
318 | # Get all markdown files in the input directory
319 | markdown_files = list(markdown_dir.glob("*.md"))
320 | if not markdown_files:
321 | print(f"No markdown files found in {markdown_dir}")
322 | return None
323 |
324 | # Create a DataFrame with just filenames
325 | data = []
326 | for md_file in markdown_files:
327 | entry = {
328 | 'filename': md_file.name,
329 | self.url_column: "" # Minimal URL placeholder
330 | }
331 | data.append(entry)
332 |
333 | # Create DataFrame
334 | df = pd.DataFrame(data)
335 |
336 | # Set output path for the parquet file
337 | output_path = download_results_dir / "download_results.parquet"
338 |
339 | # Write to parquet without adding complex metadata
340 | pq.write_table(pa.Table.from_pandas(df), output_path)
341 |
342 | print(f"Created new metadata parquet file at {output_path}")
343 | return output_path
344 |
345 | except Exception as e:
346 | print(f"Error creating metadata parquet file: {e}")
347 | return None
348 |
349 | def is_download_result_parquet(self, filepath: Union[str, Path]) -> bool:
350 | """
351 | Check if a parquet file contains download results with success/error information.
352 |
353 | Args:
354 | filepath: Path to the parquet file to check
355 |
356 | Returns:
357 | bool: True if the file has download result fields
358 | """
359 | try:
360 | schema = pq.read_schema(filepath)
361 | # Check for download result fields
362 | required_fields = ['download_success', 'filename']
363 | return all(field in schema.names for field in required_fields)
364 | except Exception:
365 | return False
366 |
367 | def is_sections_parquet(self, filepath: Union[str, Path]) -> bool:
368 | """
369 | Check if a parquet file contains section data from extracted files.
370 | This identifies the second type of parquet in the pipeline - the sections parquet.
371 |
372 | Args:
373 | filepath: Path to the parquet file to check
374 |
375 | Returns:
376 | bool: True if the file has section data fields
377 | """
378 | try:
379 | schema = pq.read_schema(filepath)
380 | # Check for required section fields
381 | required_fields = ['filename', 'title', 'content', 'section']
382 | return all(field in schema.names for field in required_fields)
383 | except Exception:
384 | return False
385 |
386 | def add_processing_stage(self, df: pd.DataFrame, stage: str) -> pd.DataFrame:
387 | """
388 | Add or update processing stage column in a DataFrame.
389 |
390 | Args:
391 | df: Input DataFrame to update
392 | stage: Processing stage value to set (e.g., 'downloaded', 'extracted', 'classified')
393 |
394 | Returns:
395 | pd.DataFrame: Updated DataFrame with processing_stage column
396 | """
397 | df['processing_stage'] = stage
398 | return df
399 |
400 | def verify_required_columns(self, df: pd.DataFrame, required_columns: List[str]) -> Tuple[bool, List[str]]:
401 | """
402 | Check if a DataFrame contains all required columns and return missing ones.
403 |
404 | Args:
405 | df: DataFrame to check
406 | required_columns: List of column names that should be present
407 |
408 | Returns:
409 | Tuple containing:
410 | - bool: True if all required columns are present
411 | - List[str]: List of missing columns (empty if all present)
412 | """
413 | missing_columns = [col for col in required_columns if col not in df.columns]
414 | return (len(missing_columns) == 0, missing_columns)
415 |
416 | def write_parquet(
417 | self,
418 | df: pd.DataFrame,
419 | file_path: Union[str, Path],
420 | metadata: Optional[Dict[str, str]] = None,
421 | schema_type: str = 'common',
422 | validate: bool = True
423 | ) -> None:
424 | """
425 | Write a DataFrame to parquet with standard schema and metadata.
426 |
427 | Args:
428 | df: DataFrame to write
429 | file_path: Path to write parquet file
430 | metadata: Dictionary of metadata to include
431 | schema_type: Type of schema to use
432 | validate: Whether to validate the schema before writing
433 | """
434 | # Create a copy to avoid modifying the original
435 | df_copy = df.copy()
436 |
437 | # Validate and fix schema if needed
438 | if validate:
439 | is_valid, missing_columns = self.validate_schema(df_copy, schema_type)
440 | if not is_valid:
441 | print(f"Adding missing columns to DataFrame: {missing_columns}")
442 |
443 | # Add missing columns with default values
444 | for col in missing_columns:
445 | if col in ['id', 'filename', 'title', 'section', 'predicted_section', 'download_error']:
446 | df_copy[col] = ''
447 | elif col in ['row_id', 'download_retry_count']:
448 | df_copy[col] = 0
449 | elif col == 'download_success':
450 | df_copy[col] = False
451 | elif col == 'probability':
452 | df_copy[col] = 0.0
453 |
454 | # Convert to PyArrow Table
455 | table = pa.Table.from_pandas(df_copy)
456 |
457 | # Add metadata if provided
458 | if metadata:
459 | table = self.add_metadata(table, metadata)
460 |
461 | # Write to parquet
462 | pq.write_table(table, file_path)
463 | print(f"Parquet file written to {file_path} with schema type '{schema_type}'")
464 |
--------------------------------------------------------------------------------
/pipeline/src/glossapi/sampler.py:
--------------------------------------------------------------------------------
1 | """
2 | Sampler module for extracting samples from processed corpus data.
3 |
4 | This module provides functionality for sampling documents from processed
5 | parquet files, with options for filtering by column values and splitting
6 | into parts for cross-validation.
7 | """
8 |
9 | import logging
10 | import os
11 | import pandas as pd
12 | import random
13 | from pathlib import Path
14 | from typing import Dict, Optional, Union, List, Any, Tuple
15 |
16 | class Sampler:
17 | """
18 | A class for sampling documents from parquet files with flexible filtering options.
19 |
20 | This class allows sampling unique filenames based on specific criteria and
21 | extracting all their rows for analysis or further processing.
22 |
23 | Example:
24 | sampler = Sampler("/path/to/processed_data")
25 |
26 | # Sample 200 files where 'document_type' is 'Κεφάλαιο'
27 | sample_df = sampler.sample(sample_from={'document_type': 'Κεφάλαιο'}, n=200)
28 |
29 | # Sample 200 files from everything except where 'document_type' is 'Κεφάλαιο'
30 | sample_df = sampler.sample(sample_from_all_except={'document_type': 'Κεφάλαιο'}, n=200)
31 |
32 | # Sample and split into 2 equal parts for cross-validation
33 | sample_df = sampler.sample(n=200, parts=2)
34 | """
35 |
36 | def __init__(
37 | self,
38 | base_dir: Union[str, Path],
39 | parquet_file: Optional[Union[str, Path]] = None,
40 | project_dir: Optional[Union[str, Path]] = None,
41 | log_level: int = logging.INFO
42 | ):
43 | """
44 | Initialize the Sampler.
45 |
46 | Args:
47 | base_dir: Base directory where processed data is stored
48 | parquet_file: Optional specific parquet file to sample from
49 | (default: fully_annotated_sections.parquet in base_dir)
50 | project_dir: Optional project directory for text outputs
51 | (default: v2 directory in parent of base_dir)
52 | log_level: Logging level (default: INFO)
53 | """
54 | self.base_dir = Path(base_dir)
55 |
56 | # Set up logging
57 | self.logger = logging.getLogger(__name__)
58 | self.logger.setLevel(log_level)
59 |
60 | if not self.logger.handlers:
61 | handler = logging.StreamHandler()
62 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
63 | handler.setFormatter(formatter)
64 | self.logger.addHandler(handler)
65 |
66 | # Set the default parquet file if not specified
67 | if parquet_file is None:
68 | self.parquet_file = self.base_dir / "fully_annotated_sections.parquet"
69 | else:
70 | self.parquet_file = Path(parquet_file)
71 |
72 | # Set up datasets directory in the base directory
73 | self.datasets_dir = self.base_dir / "datasets"
74 | os.makedirs(self.datasets_dir, exist_ok=True)
75 |
76 | # Set up project directory for text outputs
77 | if project_dir is None:
78 | try:
79 | # Try to find 'v2' directory in parent of base_dir
80 | parent_dir = self.base_dir.parent
81 | if (parent_dir / "v2").exists():
82 | self.project_dir = parent_dir / "v2"
83 | else:
84 | # Fall back to base_dir if v2 not found
85 | self.project_dir = self.base_dir
86 | except Exception:
87 | self.project_dir = self.base_dir
88 | else:
89 | self.project_dir = Path(project_dir)
90 |
91 | # Set up text samples directory in the project directory
92 | self.text_dir = self.project_dir / "text_samples"
93 | os.makedirs(self.text_dir, exist_ok=True)
94 |
95 | def sample(
96 | self,
97 | n: int = 100,
98 | parts: int = 1,
99 | output_csv: Optional[Union[str, Path]] = None,
100 | sample_from: Optional[Dict[str, Any]] = None,
101 | sample_from_all_except: Optional[Dict[str, Any]] = None,
102 | output_name: Optional[str] = None
103 | ) -> Union[pd.DataFrame, List[pd.DataFrame]]:
104 | """
105 | Sample a specified number of unique filenames and extract all their rows.
106 |
107 | Args:
108 | n: Number of unique filenames to sample
109 | parts: Number of even parts to split the sample into (default: 1)
110 | output_csv: Optional path to save the sampled data as CSV
111 | If not specified, will use output_name with default location
112 | sample_from: Optional dictionary {column: value} to sample only from rows
113 | where column has the specified value
114 | sample_from_all_except: Optional dictionary {column: value} to sample only
115 | from rows where column does NOT have the specified value
116 | output_name: Base name for output files (without extension)
117 | If not specified, will generate based on sampling criteria
118 |
119 | Returns:
120 | If parts=1: DataFrame containing all rows for the sampled filenames
121 | If parts>1: List of DataFrames, each containing rows for a part of the sampled filenames
122 |
123 | Raises:
124 | ValueError: If the specified column or label doesn't exist in the data
125 | """
126 | if not self.parquet_file.exists():
127 | self.logger.error(f"Parquet file not found: {self.parquet_file}")
128 | return pd.DataFrame()
129 |
130 | self.logger.info(f"Reading data from {self.parquet_file}...")
131 |
132 | # Read the parquet file
133 | df = pd.read_parquet(self.parquet_file, engine='fastparquet')
134 |
135 | # Check if filtering criteria are valid
136 | if sample_from:
137 | for column, value in sample_from.items():
138 | if column not in df.columns:
139 | raise ValueError(f"Column '{column}' not found in the parquet file")
140 | if value not in df[column].values:
141 | if not value.startswith('regex'):
142 | raise ValueError(f"Value '{value}' not found in column '{column}'")
143 |
144 | if sample_from_all_except:
145 | for column, value in sample_from_all_except.items():
146 | if column not in df.columns:
147 | raise ValueError(f"Column '{column}' not found in the parquet file")
148 | if value not in df[column].values:
149 | if not value.startswith('regex'):
150 | raise ValueError(f"Value '{value}' not found in column '{column}'")
151 |
152 | # Apply filters to the DataFrame
153 | filtered_df = df.copy()
154 |
155 | # Generate default output name if not provided
156 | if output_name is None:
157 | if sample_from and len(sample_from) == 1:
158 | col, val = next(iter(sample_from.items()))
159 | output_name = f"{val.lower().replace(' ', '_')}_samples"
160 | elif sample_from_all_except and len(sample_from_all_except) == 1:
161 | col, val = next(iter(sample_from_all_except.items()))
162 | output_name = f"non_{val.lower().replace(' ', '_')}_samples"
163 | else:
164 | output_name = "samples"
165 |
166 | # Apply filters to the DataFrame
167 | if sample_from:
168 | for column, value in sample_from.items():
169 | if value.startswith('regex(') and value.endswith(')'):
170 | mask = filtered_df[column].str.contains(value[6:-1])
171 | filtered_df = filtered_df[mask]
172 | else:
173 | filtered_df = filtered_df[filtered_df[column] == value]
174 | self.logger.info(f"Filtered to rows where {column} = '{value}' ({len(filtered_df)} rows)")
175 |
176 | if sample_from_all_except:
177 | for column, value in sample_from_all_except.items():
178 | if value.startswith('regex(') and value.endswith(')'):
179 | mask = filtered_df[column].str.contains(value[6:-1])
180 | filtered_df = filtered_df[~mask]
181 | else:
182 | filtered_df = filtered_df[filtered_df[column] != value]
183 | self.logger.info(f"Filtered to rows where {column} != '{value}' ({len(filtered_df)} rows)")
184 |
185 | # Get unique filenames from the filtered data
186 | unique_filenames = filtered_df['filename'].unique()
187 | total_unique = len(unique_filenames)
188 |
189 | if total_unique == 0:
190 | self.logger.error("No matching filenames found after applying filters")
191 | return pd.DataFrame()
192 |
193 | self.logger.info(f"Found {total_unique} unique filenames after filtering")
194 |
195 | if total_unique <= n:
196 | self.logger.warning(f"Requested sample size ({n}) is greater than or equal to the number of unique filenames ({total_unique}). Using all available filenames.")
197 | sampled_filenames = unique_filenames
198 | else:
199 | # Randomly sample unique filenames
200 | sampled_filenames = random.sample(list(unique_filenames), n)
201 |
202 | # Extract all rows for the sampled filenames
203 | sampled_df = df[df['filename'].isin(sampled_filenames)]
204 |
205 | self.logger.info(f"Sampled {len(sampled_filenames)} unique filenames with {len(sampled_df)} total rows")
206 |
207 | # Set up default output CSV path if not provided
208 | if output_csv is None and parts == 1:
209 | output_csv = self.datasets_dir / f"{output_name}.csv"
210 |
211 | # Save to CSV if output path is provided
212 | if output_csv and parts == 1:
213 | output_path = Path(output_csv)
214 | os.makedirs(output_path.parent, exist_ok=True)
215 | sampled_df.to_csv(output_path, index=False)
216 | self.logger.info(f"Saved sampled data to {output_path}")
217 |
218 | # Split into parts if requested
219 | if parts > 1:
220 | self.logger.info(f"Splitting sample into {parts} equal parts")
221 |
222 | # Split the sampled filenames into equal parts
223 | random.shuffle(sampled_filenames)
224 | filename_parts = [sampled_filenames[i::parts] for i in range(parts)]
225 |
226 | # Create a DataFrame for each part
227 | result_parts = []
228 | for i, filenames in enumerate(filename_parts):
229 | part_df = df[df['filename'].isin(filenames)]
230 | result_parts.append(part_df)
231 | self.logger.info(f"Part {i+1}: {len(filenames)} filenames, {len(part_df)} rows")
232 |
233 | # Set up default output CSV path for each part if not provided
234 | if output_csv is None:
235 | part_output = self.datasets_dir / f"{output_name}_{i+1}.csv"
236 | else:
237 | # If output_csv is provided, create part-specific paths
238 | output_stem = Path(output_csv).stem
239 | output_suffix = Path(output_csv).suffix
240 | output_dir = Path(output_csv).parent
241 | part_output = output_dir / f"{output_stem}_{i+1}{output_suffix}"
242 |
243 | # Save each part
244 | os.makedirs(Path(part_output).parent, exist_ok=True)
245 | part_df.to_csv(part_output, index=False)
246 | self.logger.info(f"Saved part {i+1} to {part_output}")
247 |
248 | return result_parts
249 |
250 | return sampled_df
251 |
252 | def to_text(
253 | self,
254 | input_data: Union[str, Path, pd.DataFrame],
255 | output_dir: Optional[Union[str, Path]] = None,
256 | folder_name: Optional[str] = None
257 | ) -> None:
258 | """
259 | Convert parquet or CSV data to formatted text files.
260 |
261 | Args:
262 | input_data: Path to parquet/CSV file or DataFrame containing the data
263 | output_dir: Directory to save the output text files
264 | If None, creates a directory in text_samples based on folder_name
265 | folder_name: Name for the output directory if output_dir is None
266 | If None, uses a default name based on timestamp
267 | """
268 | # Set up output directory
269 | if output_dir is None:
270 | if folder_name is None:
271 | # Generate a timestamp-based name if no folder name provided
272 | if isinstance(input_data, pd.DataFrame):
273 | # Try to infer a good name from the DataFrame if available
274 | if 'document_type' in input_data.columns and len(input_data['document_type'].unique()) == 1:
275 | folder_name = f"{input_data['document_type'].iloc[0].lower().replace(' ', '_')}_samples"
276 | else:
277 | folder_name = f"samples_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
278 | else:
279 | # Use the input filename if it's a file
280 | if isinstance(input_data, (str, Path)):
281 | folder_name = Path(input_data).stem
282 | else:
283 | folder_name = f"samples_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"
284 |
285 | output_dir = self.text_dir / folder_name
286 |
287 | self.logger.info(f"Converting data to formatted text files in {output_dir}...")
288 |
289 | # Create output directory
290 | output_dir = Path(output_dir)
291 | os.makedirs(output_dir, exist_ok=True)
292 |
293 | # Load data if input is a file path
294 | if isinstance(input_data, (str, Path)):
295 | input_path = Path(input_data)
296 | if input_path.suffix.lower() == '.csv':
297 | df = pd.read_csv(input_path)
298 | elif input_path.suffix.lower() == '.parquet':
299 | df = pd.read_parquet(input_path, engine='fastparquet')
300 | else:
301 | self.logger.error(f"Unsupported file format: {input_path.suffix}")
302 | return
303 | else:
304 | # Assume input_data is a DataFrame
305 | df = input_data
306 |
307 | # Group by filename and sort by id
308 | self.logger.info("Grouping data by filename...")
309 | grouped = df.groupby('filename')
310 |
311 | # Process each unique filename
312 | for filename, group in grouped:
313 | # Sort by id to maintain the correct order of sections
314 | if 'id' in group.columns:
315 | group = group.sort_values('id')
316 |
317 | # Create output file path
318 | output_file_path = output_dir / f"{filename}.txt"
319 |
320 | # Write formatted content
321 | with open(output_file_path, 'w', encoding='utf-8') as f:
322 | # Write filename at the top
323 | f.write(f"# Document: {filename}\n\n")
324 |
325 | for _, row in group.iterrows():
326 | # Write row with formatting
327 | section_type = row.get('predicted_section', '')
328 | row_id = row.get('row_id', '')
329 | header = row.get('header', '')
330 | section = row.get('section', '')
331 |
332 | f.write(f"{{{row_id}, {section_type}}} {header}\n\n")
333 | f.write(f"{section}\n\n")
334 |
335 | self.logger.info(f"Processed file: {output_file_path}")
336 |
337 | self.logger.info(f"Conversion complete. Text files saved to {output_dir}")
338 |
--------------------------------------------------------------------------------
/refactoring_plan.md:
--------------------------------------------------------------------------------
1 | # GlossAPI Refactoring Plan
2 |
3 | ## Overview
4 | This document outlines the planned changes to the GlossAPI section classification pipeline, focusing on simplifying the section processing logic and changing the output structure.
5 |
6 | ## Key Changes
7 |
8 | ### 1. Simplification of Section Processing
9 | - Rename `_process_academic_text_with_positions` to `_process_sections` in all places in the code
10 | - Replace hierarchical section processing with flat processing (see the sketch after this list):
11 | - Find text between two headers and define it as a section
12 | - Use the header above as the section's header
13 | - Process all markdown headers flatly instead of maintaining a hierarchical structure
14 | - **Important**: Maintain the existing functionality that protects lists and tables from cleaning and reformatting by detecting them and processing them differently
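
A minimal sketch of the flat splitting described above (illustrative only; `split_flat_sections` is a placeholder name rather than the actual `gloss_section.py` API, and the list/table protection mentioned above is intentionally left out):

```python
import re
from typing import Dict, List

HEADER_RE = re.compile(r'^(#{1,6})\s+(.*)$')

def split_flat_sections(markdown_text: str) -> List[Dict[str, str]]:
    """Treat every markdown header as a section boundary, ignoring header level."""
    sections = []
    current = {"header": "", "lines": []}

    def close(block):
        # Emit the accumulated block if it has a header or any non-empty content.
        if block["header"] or any(line.strip() for line in block["lines"]):
            sections.append({
                "header": block["header"],
                "content": "\n".join(block["lines"]).strip(),
            })

    for line in markdown_text.splitlines():
        match = HEADER_RE.match(line)
        if match:
            # A header closes the previous section and becomes the next section's header.
            close(current)
            current = {"header": match.group(2).strip(), "lines": []}
        else:
            current["lines"].append(line)
    close(current)
    return sections
```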
15 |
16 | ### 2. Changes to Output Schema
17 | - Remove the following columns from to_parquet:
18 | - `label` (string)
19 | - `section_propo` (int64)
20 | - `section_length` (int64)
21 | - Remove all related functionality for calculating `section_propo` and `section_length`
22 |
23 | ### 3. Section Content Structure Changes
24 | - Modify the logic in both `academic_section.py` and `gloss_section.py`
25 | - Return sections as JSON objects that contain, in the order they appear in the text, entries with the following keys (see the example after this list):
26 | - "text" - for regular text content
27 | - "table" - for table content
28 | - "list" - for list content
29 | - "footnote" - for footnote content
30 | - Instead of deleting footnotes, annotate them appropriately
31 | - **Keep** the existing flags (`has_table`, `has_list`) in the output schema
32 | - **Add** new flags `has_footnote` and `has_text` to indicate presence of those content types
33 | - Implement detection logic to identify if a section contains non-empty lines that don't belong to tables, lists, or footnotes (for the `has_text` flag)
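
One possible shape for a single section record under this scheme (a sketch only; the values are made up, and only the entry keys and flags come from the plan above):

```python
section_record = {
    "header": "Example section header",
    # Ordered entries, one per content block, tagged by type.
    "section": [
        {"text": "Plain prose belonging to the section."},
        {"table": "| A | B |\n|---|---|\n| 1 | 2 |"},
        {"list": "- first item\n- second item"},
        {"footnote": "[1] Footnotes are kept and annotated instead of being deleted."},
    ],
    # Existing flags that are kept, plus the new ones.
    "has_table": True,
    "has_list": True,
    "has_footnote": True,
    "has_text": True,
}

def compute_has_text(entries) -> bool:
    """A section 'has text' if any entry is a non-empty plain-text block."""
    return any("text" in entry and entry["text"].strip() for entry in entries)
```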
34 |
35 | ### 4. Implementation Plan
36 | 1. First, create new versions of the modules with the updated functionality
37 | 2. Ensure all dependencies and references are updated
38 | 3. Make sure the section processing works with these simplified changes
39 | 4. Test the pipeline with sample documents
40 |
41 | ## Files to be Changed
42 | - `/mnt/data/glossAPI/pipeline/src/glossapi/gloss_section.py`
43 | - `/mnt/data/glossAPI/pipeline/src/glossapi/academic_section.py`
44 | - Any other files that reference the renamed functions or changed outputs
45 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # GlossAPI Requirements
2 | # Automatically generated based on package imports
3 |
4 | # Core dependencies
5 | pandas>=1.3.0
6 | numpy>=1.20.0
7 | scikit-learn>=1.0.0
8 | joblib>=1.0.0
9 | dask>=2022.1.0
10 | pyarrow>=7.0.0
11 |
12 | # Document processing
13 | docling>=1.0.0
14 |
15 | # Python standard libraries (included for reference)
16 | # logging
17 | # os
18 | # pathlib
19 | # typing
20 | # re
21 | # random
22 | # shutil
23 |
--------------------------------------------------------------------------------
/scraping/download_and_extract_scripts/__pycache__/downloader_app.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/scraping/download_and_extract_scripts/__pycache__/downloader_app.cpython-310.pyc
--------------------------------------------------------------------------------
/scraping/download_and_extract_scripts/__pycache__/extractor_app.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/scraping/download_and_extract_scripts/__pycache__/extractor_app.cpython-310.pyc
--------------------------------------------------------------------------------
/scraping/download_and_extract_scripts/downloader.py:
--------------------------------------------------------------------------------
1 | import aiohttp
2 | import asyncio
3 | import os
4 | import argparse
5 | from urllib.parse import urlparse
6 | import random
7 | import aiofiles
8 | import logging
9 | import json
10 | import time
11 |
12 |
13 | #Configure logging for behavior tracking and errors
14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15 |
16 | #Function that returns the highest index among already-downloaded papers, so a run can continue where it left off
17 | def get_indexes(papers):
18 | if papers:
19 | nums = []
20 | for p in papers:
21 | num = p.split("_")[-1]
22 | nums.append(int(num))
23 | return sorted(nums)[-1:]
24 | return []
25 |
26 | #Function that downloads PDFs concurrently, with optional retries
27 | async def download_pdfs(metadata_dict, semaphore, visited, indexes, args, progress_report, retry=1):
28 |
29 | #Prepares tasks for download_pdf function and stores association of "paper_name.pdf" with original metadata.
30 |
31 | retry -= 1
32 | retries = {} #Dictionary holding files for download retrial
33 | tasks = [] #List to hold the tasks to be executed
34 | ordered_metadata = list(metadata_dict.items())
35 | user_agent_gen = user_agent_generator()
36 | i = 0
37 | reached_end_of_file = True #flag: if all metadata are in "visited"
38 |
39 | #Process metadata urls and schedule downloads
40 | for metadata, url in ordered_metadata:
41 | if i < args.batch and metadata not in visited:
42 | reached_end_of_file = False
43 | if indexes:
44 | index = indexes[-1] + 1
45 | else:
46 | index = 1
47 | indexes.append(index)
48 | task = asyncio.create_task(
49 | download_pdf(index, metadata, url, semaphore, args, next(user_agent_gen))
50 | )
51 | tasks.append(task)
52 | i += 1
53 | results = await asyncio.gather(*tasks)
54 | for r in results:
55 | if r:
56 | has_downloaded_file, metadata, pdf_file_name = r
57 | if has_downloaded_file:
58 | progress_report[pdf_file_name[:-4]] = metadata
59 | else:
60 | logging.warning(f"Failed to download file for metadata: {metadata}")
61 | if retry > 0:
62 | retries[metadata] = metadata_dict[metadata]  # keep the {metadata: url} shape expected by download_pdfs
63 | if retries and retry > 0:
64 | logging.info(f"Retrying download for {len(retries)} files")
65 | await download_pdfs(retries, semaphore, visited, indexes, args, progress_report, retry-1)
66 | if i < args.batch: reached_end_of_file = True
67 | return reached_end_of_file
68 |
69 | #Function to extract base URL from a given full URL
70 | async def get_base_url(url):
71 | if not url.startswith("http"):
72 | url = f"http://{url}"
73 | parsed_url = urlparse(url)
74 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
75 | return base_url
76 |
77 | #Function for the initialization of session headers
78 | async def setup_session(session, url, headers):
79 | """ Initialize the session with base headers. """
80 | base_url = await get_base_url(url)
81 | initial_url = f"{base_url}"
82 | async with session.get(initial_url, headers=headers) as response:
83 | await response.text()
84 | return headers
85 |
86 | #Function that downloads a single PDF from pdf_url, then returns download status, metadata and filename as a tuple.
87 | async def download_pdf(index, metadata, pdf_url, semaphore, args, user_agent, referer=None):
88 |
89 | if not referer:
90 | base_url = await get_base_url(pdf_url)
91 | else:
92 | base_url = referer
93 | headers = {
94 | 'User-Agent': user_agent,
95 | 'Referer': base_url
96 | }
97 | if not pdf_url.startswith("http"):
98 | pdf_url = f"http://{pdf_url}"
99 | sleep_time, file_type, request_type = args.sleep, args.type, args.req
100 | async with semaphore:
101 | timeout = aiohttp.ClientTimeout(total=60)
102 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False), timeout=timeout) as session:
103 | # Randomized sleep time between args.sleep and args.sleep + 2 (better for passing bot detection)
104 | await asyncio.sleep(random.uniform(sleep_time, sleep_time + 2))
105 |
106 | file_name = f'paper_{index}.{file_type}' # Names file by order of appearance
107 | try:
108 | await setup_session(session, pdf_url, headers)
109 | requester = getattr(session, request_type) # sets session type as either session.get or session.post
110 | async with requester(pdf_url, headers=headers, allow_redirects=False) as response:
111 | if response.status in (301, 302):
112 | logging.error(f"Redirected: {pdf_url} to {response.headers['Location']}. Status code: {response.status}")
113 | return (False, metadata, file_name)
114 | elif response.status == 200:
115 | content = await response.read()
116 | output_path = args.output if args.output else "./"  # guard against output_path being undefined
117 | await write_file(file_name, content, output_path)
118 | logging.info(f"Downloaded {file_name}")
119 | return (True, metadata, file_name)
120 | else:
121 | logging.error(f"Failed to download {pdf_url}. Status code: {response.status}")
122 | except aiohttp.ClientError as e:
123 | logging.error(f"ClientError while downloading {pdf_url}: {e}")
124 | except aiohttp.http_exceptions.HttpProcessingError as e:
125 | logging.error(f"HTTP processing error while downloading {pdf_url}: {e}")
126 | except asyncio.TimeoutError:
127 | logging.error(f"Timeout error while downloading {pdf_url}")
128 | except Exception as e:
129 | logging.error(f"Unexpected error while downloading {pdf_url}: {e}")
130 | return (False, metadata, file_name)
131 |
132 | #Function that writes downloaded content to a file
133 | async def write_file(filename, content, output_path = "./"):
134 | path_to_file = os.path.join(output_path, filename)
135 | async with aiofiles.open(path_to_file, 'wb') as file:
136 | await file.write(content)
137 |
138 | #Function to generate random user-agents for avoiding bot detection
139 | #TODO: add a proxy rotation option
140 | def user_agent_generator():
141 |
142 | templates = [
143 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36",
144 | "Mozilla/5.0 ({os}) Gecko/20100101 {browser}/{version}",
145 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{version} Safari/537.36"
146 | ]
147 | operating_systems = [
148 | "Windows NT 10.0; Win64; x64",
149 | "Macintosh; Intel Mac OS X 10_15_7",
150 | "X11; Linux x86_64",
151 | "Windows NT 6.1; Win64; x64",
152 | "Android 9; Mobile; rv:40.0"
153 | ]
154 | browsers = [
155 | ("Chrome", random.randint(70, 90)),
156 | ("Firefox", random.randint(50, 80)),
157 | ("Edge", random.randint(80, 90))
158 | ]
159 | while True:
160 | template = random.choice(templates)
161 | os_string = random.choice(operating_systems)  # renamed so the 'os' module is not shadowed
162 | browser, version = random.choice(browsers)
163 | full_version = f"{version}.0.{random.randint(1000, 9999)}"
164 | user_agent = template.format(os=os_string, browser=browser, version=full_version)
165 | yield user_agent
166 |
167 | #Function for overall program execution
168 | async def run(args):
169 | current_working_directory = os.getcwd()
170 | path_to_url_siteguide = os.path.join(current_working_directory, args.json)
171 | with open(path_to_url_siteguide, 'r') as file:
172 | metadata_dict = json.load(file)
173 |
174 | semaphore = asyncio.Semaphore(3) #if you get flagged by bot detection try adjusting value
175 | try:
176 | try:
177 | with open('progress_report.json', 'r') as file:
178 | progress_report = json.load(file)
179 | logging.info("Existing progress report found and loaded")
180 | indexes = get_indexes(list(progress_report.keys()))
181 | except FileNotFoundError:
182 | progress_report = {}
183 | indexes = []
184 | logging.info("No existing progress report found")
185 | visited = list(progress_report.values())
186 | # Download PDFs and update progress report
187 | logging.info(f"Starting download from {args.filename}")
188 | finished = await download_pdfs(metadata_dict, semaphore, visited, indexes, args, progress_report)
189 | logging.info(f"Finished download from {args.filename}")
190 |
191 | except Exception as e:
192 | logging.error(f"An error occurred: {e}")
193 | raise
194 | finally:
195 | if finished:
196 | logging.info("All available have been downloaded - Finished!")
197 | # still write to progress_report.json in case it finished because of i < args.batch
198 | with open('progress_report.json', 'w') as file:
199 | json.dump(progress_report, file, ensure_ascii=False, indent=4)
200 | return True
201 | else:
202 | logging.info("PDF downloads completed")
203 | with open('progress_report.json', 'w') as file:
204 | json.dump(progress_report, file, ensure_ascii=False, indent=4)
205 | logging.info("Progress report written to progress_report.json")
206 | return False
207 |
208 | #Function for handling command-line arguments
209 | def parse_input():
210 | parser = argparse.ArgumentParser(description="Gets PDFs through URLs given as value entries in a JSON.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
211 | parser.add_argument("--json", help="Add path to JSON file with URLs siteguide", required=True)
212 | parser.add_argument("--sleep", type=int, default=1, help="Set delay before new request is made (in seconds)")
213 | parser.add_argument("--type", help="Select file type to be downloaded e.g., 'pdf', 'doc'", required=True)
214 | parser.add_argument("--req", choices=['get', 'post'], default='get', help="Set request type 'get' or 'post'")
215 | parser.add_argument("-o", "--output", default="./", help="Set download directory")
216 | parser.add_argument("--little_potato", help="Set directory for progress_report.json (previously little_potato), default value is set to --output")
217 | parser.add_argument("--batch", type=int, default=10, help="Set number of files to download per run")
218 | args = parser.parse_args()
219 |
220 | if not args.little_potato:
221 | args.little_potato = args.output
222 | logging.info(f"Arguments received: JSON file: {args.json}, Sleep time: {args.sleep}, File type: {args.type}, Request type: {args.req}, Output path: {args.output}, 'progress_report.json' path: {args.little_potato}")
223 | return args
224 |
225 | #The main function to parse input arguments, load URL metadata from a JSON file, manage download progress with semaphores for concurrency, and save the download progress to a JSON report file
226 | async def main():
227 | args = parse_input()
228 | with open(args.json, 'r') as file:
229 | metadata_dict = json.load(file)
230 | #Semaphore that limits concurrent downloads
231 | semaphore = asyncio.Semaphore(3) # Adjust the value as needed
232 |
233 | try:
234 | #Read existing progress report if any
235 | try:
236 | progress_report_path = os.path.join(args.little_potato, 'progress_report.json')
237 | with open(progress_report_path, 'r') as file:
238 | progress_report = json.load(file)
239 | logging.info("Existing progress report found and loaded")
240 | indexes = get_indexes(list(progress_report.keys()))
241 | except FileNotFoundError:
242 | progress_report = {}
243 | indexes = []
244 | logging.info("No existing progress report found")
245 | visited = list(progress_report.values())
246 | logging.info("Starting PDF downloads")
247 | finished = await download_pdfs(metadata_dict, semaphore, visited, indexes, args, progress_report)
248 | if finished:
249 | logging.info("All available files are in progress_report.json - Finished!")
250 | else:
251 | logging.info("PDF downloads completed")
252 | except Exception as e:
253 | logging.error(f"An error occurred: {e}")
254 | raise
255 | finally:
256 | #Write progress report to a JSON file
257 | progress_report_path = os.path.join(args.little_potato, 'progress_report.json')
258 | with open(progress_report_path, 'w') as file:
259 | json.dump(progress_report, file, ensure_ascii=False, indent=4)
260 | logging.info("Progress report written to progress_report.json")
261 |
262 | #Entry point of Downloader
263 | if __name__ == "__main__":
264 | asyncio.run(main())
--------------------------------------------------------------------------------
/scraping/json_sitemaps/boithimata-glossas-G-Lyk_pdf.json:
--------------------------------------------------------------------------------
1 | {
2 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 1η Εκδοχή - Θέματα": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_THEMATA_1h_ekdoxh.pdf",
3 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 1η Εκδοχή - Ενδεικτικές απαντήσεις": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_APANTHSEIS_1h_ekdoxh.pdf",
4 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 2η Εκδοχή - Θέματα": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_THEMATA_2h_ekdoxh.pdf",
5 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 2η Εκδοχή - Ενδεικτικές απαντήσεις": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_APANTHSEIS_2h_ekdoxh.pdf",
6 | "Προβολή σημειώσεων 2ο Διαγώνισμα - Θέματα (ΝΕΟ_2020)": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D2_THEMATA.pdf",
7 | "Προβολή σημειώσεων 2ο Διαγώνισμα - Ενδεικτικές απαντήσεις": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D2_APANTHSEIS.pdf"
8 | }
9 |
--------------------------------------------------------------------------------
/scraping/json_sitemaps/greek-language_pdf.json:
--------------------------------------------------------------------------------
1 | {
2 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 01": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 01.pdf",
3 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 02": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 02.pdf",
4 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 03": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 03.pdf",
5 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 04": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 04.pdf",
6 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 05": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 05.pdf",
7 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 06": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 06.pdf",
8 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 07": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 07.pdf",
9 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 08": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 08.pdf",
10 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 09": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 09.pdf",
11 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 10": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 10.pdf",
12 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 11": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 11.pdf",
13 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 12": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 12.pdf",
14 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 13": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 13.pdf",
15 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 14": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 14.pdf",
16 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 15": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 15.pdf",
17 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 16": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 16.pdf",
18 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 17": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 17.pdf",
19 | " 3. Υλικό εξάσκησης > ΚΓΛ_1_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_1_Α1.pdf",
20 | " 3. Υλικό εξάσκησης > ΚΓΛ_2_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_2_Α1.pdf",
21 | " 3. Υλικό εξάσκησης > ΚΓΛ_3_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_3_Α1.pdf",
22 | " 3. Υλικό εξάσκησης > ΚΓΛ_4_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_4_Α1.pdf",
23 | " 3. Υλικό εξάσκησης > ΚΓΛ_5_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_5_Α1.pdf",
24 | " 3. Υλικό εξάσκησης > ΚΓΛ_1_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_1_Α2.pdf",
25 | " 3. Υλικό εξάσκησης > ΚΓΛ_2_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_2_Α2.pdf",
26 | " 3. Υλικό εξάσκησης > ΚΓΛ_3_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_3_Α2.pdf",
27 | " 3. Υλικό εξάσκησης > ΚΓΛ_4_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_4_Α2.pdf",
28 | " 3. Υλικό εξάσκησης > ΚΓΛ_5_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_5_Α2.pdf",
29 | " 3. Υλικό εξάσκησης > ΚΠΛ_1_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_1_Α1.pdf",
30 | " 3. Υλικό εξάσκησης > ΚΠΛ_2_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_2_Α1.pdf",
31 | " 3. Υλικό εξάσκησης > ΚΠΛ_3_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_3_Α1.pdf",
32 | " 3. Υλικό εξάσκησης > ΚΠΛ_4_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_4_Α1.pdf",
33 | " 3. Υλικό εξάσκησης > ΚΠΛ_1_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_1_Α2.pdf",
34 | " 3. Υλικό εξάσκησης > ΚΠΛ_2_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_2_Α2.pdf",
35 | " 3. Υλικό εξάσκησης > ΚΠΛ_3_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_3_Α2.pdf",
36 | " 3. Υλικό εξάσκησης > ΚΠΛ_4_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_4_Α2.pdf",
37 | " 3. Υλικό εξάσκησης > ΠΓΛ_1_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_1_Α1.pdf",
38 | " 3. Υλικό εξάσκησης > ΠΓΛ_2_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_2_Α1.pdf",
39 | " 3. Υλικό εξάσκησης > ΠΓΛ_3_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_3_Α1.pdf",
40 | " 3. Υλικό εξάσκησης > ΠΓΛ_4_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_4_Α1.pdf",
41 | " 3. Υλικό εξάσκησης > ΠΓΛ_5_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_5_Α1.pdf",
42 | " 3. Υλικό εξάσκησης > ΠΓΛ_1_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_1_Α2.pdf",
43 | " 3. Υλικό εξάσκησης > ΠΓΛ_2_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_2_Α2.pdf",
44 | " 3. Υλικό εξάσκησης > ΠΓΛ_3_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_3_Α2.pdf",
45 | " 3. Υλικό εξάσκησης > ΠΓΛ_4_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_4_Α2.pdf",
46 | " 3. Υλικό εξάσκησης > ΠΓΛ_5_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_5_Α2.pdf",
47 | " 3. Υλικό εξάσκησης > ΠΠΛ_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΠΛ_Α1/ΠΠΛ_Α1.pdf",
48 | " 3. Υλικό εξάσκησης > ΠΠΛ_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΠΛ_Α2/ΠΠΛ_Α2.pdf",
49 | " 4. Κείμενα ΚΠΛ > Κείμενα ΚΠΛ": "https://www.greek-language.gr/certification/ΚΛΙΚ/4. Κείμενα ΚΠΛ/Κείμενα ΚΠΛ.pdf",
50 | " 5. Λύσεις των ασκήσεων > Απαντήσεις Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/5. Λύσεις των ασκήσεων/Απαντήσεις Α1.pdf",
51 | " 5. Λύσεις των ασκήσεων > Απαντήσεις Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/5. Λύσεις των ασκήσεων/Απαντήσεις Α2.pdf"
52 | }
--------------------------------------------------------------------------------
/scraping/json_sitemaps/kentra-ekpaideusis-enhlikwn_pdf.json:
--------------------------------------------------------------------------------
1 | {
2 | "ταξίδι στη γλώσσα": "http://repository.edulll.gr/edulll/retrieve/742/127.pdf"
3 | }
4 |
--------------------------------------------------------------------------------
/scraping/json_sitemaps/sitemap_explainer.txt:
--------------------------------------------------------------------------------
1 | JSON with {metadata : file_link} pairs.
2 |
3 | Each JSON file corresponds to a website source which contains eg files of university theses, school/ university books, or school/ uni entry exams.
4 |
5 | Metadata is the native categorization of the site; each level of nesting is separated by " > ", and the path ends with the file title.
6 |
7 | Running downloader10.py on each of these files produces files named "paper_n.pdf" (or similar) plus another JSON file
8 | associating {filename : metadata}. Running extractor4.py gives a similar result, but for "paper_n.txt" or similar.
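
For example, running the downloader on kentra-ekpaideusis-enhlikwn_pdf.json would produce paper_1.pdf and a
progress_report.json entry roughly like (illustrative): { "paper_1" : "ταξίδι στη γλώσσα" }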
9 |
--------------------------------------------------------------------------------
/scraping/json_sitemaps/themata-lyseis-panelladikwn_pdf.json:
--------------------------------------------------------------------------------
1 | {
2 | "2023 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/Istoria120623.pdf",
3 | "2023 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/fsm_fra_istoria120623new.pdf",
4 | "2023 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/Mathimatika060623.pdf",
5 | "2023 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/fsm_fra_mathimatika_060623.pdf",
6 | "2023 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/fsm_fra_mathimatika_060623.pdf",
7 | "2023 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/Fysiki120623.pdf",
8 | "2022 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/them_istoria100622.pdf",
9 | "2022 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/fsm_fra_istoria100622.pdf",
10 | "2022 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/them_math_gel_220606.pdf",
11 | "2022 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/fsm_fra_math060622.pdf",
12 | "2022 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/fsm_fra_math060622.pdf",
13 | "2022 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/them_fysiki100622.pdf",
14 | "2021 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/Istoria220621.pdf",
15 | "2021 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/fsm_fra_istoria220621.pdf",
16 | "2021 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/mathimatika_160621.pdf",
17 | "2021 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/fsm_fra_math160621.pdf",
18 | "2021 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/fsm_fra_math160621.pdf",
19 | "2021 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/Fysiki220621.pdf",
20 | "2020 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/Istoria240620.pdf",
21 | "2020 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/fsm_fra_istoria240620.pdf",
22 | "2020 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/mathimatika170620neo.pdf",
23 | "2020 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/fsm_fra_math170620final.pdf",
24 | "2020 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/fsm_fra_math170620final.pdf",
25 | "2020 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/Fysiki220620.pdf",
26 | "2019 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/Istoria_120619.pdf",
27 | "2019 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/fsm_fra_istoria120619final.pdf",
28 | "2019 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/Mathimatika_100619.pdf",
29 | "2019 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/fsm_fra_math100619final.pdf",
30 | "2019 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/fsm_fra_math100619final.pdf",
31 | "2019 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/Fysiki_120619.pdf",
32 | "2018 > Ιστορία > Θέματα": "https://eduadvisor.grhttp://www.minedu.gov.gr/publications/docs2018/EXETASEIS-2018/them_ist_op_c_hmer_180613.pdf",
33 | "2018 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2018/fsm_fra_ist_130618.pdf",
34 | "2018 > Μαθηματικά > Θέματα": "https://eduadvisor.grhttp://www.minedu.gov.gr/publications/docs2018/EXETASEIS-2018/them_mat_op_c_hmer_180611.pdf",
35 | "2018 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2018/fsm_fra_math_110618final.pdf",
36 | "2018 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2018/fsm_fra_math_110618final.pdf",
37 | "2018 > Φυσική > Λύσεις": "https://eduadvisor.grhttp://www.minedu.gov.gr/publications/docs2018/EXETASEIS-2018/them_fis_op_c_hmer_180613.pdf",
38 | "2017 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Neo_Genikis_themata_070617.pdf",
39 | "2017 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Neo_Genikis_lyseis_070617.pdf",
40 | "2017 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/MathimatikaKat_Themata090617.pdf",
41 | "2017 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/MathimatikaKat_Lyseis090617.pdf",
42 | "2017 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Istoria_Themata_120617.pdf",
43 | "2017 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Istoria_Lyseis_120617.pdf",
44 | "2017 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/MathimatikaKat_Lyseis090617.pdf",
45 | "2017 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Fysiki_Themata_120617.pdf",
46 | "2016 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/Neoelliniki16_themata.pdf",
47 | "2016 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_neo_16052016new.pdf",
48 | "2016 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/e_math_180516.pdf",
49 | "2016 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_math_18052016new.pdf",
50 | "2016 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/e_ist_23052016.pdf",
51 | "2016 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_ist_23052016new.pdf",
52 | "2016 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_math_18052016new.pdf",
53 | "2016 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/e_fys_23052016.pdf",
54 | "2015 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_neo_18052015.pdf",
55 | "2015 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_18052015_new.pdf",
56 | "2015 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_18052015_new.pdf",
57 | "2015 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_math_20052015.pdf",
58 | "2015 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_math_20052015.pdf",
59 | "2015 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_math_20052015new.pdf",
60 | "2015 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_xhm_22052015.pdf",
61 | "2015 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_xhm_22052015new.pdf",
62 | "2015 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_neo_22052015.pdf",
63 | "2015 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_22052015new.pdf",
64 | "2015 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_22052015new.pdf",
65 | "2015 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_ist_29052015.pdf",
66 | "2015 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_math_25052015.pdf",
67 | "2015 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_math_25052015new.pdf",
68 | "2015 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_math_25052015new.pdf",
69 | "2015 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_xhm_22052015.pdf",
70 | "2015 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_xhm_22052015new.pdf",
71 | "2015 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_fys_29052015.pdf",
72 | "2014 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_neo_28052014.pdf",
73 | "2014 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_updated.pdf",
74 | "2014 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_updated.pdf",
75 | "2014 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_math_0530.pdf",
76 | "2014 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_math_0530.pdf",
77 | "2014 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_math_0530_updated.pdf",
78 | "2014 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_xhm_0604.pdf",
79 | "2014 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_xhm_0406_new.pdf",
80 | "2014 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_neo_0604.pdf",
81 | "2014 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_0406_updated.pdf",
82 | "2014 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_0406_updated.pdf",
83 | "2014 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_ist_1006.pdf",
84 | "2014 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_math_0206.pdf",
85 | "2014 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_math_0206_updated.pdf",
86 | "2014 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_math_0206_updated.pdf",
87 | "2014 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_xhm_0604.pdf",
88 | "2014 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_xhm_0406_new.pdf",
89 | "2014 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_fys_1006.pdf",
90 | "2013 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/Pan_neo_gen_17.pdf",
91 | "2013 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/fsm_fra_neo_gen_17.pdf",
92 | "2013 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/fsm_fra_neo_gen_17.pdf",
93 | "2013 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/pan_math_gen_20.pdf",
94 | "2013 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/pan_math_gen_20.pdf",
95 | "2013 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/fsm_fra_math_20_final.pdf",
96 | "2013 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_xhm_vio_kat_24.pdf",
97 | "2013 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_xhm_vio_kat_24.pdf",
98 | "2013 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_neo_22.pdf",
99 | "2013 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_neo_22.pdf",
100 | "2013 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_neo_22.pdf",
101 | "2013 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_ist_kat_24.pdf",
102 | "2013 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_math_kat_27.pdf",
103 | "2013 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_math_kat_27_final2.pdf",
104 | "2013 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_math_kat_27_final2.pdf",
105 | "2013 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_xhm_vio_kat_24.pdf",
106 | "2013 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_xhm_vio_kat_24.pdf",
107 | "2013 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_fys_22.pdf",
108 | "2012 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/NeoellinikiGlossa21.pdf",
109 | "2012 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_neo.pdf",
110 | "2012 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_neo.pdf",
111 | "2012 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Mathimatika23.pdf",
112 | "2012 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Mathimatika23.pdf",
113 | "2012 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_math.pdf",
114 | "2012 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_mathEPAL.pdf",
115 | "2012 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/FysikiKAT.pdf",
116 | "2012 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/NeoellKat.pdf",
117 | "2012 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_neoKAT.pdf",
118 | "2012 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_neoKAT.pdf",
119 | "2012 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Ist_kat30.pdf",
120 | "2012 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/MathI_EPAL24.pdf",
121 | "2012 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_mathEPAL.pdf",
122 | "2012 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_math28.pdf",
123 | "2012 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Biox_kat30.pdf",
124 | "2011 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα.pdf",
125 | "2011 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα%20(Λύσεις).pdf",
126 | "2011 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα%20(Λύσεις).pdf",
127 | "2011 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf",
128 | "2011 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf",
129 | "2011 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής%20(Λύσεις).pdf",
130 | "2011 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία.pdf",
131 | "2011 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία%20(Λύσεις).pdf",
132 | "2011 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία%20(Λύσεις).pdf",
133 | "2011 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Ιστορία.pdf",
134 | "2011 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά.pdf",
135 | "2011 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά%20(Λύσεις).pdf",
136 | "2011 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά%20(Λύσεις).pdf",
137 | "2011 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Φυσική.pdf",
138 | "2011 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Μαθηματικά%20(Λύσεις).pdf",
139 | "2011 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Χημεία%20-%20Βιοχημεία.pdf",
140 | "2010-2001 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα.pdf",
141 | "2010-2001 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf",
142 | "2010-2001 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf",
143 | "2010-2001 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Βιολογία%20Γενικής%20Παιδείας.pdf",
144 | "2010-2001 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Βιολογία%20Γενικής%20Παιδείας.pdf",
145 | "2010-2001 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Φυσική%20Γενικής%20Παιδείας.pdf",
146 | "2010-2001 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία.pdf",
147 | "2010-2001 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Ιστορία%20Κατεύθυνσης.pdf",
148 | "2010-2001 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Ιστορία%20Κατεύθυνσης.pdf",
149 | "2010-2001 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Αρχαία.pdf",
150 | "2010-2001 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά%20Κατεύθυνσης.pdf",
151 | "2010-2001 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Φυσική%20Κατεύθυνσης.pdf",
152 | "2010-2001 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Φυσική%20Κατεύθυνσης.pdf",
153 | "2010-2001 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Αρχές%20Οργάνωσης%20&%20Διοίκησης.pdf",
154 | "2010-2001 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Χημεία%20-%20Βιοχημεία.pdf",
155 | "2010-2001 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Φυσική%20Κατεύθυνσης.pdf"
156 | }
--------------------------------------------------------------------------------
/test_script.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Simple test of the GlossAPI Corpus functionality with the refactored pipeline
4 | """
5 | import logging
6 | from pathlib import Path
7 | from glossapi.corpus import Corpus
8 |
9 | # Configure logging
10 | logging.basicConfig(level=logging.INFO)
11 | logger = logging.getLogger("simple_test")
12 |
13 | # Test directory - using the directory where we downloaded the paper
14 | TEST_DIR = Path("/home/fivos/CascadeProjects/glossAPI/corpus_test")
15 |
16 | def main():
17 |     # Create a basic Corpus object, using the same directory for input and output
18 | logger.info("Creating Corpus object")
19 | corpus = Corpus(
20 | input_dir=TEST_DIR,
21 | output_dir=TEST_DIR
22 | )
23 |
24 |     # 1. Download - skipped, since the PDF file is already in the test directory
25 |     logger.info("Skipping download step (already have the PDF file)")
26 |
27 | # 2. Extract
28 | logger.info("Running extract step")
29 |     # Extract text from the files already present in the input directory
30 | corpus.extract()
31 |
32 | # 4. Section - now uses files marked as 'good' quality
33 | logger.info("Running section step")
34 | corpus.section()
35 |
36 | # 5. Annotate
37 | logger.info("Running annotate step")
38 | corpus.annotate(annotation_type="chapter")
39 |
40 |     # Report completion
41 | logger.info("Pipeline completed")
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
--------------------------------------------------------------------------------