├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── Greek_variety_classification ├── Ancient_greek_or_not.ipynb ├── Greek_variety_classifier.ipynb ├── ancient_greek_filter_dataset.csv ├── greek_classification_dataset.csv ├── models │ ├── Ancient_Gr_classifier_model.zip │ └── Gr_Var_Classifier_model.zip └── preprocessing │ └── clean_data_with_mask.py ├── README.md ├── dataset_progress.md ├── pipeline ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── pyproject.toml ├── refactoring_todo.md ├── scripts │ ├── concurrent_downloader.py │ ├── sample_for_training.py │ └── test_section_reconstruction.py └── src │ └── glossapi │ ├── __init__.py │ ├── corpus.py │ ├── gloss_downloader.py │ ├── gloss_extract.py │ ├── gloss_section.py │ ├── gloss_section_classifier.py │ ├── models │ ├── kmeans_weights.joblib │ └── section_classifier.joblib │ ├── parquet_schema.py │ └── sampler.py ├── refactoring_plan.md ├── requirements.txt ├── scraping ├── download_and_extract_scripts │ ├── __pycache__ │ │ ├── downloader_app.cpython-310.pyc │ │ └── extractor_app.cpython-310.pyc │ └── downloader.py └── json_sitemaps │ ├── anodos_pdf.json │ ├── boithimata-glossas-G-Lyk_pdf.json │ ├── cyprus-exams_pdf.json │ ├── ebooks_list_pdf.json │ ├── greek-language_pdf.json │ ├── kallipos_pdf.json │ ├── kentra-ekpaideusis-enhlikwn_pdf.json │ ├── kodiko_pdf.json │ ├── pergamos_list_pdf.json │ ├── sitemap_explainer.txt │ ├── themata-lyseis-panelladikwn_pdf.json │ └── trapeza-thematwn_doc.json └── test_script.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using GitHub Actions when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-repositories 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | workflow_dispatch: 8 | release: 9 | types: [published] 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | permissions: 15 | # IMPORTANT: this permission is mandatory for trusted publishing 16 | id-token: write 17 | contents: read 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Set up Python 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.x' 25 | - name: Copy README to pipeline directory 26 | run: | 27 | cp README.md pipeline/ 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install build 32 | - name: Build package 33 | run: | 34 | cd pipeline 35 | python -m build 36 | - name: Publish package 37 | uses: pypa/gh-action-pypi-publish@release/v1 38 | with: 39 | packages-dir: pipeline/dist/ 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tokenization/preprocessing/preprocessor 2 | tokenization/preprocessing/text 3 | tokenization/preprocessing/text.txt 4 | tokenization/preprocessing/re_text.txt 5 | tokenization/new.bin 6 | tokenization/new.txt 7 | tokenization/text.bin 8 | tokenization/text.txt 9 | tokenization/tokenize 10 | tokenization/tokenizer 11 | .gitignore 12 | tokenization/freqency.txt 13 | tokenization/paper_1.txt 14 | tokenization/cleaned_filtered_extracted_txt 15 | tokenization/cleaned_filtered_extracted_txt_v2/* 16 | gutenberg_books 17 | clean_books 18 | # Python build artifacts 19 | __pycache__/ 20 | *.py[cod] 21 | 
*.class 22 | *.so 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | -------------------------------------------------------------------------------- /Greek_variety_classification/Ancient_greek_or_not.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e7063190-3c62-4d2b-9f17-cd4d6697e233", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import os\n", 13 | "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", 14 | "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "id": "3e971008-c8ea-431c-a84b-52a8531cc4b6", 21 | "metadata": { 22 | "tags": [] 23 | }, 24 | "outputs": [ 25 | { 26 | "name": "stderr", 27 | "output_type": "stream", 28 | "text": [ 29 | "2024-09-06 07:39:08.755777: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", 30 | "2024-09-06 07:39:08.755873: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", 31 | "2024-09-06 07:39:08.758162: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", 32 | "2024-09-06 07:39:08.773280: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", 33 | "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 34 | "2024-09-06 07:39:10.593585: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import pandas as pd\n", 40 | "import numpy as np\n", 41 | "from sklearn.model_selection import train_test_split\n", 42 | "from transformers import AutoTokenizer, TrainingArguments, Trainer\n", 43 | "import torch\n", 44 | "from torch import nn\n", 45 | "from transformers import AutoModel, AutoConfig\n", 46 | "from sklearn.preprocessing import LabelEncoder\n", 47 | "from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix\n", 48 | "import seaborn as sns\n", 49 | "import matplotlib.pyplot as plt" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "id": "9be316ae-3ace-45c9-84c1-69f06b6a85d7", 56 | "metadata": { 57 | "tags": [] 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "new_data = pd.read_csv(\"dataset_Sep_3_masked.csv\", sep=\",\", engine=\"python\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "id": "f17d1888-6356-4b4e-9a6a-eb173e8cc870", 68 | "metadata": { 69 | "tags": [] 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "application/vnd.jupyter.widget-view+json": { 75 | "model_id": "edf0731e88f040fca8d4f9d82d7d4e32", 76 | "version_major": 2, 77 | "version_minor": 0 78 | }, 79 | "text/plain": [ 80 | "tokenizer_config.json: 0%| | 0.00/2.00 [00:00\n", 372 | " \n", 373 | " \n", 374 | " [85/85 
02:12, Epoch 1/1]\n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | "
Epoch | Training Loss | Validation Loss | Accuracy | F1 | Precision | Recall
1 | 0.000900 | 0.076329 | 0.989691 | 0.993915 | 0.987903 | 1.000000

" 400 | ], 401 | "text/plain": [ 402 | "" 403 | ] 404 | }, 405 | "metadata": {}, 406 | "output_type": "display_data" 407 | }, 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "TrainOutput(global_step=85, training_loss=0.39309312596040613, metrics={'train_runtime': 134.9228, 'train_samples_per_second': 10.065, 'train_steps_per_second': 0.63, 'total_flos': 0.0, 'train_loss': 0.39309312596040613, 'epoch': 1.0})" 412 | ] 413 | }, 414 | "execution_count": 10, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "# Train the model\n", 421 | "trainer.train()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 11, 427 | "id": "5a874aba-57c9-4f13-9aa1-cc56776a40f1", 428 | "metadata": { 429 | "tags": [] 430 | }, 431 | "outputs": [ 432 | { 433 | "data": { 434 | "text/html": [], 435 | "text/plain": [ 436 | "" 437 | ] 438 | }, 439 | "metadata": {}, 440 | "output_type": "display_data" 441 | } 442 | ], 443 | "source": [ 444 | "# Evaluate on dev set\n", 445 | "dev_pred = trainer.predict(dev_dataset)\n", 446 | "dev_preds = dev_pred.predictions.argmax(-1)\n", 447 | "dev_labels = dev_dataset.labels" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 12, 453 | "id": "a2ccba6e-f2e4-4495-bfed-c66f1f73af1a", 454 | "metadata": { 455 | "tags": [] 456 | }, 457 | "outputs": [ 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "\n", 463 | "Dev Set Evaluation:\n", 464 | "Accuracy: 0.9897\n", 465 | "Precision: 0.9879\n", 466 | "Recall: 1.0000\n", 467 | "F1 Score: 0.9939\n" 468 | ] 469 | }, 470 | { 471 | "data": { 472 | "text/plain": [ 473 | "('./binary_classifier_saved_model/tokenizer_config.json',\n", 474 | " './binary_classifier_saved_model/special_tokens_map.json',\n", 475 | " './binary_classifier_saved_model/vocab.txt',\n", 476 | " './binary_classifier_saved_model/added_tokens.json',\n", 477 | " './binary_classifier_saved_model/tokenizer.json')" 478 | ] 479 | }, 480 | "execution_count": 12, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "dev_accuracy = accuracy_score(dev_labels, dev_preds)\n", 487 | "dev_precision, dev_recall, dev_f1, _ = precision_recall_fscore_support(dev_labels, dev_preds, average='binary')\n", 488 | "\n", 489 | "print(\"\\nDev Set Evaluation:\")\n", 490 | "print(f\"Accuracy: {dev_accuracy:.4f}\")\n", 491 | "print(f\"Precision: {dev_precision:.4f}\")\n", 492 | "print(f\"Recall: {dev_recall:.4f}\")\n", 493 | "print(f\"F1 Score: {dev_f1:.4f}\")\n", 494 | "\n", 495 | "# Create confusion matrix\n", 496 | "cm = confusion_matrix(dev_labels, dev_preds)\n", 497 | "plt.figure(figsize=(10, 8))\n", 498 | "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')\n", 499 | "plt.title('Confusion Matrix')\n", 500 | "plt.xlabel('Predicted')\n", 501 | "plt.ylabel('True')\n", 502 | "plt.savefig('confusion_matrix.png')\n", 503 | "plt.close()\n", 504 | "\n", 505 | "# Save the model\n", 506 | "model.save_pretrained(\"./binary_classifier_saved_model\")\n", 507 | "tokenizer.save_pretrained(\"./binary_classifier_saved_model\")" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 13, 513 | "id": "173d4aaa-3db0-44e3-a768-4f223616e5ae", 514 | "metadata": { 515 | "tags": [] 516 | }, 517 | "outputs": [ 518 | { 519 | "name": "stdout", 520 | "output_type": "stream", 521 | "text": [ 522 | "Using device: cuda\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "import pandas as pd\n", 528 | "import torch\n", 529 | "from 
transformers import AutoTokenizer\n", 530 | "from torch.utils.data import TensorDataset, DataLoader\n", 531 | "\n", 532 | "# Load the dataset\n", 533 | "df = pd.read_csv(\"../twok_masked.csv\")\n", 534 | "\n", 535 | "# Load the saved model and tokenizer\n", 536 | "loaded_model = BertForSequenceClassification.from_pretrained(\"./binary_classifier_saved_model\", model_name_or_path=\"nlpaueb/bert-base-greek-uncased-v1\")\n", 537 | "loaded_tokenizer = AutoTokenizer.from_pretrained(\"./binary_classifier_saved_model\")\n", 538 | "\n", 539 | "# Check if CUDA is available\n", 540 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 541 | "print(f\"Using device: {device}\")\n", 542 | "\n", 543 | "# Move the model to the appropriate device\n", 544 | "loaded_model.to(device)\n", 545 | "\n", 546 | "# Tokenize all texts\n", 547 | "encodings = loaded_tokenizer(df['text'].tolist(), truncation=True, padding=True, max_length=512)\n", 548 | "dataset = TensorDataset(torch.tensor(encodings['input_ids']), \n", 549 | " torch.tensor(encodings['attention_mask']))\n", 550 | "dataloader = DataLoader(dataset, batch_size=32) # Adjust batch size as needed\n", 551 | "\n", 552 | "# Make predictions\n", 553 | "loaded_model.eval()\n", 554 | "predictions = []" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 14, 560 | "id": "9949ca2b-2788-440e-8bf0-dd817d5a6b4e", 561 | "metadata": { 562 | "tags": [] 563 | }, 564 | "outputs": [], 565 | "source": [ 566 | "with torch.no_grad():\n", 567 | " for batch in dataloader:\n", 568 | " input_ids, attention_mask = [b.to(device) for b in batch]\n", 569 | " outputs = loaded_model(input_ids, attention_mask=attention_mask)\n", 570 | " logits = outputs[0] if isinstance(outputs, tuple) else outputs\n", 571 | " preds = torch.argmax(logits, dim=1)\n", 572 | " predictions.extend(preds.cpu().numpy())" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 16, 578 | "id": "bcc439a5-1044-424b-a219-3e7506df522b", 579 | "metadata": { 580 | "tags": [] 581 | }, 582 | "outputs": [ 583 | { 584 | "name": "stdout", 585 | "output_type": "stream", 586 | "text": [ 587 | "Processed 2000 rows.\n", 588 | "Results saved to 'twok_masked_with_predictions.csv'\n", 589 | "\n", 590 | "Distribution of predictions:\n", 591 | "archaia\n", 592 | " 1 0.7825\n", 593 | "-9999 0.2175\n", 594 | "Name: proportion, dtype: float64\n", 595 | "\n", 596 | "Distribution of predictions for masked items:\n", 597 | "archaia\n", 598 | "1 1.0\n", 599 | "Name: proportion, dtype: float64\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "# Add predictions to the dataframe\n", 605 | "df['archaia'] = predictions\n", 606 | "\n", 607 | "# Check 'mask' column and set 'ΚΝΕ' to -9999 if mask is 0\n", 608 | "df.loc[df['mask'] == 0, 'archaia'] = -9999\n", 609 | "\n", 610 | "# Remove columns '1' through '5'\n", 611 | "columns_to_remove = ['1', '2', '3', '4', '5']\n", 612 | "df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])\n", 613 | "\n", 614 | "# Save the results\n", 615 | "df.to_csv(\"twok_masked_with_predictions.csv\", index=False)\n", 616 | "\n", 617 | "print(f\"Processed {len(df)} rows.\")\n", 618 | "print(\"Results saved to 'twok_masked_with_predictions.csv'\")\n", 619 | "\n", 620 | "# Print distribution of predictions\n", 621 | "print(\"\\nDistribution of predictions:\")\n", 622 | "print(df['archaia'].value_counts(normalize=True))\n", 623 | "\n", 624 | "# Print distribution of predictions for masked items only\n", 625 | "masked_df = 
df[df['mask'] == 1]\n", 626 | "print(\"\\nDistribution of predictions for masked items:\")\n", 627 | "print(masked_df['archaia'].value_counts(normalize=True))" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "id": "db5e881d-6d8c-4e51-9548-eea2fcdac1d2", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [] 637 | } 638 | ], 639 | "metadata": { 640 | "kernelspec": { 641 | "display_name": "Python 3 (ipykernel)", 642 | "language": "python", 643 | "name": "python3" 644 | }, 645 | "language_info": { 646 | "codemirror_mode": { 647 | "name": "ipython", 648 | "version": 3 649 | }, 650 | "file_extension": ".py", 651 | "mimetype": "text/x-python", 652 | "name": "python", 653 | "nbconvert_exporter": "python", 654 | "pygments_lexer": "ipython3", 655 | "version": "3.11.7" 656 | } 657 | }, 658 | "nbformat": 4, 659 | "nbformat_minor": 5 660 | } 661 | -------------------------------------------------------------------------------- /Greek_variety_classification/models/Ancient_Gr_classifier_model.zip: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:618bedd41fca77771aa37f966e168a02fb2016978debf7fa0ec041554372e430 3 | size 420182149 4 | -------------------------------------------------------------------------------- /Greek_variety_classification/models/Gr_Var_Classifier_model.zip: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9a6a615614897bf7160529d80a321c4635e89c4b099b51efce6296fe3f7c5d0b 3 | size 420183135 4 | -------------------------------------------------------------------------------- /Greek_variety_classification/preprocessing/clean_data_with_mask.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import os 4 | from transformers import AutoTokenizer 5 | 6 | # Set the working directory and filename 7 | working_directory = "/home/fivos/Downloads" 8 | file_name = "dataset_Sep_3.csv" 9 | 10 | os.chdir(working_directory) 11 | 12 | # Load the data 13 | data = pd.read_csv(file_name, sep=",", engine="python") 14 | 15 | # Load the tokenizer 16 | tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1") 17 | 18 | print(data.columns) 19 | 20 | # Ensure 'text' column contains strings 21 | if 'text' in data.columns: 22 | data["text"] = data["text"].astype(str) 23 | else: 24 | print("Column 'text' not found in the dataset.") 25 | 26 | # Function to check if text is mostly Latin characters 27 | def is_mostly_latin(text, threshold=0.5): 28 | latin_chars = re.findall(r"[a-zA-Z]", text) 29 | return (len(latin_chars) / len(text)) > threshold if len(text) > 0 else False 30 | 31 | # Function to check if text is more than 50% numbers 32 | def is_mostly_numbers(text, threshold=0.5): 33 | num_chars = re.findall(r"[0-9]", text) 34 | return (len(num_chars) / len(text)) > threshold if len(text) > 0 else False 35 | 36 | # Function to check if text has fewer than 4 words 37 | def too_short(text): 38 | return len(text.split()) < 4 39 | 40 | # Function to check if text has more than 512 tokens 41 | def has_more_than_512_tokens(text): 42 | # Fragments should be smaller than 512 tokens for GreekBERT 43 | return len(tokenizer.encode(text)) > 512 44 | 45 | # Function to clean text 46 | def clean_text(text): 47 | # Remove formatting characters 48 | text = re.sub(r"[\n\t]", " ", text) 49 | # Remove 
leading, trailing, and multiple spaces 50 | text = ' '.join(text.split()) 51 | # Remove numerical ordering elements with () 52 | text = re.sub(r"\(?\d+\)|\d+\.", "", text) 53 | # Remove numerical ordering elements with {} 54 | text = re.sub(r"\{\d+\}", "", text) 55 | # Remove ordering elements with Greek numerals 56 | text = re.sub( 57 | r"(?", "()", "{}", and other similar characters 89 | data["text"] = data["text"].apply(lambda x: re.sub(r'[<>\[\]\(\)\{\}]', '', x)) 90 | 91 | # Update mask for empty text cells 92 | data.loc[~data["text"].str.strip().astype(bool), 'mask'] = 0 93 | 94 | # Update mask for mostly Latin text 95 | data.loc[data["text"].apply(is_mostly_latin), 'mask'] = 0 96 | 97 | # Update mask for mostly numbers 98 | data.loc[data["text"].apply(is_mostly_numbers), 'mask'] = 0 99 | 100 | # Update mask for too_short text 101 | data.loc[data["text"].apply(too_short), 'mask'] = 0 102 | 103 | # Update mask for text with more than 512 tokens 104 | data.loc[data["text"].apply(has_more_than_512_tokens), 'mask'] = 0 105 | 106 | # Save the result to a new CSV file 107 | output_file_path = os.path.join(os.getcwd(), os.path.splitext(file_name)[0] + "_masked.csv") 108 | data.to_csv(output_file_path, index=False, quoting=1) # quoting=1 ensures all fields are quoted 109 | 110 | print("Cleaned data with mask saved") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GlossAPI 2 | 3 | [![PyPI Status](https://img.shields.io/pypi/v/glossapi?logo=pypi)](https://pypi.org/project/glossapi/) 4 | 5 | A library for processing texts in Greek and other languages, developed by [Open Technologies Alliance(GFOSS)](https://gfoss.eu/). 6 | 7 | ## Features 8 | 9 | - **Document Processing**: Extract text content from academic PDFs, DOCX, HTML, and other formats with structure preservation 10 | - **Document Downloading**: Download documents from URLs with automatic handling of various formats 11 | - **Quality Control**: Filter and cluster documents based on extraction quality 12 | - **Section Extraction**: Identify and extract academic sections from documents 13 | - **Section Classification**: Classify sections using machine learning models 14 | - **Greek Language Support**: Specialized processing for Greek academic texts 15 | - **Metadata Handling**: Process academic texts with accompanying metadata 16 | - **Customizable Annotation**: Map section titles to standardized categories 17 | - **Flexible Pipeline**: Start the processing from any stage in the pipeline 18 | 19 | ## Installation 20 | 21 | ```bash 22 | pip install glossapi 23 | ``` 24 | 25 | ## Usage 26 | 27 | The recommended way to use GlossAPI is through the `Corpus` class, which provides a complete pipeline for processing academic documents. 
You can use the same directory for both input and output: 28 | 29 | ```python 30 | from glossapi import Corpus 31 | import logging 32 | 33 | # Configure logging (optional) 34 | logging.basicConfig(level=logging.INFO) 35 | 36 | # Set the directory path (use the same for input and output) 37 | folder = "/path/to/corpus" # Use abstract path names 38 | 39 | # Initialize Corpus with input and output directories 40 | corpus = Corpus( 41 | input_dir=folder, 42 | output_dir=folder 43 | # metadata_path="/path/to/metadata.parquet", # Optional 44 | # annotation_mapping={ 45 | # 'Κεφάλαιο': 'chapter', 46 | # # Add more mappings as needed 47 | # } 48 | ) 49 | 50 | # The pipeline can start from any of these steps: 51 | 52 | # Step 1: Download documents (if URLs are provided) 53 | corpus.download(url_column='a_column_name') # Specify column with URLs, default column name is 'url' 54 | 55 | # Step 2: Extract documents 56 | corpus.extract() 57 | 58 | # Step 3: Extract sections from filtered documents 59 | corpus.section() 60 | 61 | # Step 4: Classify and annotate sections 62 | corpus.annotate() # or corpus.annotate(annotation_type="chapter") For texts without TOC or bibliography 63 | ``` 64 | 65 | ## Folder Structure 66 | 67 | After running the pipeline, the following folder structure will be created: 68 | 69 | ``` 70 | corpus/ # Your specified folder 71 | ├── download_results # stores metadata file with annotation from previous processing steps 72 | ├── downloads/ # Downloaded documents (if download() is used) 73 | ├── markdown/ # Extracted text files in markdown format 74 | ├── sections/ # Contains the processed sections in parquet format 75 | │ ├── sections_for_annotation.parquet 76 | ├── classified_sections.parquet # Intermediate processing form 77 | ├── fully_annotated_sections.parquet # Final processing form with section predictions 78 | ``` 79 | 80 | The `fully_annotated_sections.parquet` file contains the final processing form. The `predicted_sections` column shows the type of section: 'π' (table of contents), 'β' (bibliography), 'ε.σ.' (introductory note), 'κ' (main text), or 'a' (appendix). For files without table of contents or bibliography, the annotation will be "άλλο" (other). 81 | 82 | ## Note on Starting Points 83 | 84 | **Option 1: Start with Document Download** 85 | Create a corpus folder and add a parquet file with URLs for downloading: 86 | ``` 87 | corpus/ 88 | └── metadata.parquet (with a column containing document URLs) 89 | ``` 90 | Then use `corpus.download(url_column='column_name')` with the URL column name from your parquet file. 91 | 92 | **Option 2: Start with Document Extraction** 93 | Alternatively, place documents directly in the corpus folder and skip download: 94 | ``` 95 | corpus/ 96 | └── document1.pdf, document2.docx, etc. 97 | ``` 98 | GlossAPI will automatically create a metadata folder in downloads if starting from extract. 99 | 100 | ## License 101 | 102 | This project is licensed under the [European Union Public Licence 1.2 (EUPL 1.2)](https://interoperable-europe.ec.europa.eu/collection/eupl/eupl-text-eupl-12). 
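As a quick reference for Option 1 above, here is a minimal sketch of preparing `metadata.parquet` with pandas (the example URLs and the `url` column name are placeholders; pass whatever column name you actually use to `corpus.download()`):

```python
import pandas as pd

# Placeholder URLs -- replace with the documents you want to download
urls = [
    "https://example.org/document1.pdf",
    "https://example.org/document2.docx",
]

# Write the parquet file into the corpus folder before calling corpus.download()
pd.DataFrame({"url": urls}).to_parquet("corpus/metadata.parquet", index=False)
```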
103 | -------------------------------------------------------------------------------- /dataset_progress.md: -------------------------------------------------------------------------------- 1 | Στόχος της [ΕΕΛΛΑΚ](https://eellak.gr/) είναι η ανάπτυξη ενός Ελληνικού μοντέλου τεχνητής νοημοσύνης(ΤΝ) ανοιχτού λογισμικού, που ο κώδικας του θα διατίθεται με την άδεια ανοιχτού λογισμικού [EUPL](https://eupl.eu/), τα βάρη και όλα τα δεδομένα θα είναι διαθέσιμα με την άδεια [Creative Commons BY-SA](https://creativecommons.org/licenses/by-sa/4.0/), **1ος στόχος του glossAPI είναι η συγκέντρωση, επεξεργασία και συντήρηση αντιπροσωπευτικών συνόλων ελληνικών κειμένων** ώστε να μπορεί ένα μοντέλο ΤΝ να χειρίζεται σωστά την Ελληνική γλώσσα. 2 | 3 | :rocket: **Δημιουργία καθαρισμένων κειμενικών δεδομένων με χρήσιμα μεταδεδομένα** 4 | 5 | ## Datasets 6 | 7 | ### 95Κ Δείγμα Ελληνικής (95K Greek Sample) 8 | - [✓] Scraped 9 | - [✓] Downloaded 10 | - [✓] Cleaned 11 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/95k_deigma_ellinikis) 12 | 13 | A diverse sample of 95,000 Greek texts, providing a broad representation of modern Greek language usage. Useful for general NLP tasks and language modeling. 14 | 15 | ### Σχολικά Βιβλία (School Books) 16 | - [✓] Scraped 17 | - [✓] Downloaded 18 | - [✓] Cleaned 19 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/Sxolika_vivlia) 20 | 21 | Collection of Greek school textbooks and educational materials. Great resource for educational NLP applications and studying formal Modern Greek. 22 | 23 | ### Δημώδης Λογοτεχνία (Folk Literature) 24 | - [✓] Scraped 25 | - [✓] Downloaded 26 | - [✓] Cleaned 27 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/dimodis_logotexnia) 28 | 29 | Traditional Greek folk literature, including stories, songs, and poems. Valuable for cultural preservation and studying regional Greek variations. 30 | 31 | ### Ελληνικά Κείμενα Project Gutenberg (Project Gutenberg Greek Texts) 32 | - [✓] Scraped 33 | - [✓] Downloaded 34 | - [✓] Cleaned 35 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/Ellinika_Keimena_Project_Gutenberg) 36 | 37 | Public domain Greek texts from Project Gutenberg, spanning various periods and genres. Excellent for literary analysis and historical language studies. 38 | 39 | ### 1000 Πρώτα Χρόνια Ελληνικής (First 1000 Years of Greek) 40 | - [✓] Scraped 41 | - [✓] Downloaded 42 | - [✓] Cleaned 43 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/1000_prwta_xronia_ellhnikhs) 44 | 45 | Texts covering the first millennium of written Greek, crucial for studying the evolution of the Greek language and historical linguistics. 46 | 47 | ### Κλασική Αρχαία Ελληνική Γραμματεία (Classical Ancient Greek Literature) 48 | - [✓] Scraped 49 | - [✓] Downloaded 50 | - [✓] Cleaned 51 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/klasikh_arx_ell_grammateia) 52 | 53 | Core works of Classical Greek literature, including philosophical, historical, and dramatic texts. Essential for classical studies and ancient Greek NLP. 
54 | 55 | ### Ελληνικά Κείμενα Wikisource (Wikisource Greek Texts) 56 | - [✓] Scraped 57 | - [✓] Downloaded 58 | - [✓] Cleaned 59 | - [✓] Uploaded with metadata (https://huggingface.co/datasets/glossAPI/Wikisource_Greek_texts) 60 | 61 | ### Πέργαμος (Πέργαμος) 62 | - [✓] Scraped 63 | - [✓] Downloading 64 | - [✓] Preprocessed 65 | - [✓] Each article's sections categorized by type (introductory remarks, index etc.) 66 | - [✓] Uploaded with metadata 67 | 68 | Συλλογή κειμένων από την πλατφόρμα Πέργαμος. Collection of texts from the Pergamos' University theses archive. 69 | 70 | ### :construction: Υπό επεξεργασία (Work in Progress) 71 | 72 | ### Κάλλιπος (Kallipos) 73 | - [✓] Scraped 74 | - [✓] Downloaded 75 | - [ ] Cleaned 76 | - [ ] Uploaded with metadata 77 | 78 | Ακαδημαϊκά συγγράμματα από την πλατφόρμα Κάλλιπος. Open source academic textbooks from Kallipos. 79 | 80 | ### Έγγραφα ΕΕ (EU Documents) 81 | - [ ] Downloaded 82 | - [ ] Cleaned 83 | - [ ] Uploaded with metadata 84 | 85 | Επίσημα έγγραφα της Ευρωπαϊκής Ένωσης. Official documents of the European Union. 86 | 87 | [γlo'sapi] 88 | 89 | ### glossAPI, το 90 | 91 | Ένα έργο της ΕΕΛΛΑΚ στον χώρο των ψηφιακών ανθρωπιστικών επιστημών που αξιοποιεί ελεύθερα διαθέσιμες πηγές για τη συγκέντρωση ενός εκτενούς σώματος κειμένων υψηλής ποιότητας τα οποία παρέχονται με άδεια Creative Commons. Το glossAPI καλύπτει ένα ευρύ φάσμα θεματικών περιοχών, από την επιστήμη και τη λογοτεχνία έως τα νομικά κείμενα, με δεδομένα που υφίστανται επιμελή επεξεργασία και αποδελτίωση. 92 | 93 | Στόχος του glossAPI είναι να διευκολύνει την επεξεργασία κειμενικών δεδομένων και την εκπαίδευση σύγχρονων γλωσσικών μοντέλων. Όλα τα εργαλεία που αναπτύσσει διατίθενται ελεύθερα με άδεια EUPL μέσω του αποθετηρίου του στο Github. 94 | 95 | Το glossAPI συμβάλει στην ανάπτυξη των ελληνικών ανοιχτών κειμενικών δεδομένων, ενθαρρύνοντας ερευνητές και φοιτητές να χρησιμοποιήσουν τα εργαλεία που αναπτύχθηκαν, και να επεκτείνουν το κώδικα και τα δεδομένα προς κατευθύνσεις που τους ενδιαφέρουν. 96 | 97 | [ 1: greeklish < γλωσσάρι 2: αγγλ. gloss < μεσαιων. αγγλ. gloze < μεσαιων. λατ. glōsa < κλασ. λατ. glōssa < αρχ. γλῶσσα: "γλώσσα, λέξη" + αγγλ. API: Application Programming Interface ] 98 | 99 | Επικοινωνία/ contact at: glossapi.team@eellak.gr 100 | -------------------------------------------------------------------------------- /pipeline/LICENSE.md: -------------------------------------------------------------------------------- 1 | ../LICENSE.md -------------------------------------------------------------------------------- /pipeline/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE.md 3 | recursive-include src/glossapi/models *.joblib 4 | -------------------------------------------------------------------------------- /pipeline/README.md: -------------------------------------------------------------------------------- 1 | # GlossAPI 2 | 3 | [![Release Version](https://img.shields.io/github/v/release/eellak/glossAPI)](https://github.com/eellak/glossAPI/releases) 4 | [![PyPI Status](https://img.shields.io/pypi/v/glossapi?logo=pypi)](https://pypi.org/project/glossapi/) 5 | 6 | A library for processing academic texts in Greek and other languages, developed by [ΕΕΛΛΑΚ](https://eellak.gr/). 
7 | 8 | ## Features 9 | 10 | - **Document Processing**: Extract text content from academic PDFs, DOCX, XML, HTML, and other formats with structure preservation 11 | - **Robust Batch Processing**: Process documents in batches with error isolation and automatic resumption 12 | - **Quality Control**: Filter and cluster documents based on extraction quality 13 | - **Section Extraction**: Identify and extract academic sections from documents 14 | - **Section Classification**: Classify sections using machine learning models 15 | - **Greek Language Support**: Specialized processing for Greek academic texts 16 | - **Metadata Handling**: Process academic texts with accompanying metadata 17 | - **Customizable Annotation**: Map section titles to standardized categories 18 | 19 | ## Installation 20 | 21 | ```bash 22 | pip install glossapi==0.0.9 23 | ``` 24 | 25 | ## Usage 26 | 27 | The recommended way to use GlossAPI is through the `Corpus` class, which provides a complete pipeline for processing academic documents: 28 | 29 | ```python 30 | from glossapi import Corpus 31 | import logging 32 | 33 | # Configure logging (optional) 34 | logging.basicConfig(level=logging.INFO) 35 | 36 | # Initialize Corpus with input and output directories 37 | corpus = Corpus( 38 | input_dir="/path/to/documents", 39 | output_dir="/path/to/output" 40 | # metadata_path="/path/to/metadata.parquet", # Optional 41 | # annotation_mapping={ 42 | # 'Κεφάλαιο': 'chapter', 43 | # # Add more mappings as needed 44 | # } 45 | ) 46 | 47 | # Step 1: Extract documents (quality control) 48 | corpus.extract() 49 | 50 | # Step 2: Extract sections from filtered documents 51 | corpus.section() 52 | 53 | # Step 3: Classify and annotate sections 54 | corpus.annotate() 55 | ``` 56 | 57 | ## License 58 | 59 | This project is licensed under the European Union Public Licence 1.2 (EUPL 1.2). 
60 | -------------------------------------------------------------------------------- /pipeline/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "glossapi" 7 | version = "0.0.13" 8 | description = "A library for processing academic texts in Greek and other languages" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | license = {text = "European Union Public Licence 1.2 (EUPL 1.2)"} 12 | authors = [ 13 | {name = "GlossAPI Team", email = "foivos@example.com"} 14 | ] 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: European Union Public Licence 1.2 (EUPL 1.2)", 18 | "Operating System :: OS Independent", 19 | "Development Status :: 3 - Alpha", 20 | ] 21 | dependencies = [ 22 | "docling", 23 | "pandas", 24 | "numpy", 25 | "scikit-learn", 26 | "joblib", 27 | "dask", 28 | "pyarrow", 29 | "ftfy", 30 | "tenacity", 31 | "aiohttp", 32 | "aiofiles" 33 | ] 34 | 35 | [tool.setuptools] 36 | package-dir = {"" = "src"} 37 | include-package-data = true 38 | 39 | [tool.setuptools.packages.find] 40 | where = ["src"] 41 | 42 | [tool.setuptools.package-data] 43 | glossapi = ["models/*.joblib"] 44 | 45 | [project.urls] 46 | Repository = "https://github.com/eellak/glossAPI" 47 | -------------------------------------------------------------------------------- /pipeline/refactoring_todo.md: -------------------------------------------------------------------------------- 1 | # GlossAPI Pipeline Refactoring TODO List 2 | 3 | ## Build and Install 4 | 5 | **IMPORTANT:** After implementing the changes, you need to build and install the package in the virtual environment for the changes to take effect: 6 | 7 | ```bash 8 | # Activate the virtual environment first 9 | source /mnt/data/venv/bin/activate 10 | 11 | # Go to the pipeline directory 12 | cd /mnt/data/glossAPI/pipeline 13 | 14 | # Install the package in development mode 15 | pip install -e . 16 | 17 | # Now you can run the simple_test.py script 18 | python /mnt/data/simple_test.py 19 | ``` 20 | 21 | **ALWAYS KEEP IN MIND:** The pipeline must work with the existing interface in simple_test.py using the "corpus.command()" pattern. 
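For reference, a minimal sketch of that `corpus.command()` pattern, based on the documented `Corpus` API (the directory path is a placeholder and the actual contents of simple_test.py may differ):

```python
from glossapi import Corpus

# Placeholder path -- simple_test.py may point at a different corpus folder
folder = "/mnt/data/eu_test"

corpus = Corpus(input_dir=folder, output_dir=folder)
corpus.download(url_column="url")  # optional when documents are already present
corpus.extract()
corpus.section()
corpus.annotate()
```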
22 | 23 | ## ✅ COMPLETED 24 | 25 | ### 1) Modified GlossDownloader 26 | 27 | - Updated `GlossDownloader` class to use a dedicated "downloads" folder: 28 | - Modified the `download_files()` method to use `self.output_dir / "downloads"` instead of `self.input_dir` 29 | - All downloaded files are now saved in this subdirectory 30 | - Updated the `Corpus.download()` method to create and use this downloads folder 31 | - Added validation to check if downloaded files are of the supported types (pdf, docx, xml, html, pptx, csv, md) 32 | 33 | ### 2) Updated GlossExtract 34 | 35 | - Modified the `extract()` method in `Corpus` class to: 36 | - Look for files in the "downloads" directory first 37 | - If "downloads" directory doesn't exist, check for supported file types in the input folder and move them to a new "downloads" folder 38 | - Continue processing from the "downloads" folder 39 | - Updated the file location handling across the pipeline to reflect this change 40 | 41 | ### 3) Created a Standardized Parquet Class 42 | 43 | - Created a new file called `parquet_schema.py` with a standardized schema class: 44 | - Defined required metadata fields for processing 45 | - Implemented standard schemas for different pipeline stages 46 | - Defined standard columns (id, row_id, filename, title, section, predicted_section) 47 | - Added methods for reading/writing with standard schema validation 48 | 49 | ### 4) Improved Bad File Filtering in Sectioning 50 | 51 | - Made `filenames_to_process` a required parameter in section.py 52 | - Enhanced filtering to ensure only good files (based on extraction quality in parquet) are processed 53 | - Added detailed logging for processed and skipped files 54 | - Verified that section.py correctly handles all sectioning scenarios: 55 | - Text between two headers 56 | - Text before the first header 57 | - Text after the last header 58 | - Documents with no headers at all 59 | - Fixed indentation issues in corpus.py that were causing execution problems 60 | 61 | ## TODO 62 | 63 | ### 1) Finish Removing Redundant Code 64 | 65 | - Remove the remaining redundant code related to good/bad folders: 66 | - The `extract_quality` method in corpus.py still deals with good/bad folders 67 | - Remove all code related to copying files to good/bad directories 68 | - Remove references to `good_markdown_dir` since we're using extraction quality markers in parquet 69 | - Update all methods to use the simplified directory structure 70 | 71 | ### 2) Complete Two-Parquet Pipeline Implementation 72 | 73 | **Progress**: We've successfully implemented the first parquet (downloader parquet with extraction quality) but need to consolidate the section-related parquets. 
74 | 75 | - Currently we still have 3 section parquet files that need to be consolidated: 76 | - `sections_for_annotation.parquet` 77 | - `classified_sections.parquet` 78 | - `fully_annotated_sections.parquet` 79 | 80 | - Implementation tasks: 81 | - Consolidate the 3 section-related parquet files into a single sections parquet 82 | - Update all methods to work with the consolidated parquet structure 83 | - Ensure all metadata columns are preserved during consolidation 84 | - Add metadata column "processing_stage" to track progress through pipeline 85 | - Update the verification method to check for required columns rather than specific filenames 86 | - Throw clear error messages when required columns are missing 87 | 88 | ### 3) Make Split_Bad an Explicit Pipeline Step 89 | 90 | - Extract the split_bad functionality from internal GlossExtract methods 91 | - Create a dedicated method in Corpus class 92 | - Make it explicitly update extraction quality in the downloader parquet 93 | - Update the processing_stage column to include extraction as a completed stage 94 | 95 | ### 4) Remove All Fallback Options 96 | 97 | - **Critical**: Remove any remaining code that silently falls back to processing all files: 98 | - Some of these fallbacks have been removed, but others may still exist 99 | - Remove any code that ignores extraction quality filter failures 100 | - Flag fallbacks as explicit errors rather than silent recovery 101 | - Ensure section() and other methods require good quality files and don't have hidden fallbacks 102 | 103 | ### 5) Add More Robust Error Messages 104 | 105 | - Add clear error messages when filtering operations fail instead of using defaults 106 | - For example: "No good quality files found. Pipeline stopped." instead of using all files 107 | - Document all pipeline decision points in code comments 108 | - Specify where the pipeline can branch and under what conditions 109 | - Explain the rationale for each decision point 110 | 111 | ### 6) Testing and Documentation 112 | 113 | - Test the refactored pipeline using the examples in /mnt/data/eu_test 114 | - Ensure the extraction_test_bad_file.py script correctly filters bad files 115 | - Add detailed logging for all pipeline stages 116 | - Document the new two-parquet approach in comments and docstrings 117 | - Update the parquet schema documentation to reflect the new approach 118 | -------------------------------------------------------------------------------- /pipeline/scripts/concurrent_downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Concurrent Downloader 5 | 6 | A versatile concurrent downloader that uses asyncio and aiohttp to efficiently download 7 | files from URLs. It accepts parquet files with URLs and metadata columns, downloads the files 8 | concurrently, and creates unique filenames with a structured naming pattern. 9 | 10 | Features: 11 | - Parquet file integration for metadata handling 12 | - Unique filename generation with the pattern paper_AAA000, paper_AAA001, etc. 
13 | - Configurable concurrency 14 | - Retry mechanism for failed downloads 15 | - Download status tracking 16 | - Works with any file type 17 | """ 18 | 19 | import aiohttp 20 | import asyncio 21 | import os 22 | import argparse 23 | import time 24 | import random 25 | import logging 26 | import re 27 | import string 28 | import aiofiles 29 | import pandas as pd 30 | from urllib.parse import urlparse 31 | from collections import deque 32 | from typing import Dict, List, Tuple, Set, Optional, Any, Iterator 33 | import mimetypes 34 | import string 35 | from tenacity import retry, stop_after_attempt, stop_after_delay, wait_exponential, retry_if_exception_type, retry_if_result, before_sleep_log 36 | import json 37 | 38 | # Configure logging 39 | logging.basicConfig( 40 | level=logging.INFO, 41 | format='%(asctime)s - %(levelname)s - %(message)s', 42 | handlers=[ 43 | logging.StreamHandler(), 44 | logging.FileHandler("concurrent_download.log") 45 | ] 46 | ) 47 | logger = logging.getLogger(__name__) 48 | 49 | # Configure tenacity logger 50 | tenacity_logger = logging.getLogger('tenacity') 51 | tenacity_logger.setLevel(logging.INFO) 52 | 53 | # Add specific loggers for libraries that can be noisy 54 | logging.getLogger('aiohttp').setLevel(logging.WARNING) 55 | logging.getLogger('asyncio').setLevel(logging.WARNING) 56 | 57 | # Rate limiter class for API limits 58 | class RateLimiter: 59 | """Rate limiter to enforce API rate limits""" 60 | 61 | def __init__(self, rate_limit: int, time_period: int = 60): 62 | """ 63 | Initialize rate limiter 64 | 65 | Args: 66 | rate_limit: Maximum number of requests allowed in time_period 67 | time_period: Time period in seconds (default: 60 seconds = 1 minute) 68 | """ 69 | self.rate_limit = rate_limit 70 | self.time_period = time_period 71 | self.request_timestamps = deque(maxlen=rate_limit) 72 | self.lock = asyncio.Lock() 73 | 74 | async def acquire(self): 75 | """ 76 | Acquire permission to make a request, waiting if necessary 77 | """ 78 | async with self.lock: 79 | current_time = time.time() 80 | 81 | # If we haven't reached the limit yet, allow immediately 82 | if len(self.request_timestamps) < self.rate_limit: 83 | self.request_timestamps.append(current_time) 84 | return 85 | 86 | # Check if the oldest request is outside the time window 87 | elapsed = current_time - self.request_timestamps[0] 88 | if elapsed < self.time_period: 89 | # We need to wait until a slot is available 90 | wait_time = self.time_period - elapsed 91 | logger.debug(f"Rate limit reached. Waiting {wait_time:.2f} seconds") 92 | # Release the lock while waiting 93 | await asyncio.sleep(wait_time) 94 | # Reacquire and check again (recursive call) 95 | await self.acquire() 96 | else: 97 | # We can make a request now 98 | self.request_timestamps.popleft() # Remove oldest 99 | self.request_timestamps.append(current_time) 100 | 101 | # Constants for filename generation 102 | LETTERS = string.ascii_uppercase 103 | DIGITS = string.digits 104 | 105 | 106 | def generate_filename(index: int, file_ext: str = None) -> str: 107 | """ 108 | Generate a filename in the format AAA_000, AAA_001, etc. 109 | 110 | Args: 111 | index: Sequential number to convert to the AAA_000 format 112 | file_ext: Optional file extension (with dot) 113 | 114 | Returns: 115 | str: Unique filename 116 | """ 117 | # Calculate letter part (AAA, AAB, etc.) 
118 | letter_base = ord('A') # ASCII code for 'A' 119 | first_letter = chr(letter_base + (index // (26*26)) % 26) 120 | second_letter = chr(letter_base + (index // 26) % 26) 121 | third_letter = chr(letter_base + index % 26) 122 | 123 | # Calculate number part (000, 001, etc.) 124 | number_part = f"{(index % 1000):03d}" 125 | 126 | letters = f"{first_letter}{second_letter}{third_letter}" 127 | digits = number_part 128 | 129 | if file_ext: 130 | return f"{letters}_{digits}.{file_ext}" 131 | else: 132 | return f"{letters}_{digits}" 133 | 134 | 135 | def get_file_extension_from_url(url: str) -> str: 136 | """ 137 | Extract file extension from URL or guess based on content type 138 | 139 | Args: 140 | url: URL to extract extension from 141 | 142 | Returns: 143 | str: File extension (without dot) 144 | """ 145 | # First try to get extension from URL path 146 | path = urlparse(url).path 147 | ext = os.path.splitext(path)[1].lower() 148 | 149 | if ext and ext.startswith('.'): 150 | return ext[1:] # Remove the leading dot 151 | 152 | # If no extension found, return a default 153 | return "bin" 154 | 155 | 156 | def get_mime_type(url: str) -> str: 157 | """ 158 | Get MIME type from URL 159 | 160 | Args: 161 | url: URL to get MIME type for 162 | 163 | Returns: 164 | str: MIME type 165 | """ 166 | mime_type, _ = mimetypes.guess_type(url) 167 | return mime_type if mime_type else "application/octet-stream" 168 | 169 | 170 | async def get_base_url(url: str) -> str: 171 | """ 172 | Extract base URL from a full URL 173 | 174 | Args: 175 | url: Full URL 176 | 177 | Returns: 178 | str: Base URL (scheme + netloc) 179 | """ 180 | if not url.startswith(("http://", "https://")): 181 | url = f"https://{url}" 182 | parsed_url = urlparse(url) 183 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" 184 | return base_url 185 | 186 | 187 | async def setup_session(session: aiohttp.ClientSession, url: str, headers: Dict[str, str]) -> Dict[str, str]: 188 | """ 189 | Initialize the session with base headers 190 | 191 | Args: 192 | session: aiohttp ClientSession 193 | url: URL to access 194 | headers: Headers to use 195 | 196 | Returns: 197 | Dict[str, str]: Updated headers 198 | """ 199 | base_url = await get_base_url(url) 200 | initial_url = base_url 201 | try: 202 | async with session.get(initial_url, headers=headers, timeout=10) as response: 203 | await response.text() 204 | except Exception as e: 205 | logger.warning(f"Failed to setup session for {base_url}: {e}") 206 | return headers 207 | 208 | 209 | async def write_file(filename: str, content: bytes, output_path: str = "./") -> str: 210 | """ 211 | Write downloaded content to a file 212 | 213 | Args: 214 | filename: Name of the file 215 | content: Binary content to write 216 | output_path: Directory to write to 217 | 218 | Returns: 219 | str: Path to the written file 220 | """ 221 | path_to_file = os.path.join(output_path, filename) 222 | async with aiofiles.open(path_to_file, 'wb') as file: 223 | await file.write(content) 224 | return path_to_file 225 | 226 | 227 | def user_agent_generator() -> Iterator[str]: 228 | """ 229 | Generate random user-agents to avoid bot detection 230 | 231 | Yields: 232 | str: Random user agent string 233 | """ 234 | templates = [ 235 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36", 236 | "Mozilla/5.0 ({os}) Gecko/20100101 {browser}/{version}", 237 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{version} Safari/537.36 {browser}/{alt_version}" 238 | ] 239 | 
operating_systems = [ 240 | "Windows NT 10.0; Win64; x64", 241 | "Macintosh; Intel Mac OS X 10_15_7", 242 | "X11; Linux x86_64", 243 | "Windows NT 6.1; Win64; x64", 244 | "Android 12; Mobile" 245 | ] 246 | browsers = [ 247 | ("Chrome", random.randint(90, 110), "Chrome"), 248 | ("Firefox", random.randint(90, 110), "Firefox"), 249 | ("Edge", random.randint(90, 110), "Edg"), 250 | ("Safari", random.randint(600, 610), "Safari") 251 | ] 252 | while True: 253 | template = random.choice(templates) 254 | os_name = random.choice(operating_systems) 255 | browser, version, alt_browser = random.choice(browsers) 256 | full_version = f"{version}.0.{random.randint(1000, 9999)}" 257 | alt_version = f"{random.randint(90, 110)}.0.{random.randint(1000, 9999)}" 258 | user_agent = template.format(os=os_name, browser=browser, version=full_version, alt_browser=alt_browser, alt_version=alt_version) 259 | yield user_agent 260 | 261 | 262 | @retry(stop=(stop_after_attempt(3) | stop_after_delay(30)), 263 | wait=wait_exponential(multiplier=1, min=2, max=10), 264 | retry=retry_if_exception_type((aiohttp.ClientError, asyncio.TimeoutError)), 265 | reraise=True, 266 | before_sleep=before_sleep_log(tenacity_logger, logging.INFO)) 267 | async def make_request(session, requester, url, headers, timeout): 268 | """Make a request with tenacity retry logic""" 269 | async with requester( 270 | url, 271 | headers=headers, 272 | allow_redirects=True, 273 | max_redirects=10, 274 | verify_ssl=False, 275 | timeout=timeout 276 | ) as response: 277 | content = None 278 | if response.status == 200: 279 | content = await response.read() 280 | return response.status, content 281 | 282 | async def download_file(row_index: int, url: str, semaphore: asyncio.Semaphore, 283 | args: argparse.Namespace, user_agent: str, rate_limiter: RateLimiter, 284 | retry_count: int = 0) -> Tuple[bool, str, str, int]: 285 | """ 286 | Download a file from a URL 287 | 288 | Args: 289 | row_index: Index in the dataframe 290 | url: URL to download 291 | semaphore: Semaphore for concurrency control 292 | args: Command-line arguments 293 | user_agent: User agent to use 294 | retry_count: Current retry count 295 | 296 | Returns: 297 | Tuple[bool, str, str, int]: (success, filename, error_message, retry_count) 298 | """ 299 | # Skip empty URLs 300 | if pd.isna(url) or not url: 301 | return (False, "", "Empty URL", retry_count + 1) 302 | 303 | # Get base URL for referer 304 | base_url = await get_base_url(url) 305 | parsed_url = urlparse(url) 306 | domain = parsed_url.netloc 307 | 308 | # Ensure URL has scheme 309 | if not url.startswith(("http://", "https://")): 310 | url = f"https://{url}" 311 | 312 | # Get file extension from URL 313 | file_ext = get_file_extension_from_url(url) 314 | 315 | # Generate unique filename 316 | filename = generate_filename(row_index, file_ext) 317 | 318 | # Enhanced headers with common browser-like attributes to bypass 403 errors 319 | headers = { 320 | 'User-Agent': user_agent, 321 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 322 | 'Accept-Language': 'en-US,en;q=0.5', 323 | 'Accept-Encoding': 'gzip, deflate, br', 324 | 'Connection': 'keep-alive', 325 | 'Upgrade-Insecure-Requests': '1', 326 | 'Sec-Fetch-Dest': 'document', 327 | 'Sec-Fetch-Mode': 'navigate', 328 | 'Sec-Fetch-Site': 'cross-site', 329 | 'Pragma': 'no-cache', 330 | 'Cache-Control': 'no-cache', 331 | 'TE': 'trailers', 332 | 'Referer': f"https://www.google.com/search?q={domain}", 333 | 'Origin': base_url, 334 | 'DNT': 
'1' 335 | } 336 | 337 | # Add cookie handling if needed for specific domains 338 | cookies = {} 339 | if 'europarl.europa.eu' in url or 'data.europarl.europa.eu' in url: 340 | cookies = { 341 | 'cookie_consent': 'accepted', 342 | 'ec_cookiepopin': 'NjY1ODJjNDg5NDc1ODlkNzYwZDA0OTU5NzJkYWI2ZTc', 343 | 'JSESSIONID': f"session-id-{random.randint(100000000, 999999999)}", 344 | 'loadedEP': 'true', 345 | 'GUEST_LANGUAGE_ID': 'en_US' 346 | } 347 | 348 | async with semaphore: 349 | # Implement exponential backoff 350 | sleep_time = args.sleep * (2 ** retry_count) 351 | await asyncio.sleep(random.uniform(sleep_time, sleep_time * 1.5)) 352 | 353 | # Set up timeout with exponential backoff 354 | timeout = aiohttp.ClientTimeout(total=60 + (retry_count * 15)) 355 | 356 | try: 357 | # Acquire permission from rate limiter before making request 358 | await rate_limiter.acquire() 359 | 360 | # Create session with proper connection pooling 361 | conn = aiohttp.TCPConnector( 362 | ssl=False, 363 | limit_per_host=2, # Limit concurrent connections per host 364 | force_close=False, # Keep connections open for reuse 365 | enable_cleanup_closed=True 366 | ) 367 | 368 | async with aiohttp.ClientSession( 369 | connector=conn, 370 | timeout=timeout, 371 | trust_env=True, # Use environment for proxy information 372 | cookies=cookies # Use our cookies 373 | ) as session: 374 | # Try to access the base domain first to establish cookies 375 | if retry_count == 0: # Only do this on first attempt 376 | try: 377 | # Get permission from rate limiter for the base URL request 378 | await rate_limiter.acquire() 379 | 380 | async with session.get( 381 | base_url, 382 | headers=headers, 383 | allow_redirects=True, 384 | timeout=aiohttp.ClientTimeout(total=15) 385 | ) as response: 386 | await response.read() 387 | await asyncio.sleep(random.uniform(1.0, 2.0)) 388 | except Exception as e: 389 | logger.debug(f"Initial base URL visit failed: {str(e)}") 390 | 391 | # Determine request method (get or post) 392 | request_method = args.request_method.lower() 393 | requester = getattr(session, request_method) 394 | 395 | # Attempt the download with tenacity-powered retry logic 396 | try: 397 | # Use tenacity retry wrapper for the actual request 398 | status, content = await asyncio.wait_for( 399 | make_request(session, requester, url, headers, timeout), 400 | timeout=args.request_timeout # Overall timeout for the whole operation 401 | ) 402 | 403 | if status == 200 and content: 404 | await write_file(filename, content, args.output_dir) 405 | logger.info(f"Successfully downloaded {filename} from {url}") 406 | return (True, filename, "", retry_count) 407 | elif status in [403, 429]: 408 | # Special handling for 403/429 (Forbidden/Too Many Requests) 409 | await asyncio.sleep(random.uniform(3.0, 5.0)) # Longer wait 410 | logger.warning(f"Received {status} for {url}") 411 | error_msg = f"HTTP {status}" 412 | return (False, filename, error_msg, retry_count + 1) 413 | else: 414 | error_msg = f"HTTP {status}" 415 | logger.error(f"Failed to download {url}: {error_msg}") 416 | return (False, filename, error_msg, retry_count + 1) 417 | 418 | except asyncio.TimeoutError: 419 | logger.error(f"Overall timeout exceeded for {url}") 420 | return (False, filename, "Request timed out", retry_count + 1) 421 | except Exception as e: 422 | logger.error(f"Error downloading {url}: {str(e)}") 423 | return (False, filename, f"Download error: {str(e)}", retry_count + 1) 424 | 425 | except aiohttp.ClientError as e: 426 | error_msg = f"Client error: {str(e)}" 427 | 
logger.error(f"ClientError while downloading {url}: {error_msg}") 428 | except asyncio.TimeoutError: 429 | error_msg = "Timeout error" 430 | logger.error(f"Timeout while downloading {url}") 431 | except Exception as e: 432 | error_msg = f"Unexpected error: {str(e)}" 433 | logger.error(f"Error while downloading {url}: {error_msg}") 434 | 435 | return (False, filename, error_msg, retry_count + 1) 436 | 437 | 438 | async def download_files(df: pd.DataFrame, url_column: str, semaphore: asyncio.Semaphore, 439 | args: argparse.Namespace, rate_limiter: RateLimiter, 440 | max_retries: int = 3) -> pd.DataFrame: 441 | """ 442 | Download files from URLs in a DataFrame using internal batching for memory efficiency 443 | 444 | Args: 445 | df: DataFrame with URLs 446 | url_column: Name of the column containing URLs 447 | semaphore: Semaphore for concurrency control 448 | args: Command-line arguments 449 | max_retries: Maximum number of retries per URL 450 | 451 | Returns: 452 | pd.DataFrame: Updated DataFrame with download results 453 | """ 454 | # Add columns for filenames and download status if they don't exist 455 | if 'filename' not in df.columns: 456 | df['filename'] = None 457 | if 'download_success' not in df.columns: 458 | df['download_success'] = False 459 | if 'error_message' not in df.columns: 460 | df['error_message'] = "" 461 | 462 | # Create a user agent generator 463 | user_agent_gen = user_agent_generator() 464 | 465 | # Calculate output parquet path (needed for periodic saves) 466 | output_parquet = os.path.join(args.output_dir, os.path.basename(args.input_parquet)) 467 | if args.output_parquet: 468 | output_parquet = args.output_parquet 469 | 470 | # Get total number of unprocessed rows 471 | unprocessed_mask = pd.isna(df['download_success']) | ~df['download_success'] 472 | unprocessed_indices = df[unprocessed_mask].index.tolist() 473 | total_unprocessed = len(unprocessed_indices) 474 | 475 | logger.info(f"Found {total_unprocessed} unprocessed rows out of {len(df)} total") 476 | 477 | internal_batch_size = args.internal_batch_size 478 | successful_downloads = 0 479 | periodic_save_count = args.save_every 480 | 481 | # Process in batches to save memory 482 | for batch_start in range(0, total_unprocessed, internal_batch_size): 483 | batch_end = min(batch_start + internal_batch_size, total_unprocessed) 484 | current_batch_indices = unprocessed_indices[batch_start:batch_end] 485 | 486 | logger.info(f"Processing batch {batch_start//internal_batch_size + 1} of {(total_unprocessed + internal_batch_size - 1)//internal_batch_size}: rows {batch_start} to {batch_end-1}") 487 | 488 | # Create tasks for current batch 489 | tasks = [] 490 | for row_idx in current_batch_indices: 491 | url = df.at[row_idx, url_column] 492 | # Get the retry count from the dataframe if it exists 493 | retry_count = int(df.at[row_idx, 'retry_count']) if 'retry_count' in df.columns and pd.notna(df.at[row_idx, 'retry_count']) else 0 494 | 495 | # Skip URLs that have failed too many times 496 | if args.skip_failed_after > 0 and retry_count >= args.skip_failed_after: 497 | logger.info(f"Skipping URL at row {row_idx} - too many failures: {retry_count}") 498 | continue 499 | 500 | if pd.notna(url): 501 | task = asyncio.create_task( 502 | download_file( 503 | row_idx, url, semaphore, args, 504 | next(user_agent_gen), rate_limiter, retry_count 505 | ) 506 | ) 507 | tasks.append((row_idx, task)) 508 | 509 | # Process tasks in current batch 510 | for row_idx, task in tasks: 511 | try: 512 | success, filename, error_msg, 
new_retry_count = await task 513 | df.at[row_idx, 'filename'] = filename 514 | df.at[row_idx, 'download_success'] = success 515 | df.at[row_idx, 'error_message'] = error_msg 516 | df.at[row_idx, 'retry_count'] = new_retry_count 517 | 518 | # Count successful downloads and save periodically 519 | if success: 520 | successful_downloads += 1 521 | if successful_downloads % periodic_save_count == 0: 522 | logger.info(f"Periodic save: Completed {successful_downloads} downloads. Saving progress to {output_parquet}") 523 | df.to_parquet(output_parquet, index=False) 524 | 525 | except Exception as e: 526 | logger.error(f"Error processing task for row {row_idx}: {e}") 527 | df.at[row_idx, 'download_success'] = False 528 | df.at[row_idx, 'error_message'] = f"Task error: {str(e)}" 529 | 530 | # Save after each batch 531 | logger.info(f"Batch complete. Saving progress to {output_parquet}") 532 | df.to_parquet(output_parquet, index=False) 533 | 534 | return df 535 | 536 | 537 | async def run(args: argparse.Namespace) -> None: 538 | """ 539 | Main function to run the downloader 540 | 541 | Args: 542 | args: Command-line arguments 543 | """ 544 | # Ensure output directory exists 545 | os.makedirs(args.output_dir, exist_ok=True) 546 | 547 | # Determine output parquet path 548 | output_parquet = os.path.join(args.output_dir, os.path.basename(args.input_parquet)) 549 | if args.output_parquet: 550 | output_parquet = args.output_parquet 551 | 552 | # Check if we're resuming from a previous run 553 | resuming = False 554 | if os.path.exists(output_parquet) and args.resume: 555 | try: 556 | logger.info(f"Found existing output parquet file at {output_parquet}. Attempting to resume.") 557 | df = pd.read_parquet(output_parquet) 558 | resuming = True 559 | 560 | # Count successful downloads for statistics 561 | existing_success_count = df['download_success'].sum() if 'download_success' in df.columns else 0 562 | logger.info(f"Resuming from previous run with {existing_success_count} already completed downloads") 563 | 564 | except Exception as e: 565 | logger.warning(f"Failed to read existing parquet for resuming: {e}. Starting fresh.") 566 | resuming = False 567 | 568 | # If not resuming, read the input parquet 569 | if not resuming: 570 | logger.info(f"Reading input parquet file: {args.input_parquet}") 571 | df = pd.read_parquet(args.input_parquet) 572 | 573 | original_count = len(df) 574 | logger.info(f"Loaded {original_count} rows from parquet file") 575 | 576 | # Check if URL column exists 577 | if args.url_column not in df.columns: 578 | raise ValueError(f"URL column '{args.url_column}' not found in parquet file. 
Available columns: {', '.join(df.columns)}") 579 | 580 | # Create semaphore for concurrency control 581 | semaphore = asyncio.Semaphore(args.concurrency) 582 | 583 | # Create rate limiter (100 requests per minute) 584 | rate_limiter = RateLimiter(args.rate_limit, args.rate_period) 585 | logger.info(f"Using rate limit of {args.rate_limit} requests per {args.rate_period} seconds") 586 | 587 | # Process downloads 588 | logger.info(f"Starting downloads with concurrency: {args.concurrency}") 589 | updated_df = await download_files(df, args.url_column, semaphore, args, rate_limiter, args.max_retries) 590 | 591 | # Save updated DataFrame to parquet 592 | logger.info(f"Saving updated parquet file to: {output_parquet}") 593 | updated_df.to_parquet(output_parquet, index=False) 594 | 595 | # Report statistics 596 | success_count = updated_df['download_success'].sum() if 'download_success' in updated_df.columns else 0 597 | logger.info(f"Download summary:") 598 | logger.info(f" Total URLs: {original_count}") 599 | logger.info(f" Successfully downloaded: {success_count}") 600 | logger.info(f" Failed: {original_count - success_count}") 601 | logger.info(f"Updated parquet file saved to: {output_parquet}") 602 | 603 | 604 | def parse_args() -> argparse.Namespace: 605 | """ 606 | Parse command-line arguments 607 | 608 | Returns: 609 | argparse.Namespace: Parsed arguments 610 | """ 611 | parser = argparse.ArgumentParser( 612 | description="Concurrent downloader for files from a parquet file", 613 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 614 | ) 615 | 616 | parser.add_argument('--input-parquet', required=True, 617 | help='Path to the input parquet file') 618 | parser.add_argument('--url-column', required=True, 619 | help='Column name containing URLs in the parquet file') 620 | parser.add_argument('--output-dir', default='./downloads', 621 | help='Directory to save downloaded files') 622 | parser.add_argument('--output-parquet', 623 | help='Path to save the updated parquet file') 624 | parser.add_argument('--internal-batch-size', type=int, default=100, 625 | help='Number of files to process in one internal batch (for memory efficiency)') 626 | parser.add_argument('--save-every', type=int, default=50, 627 | help='Save progress to parquet file every N successful downloads') 628 | parser.add_argument('--concurrency', type=int, default=5, 629 | help='Number of concurrent downloads') 630 | parser.add_argument('--max-retries', type=int, default=3, 631 | help='Maximum retry attempts for failed downloads') 632 | parser.add_argument('--sleep', type=float, default=0.5, 633 | help='Base sleep time between requests in seconds') 634 | parser.add_argument('--request-method', choices=['get', 'post'], default='get', 635 | help='HTTP request method to use') 636 | parser.add_argument('--resume', action='store_true', 637 | help='Resume downloading from a previously saved checkpoint') 638 | parser.add_argument('--debug', action='store_true', 639 | help='Enable debug logging') 640 | parser.add_argument('--retry-interval', type=float, default=5.0, 641 | help='Time to wait between retries for 403/429 errors (seconds)') 642 | parser.add_argument('--rate-limit', type=int, default=100, 643 | help='Maximum number of requests per time period (rate limiting)') 644 | parser.add_argument('--rate-period', type=int, default=60, 645 | help='Time period in seconds for rate limiting') 646 | parser.add_argument('--request-timeout', type=int, default=45, 647 | help='Overall timeout in seconds for each request') 648 | 
parser.add_argument('--skip-failed-after', type=int, default=3, 649 | help='Skip URLs that failed more than this many times') 650 | 651 | return parser.parse_args() 652 | 653 | 654 | async def main() -> None: 655 | """ 656 | Main entry point 657 | """ 658 | args = parse_args() 659 | try: 660 | await run(args) 661 | except Exception as e: 662 | logger.error(f"Error in main: {e}") 663 | raise 664 | 665 | 666 | if __name__ == "__main__": 667 | try: 668 | asyncio.run(main()) 669 | except KeyboardInterrupt: 670 | logger.info("Process interrupted by user") 671 | except Exception as e: 672 | logger.error(f"Unhandled exception: {e}") 673 | -------------------------------------------------------------------------------- /pipeline/scripts/sample_for_training.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Script to sample data from the kallipos processed data. 4 | 5 | This script performs the following: 6 | 1. Creates 200 samples from 'Κεφάλαιο' document type, split into 2 parts 7 | 2. Creates 200 samples from all document types except 'Κεφάλαιο', split into 2 parts 8 | 3. Converts all samples to text format for analysis 9 | """ 10 | 11 | import os 12 | import logging 13 | from pathlib import Path 14 | from sampler import Sampler 15 | 16 | # Set up logging 17 | logging.basicConfig( 18 | level=logging.INFO, 19 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 20 | ) 21 | logger = logging.getLogger(__name__) 22 | 23 | # Base directory with processed data 24 | WORKING_DIR = "downloads/" 25 | 26 | def main(): 27 | logger.info("Creating sampler instance...") 28 | sampler = Sampler(WORKING_DIR) 29 | 30 | # Sample from Κεφάλαιο (200 samples in 2 parts) 31 | logger.info("Sampling from Κεφάλαιο document type...") 32 | kefalaia_parts = sampler.sample( 33 | sample_from={'document_type': 'Κεφάλαιο','header' : 'regex(Βλάχοι)'}, 34 | n=5, 35 | parts=2, 36 | output_name="kefalaia_samples" 37 | ) 38 | 39 | # Sample from everything except Κεφάλαιο (200 samples in 2 parts) 40 | logger.info("Sampling from all document types except Κεφάλαιο...") 41 | non_kefalaia_parts = sampler.sample( 42 | sample_from_all_except={'document_type': 'Κεφάλαιο','header' : 'regex(Ανάλυση)'}, 43 | n=2, 44 | parts=2, 45 | output_name="non_kefalaia_samples" 46 | ) 47 | 48 | # Convert each part to text with custom folder names 49 | logger.info("Converting kefalaia part 1 to text...") 50 | sampler.to_text(kefalaia_parts[0], folder_name="kefalaia_chapter_1") 51 | 52 | logger.info("Converting kefalaia part 2 to text...") 53 | sampler.to_text(kefalaia_parts[1], folder_name="kefalaia_chapter_2") 54 | 55 | logger.info("Converting non-kefalaia part 1 to text...") 56 | sampler.to_text(non_kefalaia_parts[0], folder_name="non_kefalaia_1") 57 | 58 | logger.info("Converting non-kefalaia part 2 to text...") 59 | sampler.to_text(non_kefalaia_parts[1], folder_name="non_kefalaia_2") 60 | 61 | # Print summary of samples 62 | logger.info("\nSampling summary:") 63 | logger.info(f"Kefalaia part 1: {len(kefalaia_parts[0])} rows from {len(kefalaia_parts[0]['filename'].unique())} unique files") 64 | logger.info(f"Kefalaia part 2: {len(kefalaia_parts[1])} rows from {len(kefalaia_parts[1]['filename'].unique())} unique files") 65 | logger.info(f"Non-kefalaia part 1: {len(non_kefalaia_parts[0])} rows from {len(non_kefalaia_parts[0]['filename'].unique())} unique files") 66 | logger.info(f"Non-kefalaia part 2: {len(non_kefalaia_parts[1])} rows from 
{len(non_kefalaia_parts[1]['filename'].unique())} unique files") 67 | 68 | # Print output information 69 | logger.info("\nOutput locations:") 70 | logger.info(f"CSV files: {sampler.datasets_dir}") 71 | logger.info(f"Text files: {sampler.text_dir}") 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /pipeline/scripts/test_section_reconstruction.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import json 4 | import pandas as pd 5 | 6 | # Adjust path to import from the parent directory's src 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'src'))) 8 | 9 | from glossapi.gloss_section import GlossSection, Section 10 | 11 | def reconstruct_section(processed_content: list) -> str: 12 | """ 13 | Reconstructs the original raw text from the processed section content. 14 | 15 | Args: 16 | processed_content: The list of dictionaries representing the section's 17 | categorized content (e.g., [{'text': '...'}, {'list': '...'}]). 18 | 19 | Returns: 20 | The reconstructed raw text as a single string. 21 | """ 22 | reconstructed_lines = [] 23 | for item in processed_content: 24 | # The structure is {type: value} 25 | content_type, content_value = list(item.items())[0] 26 | # The value itself contains the original line breaks 27 | reconstructed_lines.append(content_value) 28 | 29 | # Join the content blocks with newlines to form the full raw text 30 | return "\n".join(reconstructed_lines) 31 | 32 | 33 | 34 | def test_reconstruction_from_parquet(parquet_path: str = "/mnt/data/pipeline_refactor/output/sections/sections_for_annotation.parquet", test_all: bool = True): 35 | """ 36 | Tests reconstruction by reading data from the pipeline's output Parquet file. 37 | 38 | Args: 39 | parquet_path: Path to the sections Parquet file. 40 | sample_size: Number of sections to randomly sample and test. 41 | """ 42 | print(f"\n--- Running Reconstruction Test from Parquet ({parquet_path}) ---") 43 | 44 | if not os.path.exists(parquet_path): 45 | print(f"❌ ERROR: Parquet file not found at {parquet_path}") 46 | return False 47 | 48 | try: 49 | df = pd.read_parquet(parquet_path) 50 | print(f"Loaded {len(df)} sections from Parquet.") 51 | except Exception as e: 52 | print(f"❌ ERROR: Failed to load Parquet file: {e}") 53 | return False 54 | 55 | if len(df) == 0: 56 | print("⚠️ WARN: Parquet file is empty. 
No sections to test.") 57 | return True # Technically passed as no failures 58 | 59 | # Test all sections 60 | sample_df = df 61 | print(f"Testing reconstruction for all {len(sample_df)} sections...") 62 | 63 | all_passed = True 64 | failures = [] 65 | 66 | for index, row in sample_df.iterrows(): 67 | raw_content = row['section'] # This column contains the raw text 68 | section_json_str = row['json_section'] # This column contains the JSON representation 69 | filename = row['filename'] 70 | header = row['header'] 71 | 72 | try: 73 | processed_content = json.loads(section_json_str) 74 | except json.JSONDecodeError as e: 75 | print(f"❌ FAILED: Section {index} (File: {filename}, Header: '{header}') - Invalid JSON: {e}") 76 | failures.append(f"Index {index} (File: {filename}, Header: '{header}') - JSON Decode Error") 77 | all_passed = False 78 | continue 79 | 80 | reconstructed_text = reconstruct_section(processed_content) 81 | 82 | if raw_content != reconstructed_text: 83 | all_passed = False 84 | failures.append(f"Index {index} (File: {filename}, Header: '{header}') - Content Mismatch") 85 | print(f"❌ FAILED: Section {index} (File: {filename}, Header: '{header}') - Mismatch detected!") 86 | # You could add detailed diff printing here if needed for debugging 87 | # print(f" Original:\n```\n{raw_content}\n```") 88 | # print(f" Reconstructed:\n```\n{reconstructed_text}\n```") 89 | # else: 90 | # Optional: Print pass messages for verbosity 91 | # print(f"✅ PASSED: Section {index} (File: {filename}, Header: '{header}')") 92 | 93 | print("\n--- Parquet Test Summary ---") 94 | if all_passed: 95 | print(f"✅ All {len(sample_df)} sampled sections reconstructed successfully from Parquet!") 96 | else: 97 | print(f"❌ Reconstruction failed for {len(failures)}/{len(sample_df)} sampled sections:") 98 | for failure in failures: 99 | print(f" - {failure}") 100 | 101 | return all_passed 102 | 103 | if __name__ == "__main__": 104 | # Run the test using the real Parquet data 105 | test_passed = test_reconstruction_from_parquet() 106 | 107 | print("\n--- Overall Test Results ---") 108 | if test_passed: 109 | print("✅✅ All sections reconstructed successfully! ✅✅") 110 | sys.exit(0) 111 | else: 112 | print("❌❌ Some sections failed reconstruction. ❌❌") 113 | sys.exit(1) 114 | -------------------------------------------------------------------------------- /pipeline/src/glossapi/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | GlossAPI Library 3 | 4 | A library for processing academic texts in Greek and other languages: 5 | - Extracting content from PDFs and other formats with Docling 6 | - Robust batch processing with error isolation and automatic resumption 7 | - Clustering documents based on extraction quality 8 | - Extracting and cleaning academic sections 9 | - Classifying sections using machine learning 10 | 11 | This is an open source project that provides tools for linguistic annotations 12 | and text processing, with a special focus on the Greek language. 
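As an illustrative sketch (not part of the original docstring), the public classes listed in __all__ below can be imported directly from the package:

    from glossapi import Corpus, GlossDownloader, GlossSection, Sampler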
13 | """ 14 | 15 | from .gloss_extract import GlossExtract 16 | from .gloss_section_classifier import GlossSectionClassifier 17 | from .corpus import Corpus 18 | from .sampler import Sampler 19 | from .gloss_section import Section, GlossSection 20 | from .gloss_downloader import GlossDownloader 21 | 22 | __all__ = [ 23 | 'GlossExtract', 24 | 'GlossSection', 25 | 'GlossSectionClassifier', 26 | 'Corpus', 27 | 'Sampler', 28 | 'Section', 29 | 'NewGlossSection', 30 | 'GlossDownloader' 31 | ] 32 | 33 | __version__ = '0.0.10' 34 | -------------------------------------------------------------------------------- /pipeline/src/glossapi/gloss_section.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | from typing import List, Tuple, Dict, Any 5 | import pandas as pd 6 | import pyarrow as pa 7 | import pyarrow.parquet as pq 8 | 9 | 10 | class Section: 11 | """ 12 | A data structure representing a section in an academic document. 13 | 14 | Attributes: 15 | title (str): The section title 16 | start_line (int): The starting line number in the original document 17 | end_line (int): The ending line number in the original document 18 | content (List[Dict]): List of content elements. Each element is a dict with one of these keys: 19 | - 'text': Regular text content including empty lines 20 | - 'table': Table content in markdown format 21 | - 'list': List items with their continuation lines 22 | - 'other': Standalone references, image placeholders, etc. 23 | raw_content (str): Raw text content of the section (unprocessed) 24 | has_table (bool): Flag indicating if section contains tables 25 | has_list (bool): Flag indicating if section contains lists 26 | has_text (bool): Flag indicating if section contains regular text 27 | has_other (bool): Flag indicating if section contains other content (refs, images, etc) 28 | """ 29 | def __init__(self, title: str = "", start_line: int = 0): 30 | self.title = title 31 | self.start_line = start_line 32 | self.end_line = start_line 33 | self.content = [] 34 | self.raw_content = "" 35 | self.has_table = False 36 | self.has_list = False 37 | self.has_other = False 38 | self.has_text = False 39 | 40 | def add_content(self, content_type: str, content_value: str): 41 | """Add a content element to this section""" 42 | # Create a dictionary with the content type as the key 43 | content_dict = {content_type: content_value} 44 | self.content.append(content_dict) 45 | 46 | # Update flags based on content type 47 | if content_type == "table": 48 | self.has_table = True 49 | elif content_type == "list": 50 | self.has_list = True 51 | elif content_type == "other": 52 | self.has_other = True 53 | elif content_type == "text": 54 | self.has_text = True 55 | 56 | 57 | class GlossSection: 58 | """ 59 | A class for sectioning, processing, and exporting academic document sections to Parquet format. 60 | Handles parsing markdown documents, identifying structural elements like headers, tables, 61 | lists, and footnotes, and processes them for further analysis. 62 | """ 63 | 64 | def _is_list_bullet_line(self, line: str) -> bool: 65 | """ 66 | Check if a line indicates a bullet item. 67 | Examples: 68 | - 1. text 69 | - text 70 | - text 71 | 1. text 72 | etc. 73 | 74 | We'll unify them with a small regex set. 
75 | """ 76 | test = line.strip() 77 | if not test: 78 | return False 79 | 80 | # This pattern matches lines that begin with: 81 | # - optional dash, then optional digits, then optional bullet symbols 82 | # - final check for '.' or ' ' => bullet indicator 83 | # e.g. "- 1. ", "- ", "- ", "1. ", "2." 84 | bullet_pat = re.compile(r''' 85 | ^ 86 | ( 87 | -\s*\d*\.?\s*[\u2022\u2023\u25E6\u00BB\u2023]* # dash + optional digits + period + bullet char 88 | |\d+\.\s+ 89 | |-\s* 90 | ) 91 | .* 92 | ''', re.VERBOSE) 93 | return bool(bullet_pat.match(test)) 94 | 95 | def _looks_like_list_paragraph(self, para: str) -> bool: 96 | """ 97 | Check if a paragraph is marked as a bullet block by our sentinel. 98 | """ 99 | return para.startswith("<<__LIST_ITEM__>>") 100 | 101 | ############################################################################### 102 | # 1) Other Utility Functions 103 | ############################################################################### 104 | def _wrap_text(self, text: str, width: int) -> List[str]: 105 | """Wrap text to a specified width while preserving words.""" 106 | words = text.split() 107 | lines = [] 108 | current_line = [] 109 | current_length = 0 110 | 111 | for word in words: 112 | # +1 for space if not first in line 113 | if current_length + len(word) + (1 if current_line else 0) <= width: 114 | current_line.append(word) 115 | current_length += len(word) + (1 if current_line else 0) 116 | else: 117 | if current_line: 118 | lines.append(" ".join(current_line)) 119 | current_line = [word] 120 | current_length = len(word) 121 | 122 | if current_line: 123 | lines.append(" ".join(current_line)) 124 | 125 | return lines 126 | 127 | def _is_standalone_reference(self, para: str, min_text_length: int = 10) -> bool: 128 | """ 129 | Determine if a paragraph appears to be a standalone reference/footnote marker. 130 | Only very short paragraphs (fewer than a threshold number of characters) that 131 | consist solely of reference-style tokens are flagged. 132 | 133 | Parameters: 134 | - para: The paragraph (as a string) to check. 135 | - min_text_length: A lower bound (in characters) below which the paragraph 136 | is considered too short to be meaningful text. 137 | 138 | Returns: 139 | True if the paragraph appears to be just a reference marker. 140 | """ 141 | trimmed = para.strip() 142 | 143 | # Empty lines should never be considered footnotes 144 | if len(trimmed) == 0: 145 | return False 146 | 147 | # Only match if the entire paragraph exactly equals one of the expected reference tokens. 148 | reference_patterns = [ 149 | #r'^\d+$', # Only digits (e.g., "12") 150 | #r'^\d+[\-–]\d+$', # A simple digit range (e.g., "12-14") 151 | r'^(Ibid|op\.cit\.?|loc\.cit\.?|et\.al\.?|cf\.)$', # Common citation markers 152 | r'^(βλ\.|πρβλ\.|σσ\.|σελ\.|ό\.π\.)$', # Greek shorthand markers 153 | ] 154 | 155 | # Try each pattern; if one matches the entire trimmed paragraph, flag it. 156 | for pattern in reference_patterns: 157 | if re.match(pattern, trimmed, re.IGNORECASE): 158 | return True 159 | 160 | # Otherwise, we do not consider it a standalone reference. 161 | return False 162 | 163 | def _detect_other_lines(self, paragraphs: List[str], max_length: int = 20, min_text_length: int = 20) -> List[Dict]: 164 | """ 165 | Identify short paragraphs that should be categorized as "other" content rather than regular text. 166 | 167 | This function simply categorizes very short lines as "other" and everything else as "text". 
168 | 169 | Parameters: 170 | - paragraphs: List of paragraph strings. 171 | - max_length: Maximum length (in characters) a paragraph can have 172 | to be considered for the "other" category. 173 | - min_text_length: Not currently used. 174 | 175 | Returns: 176 | A list of dictionaries with the content type as key and content as value. 177 | """ 178 | categorized = [] 179 | for para in paragraphs: 180 | trimmed = para.strip() 181 | 182 | # Simply categorize short lines as "other" 183 | if len(trimmed) > 0 and len(trimmed) < max_length: 184 | categorized.append({"other": para}) 185 | else: 186 | # Regular text content or empty lines 187 | categorized.append({"text": para}) 188 | return categorized 189 | 190 | def _should_merge_paragraphs(self, para1: str, para2: str) -> bool: 191 | """ 192 | Decide if para1 and para2 likely form a single continued sentence. 193 | """ 194 | if not para1 or not para2: 195 | return False 196 | 197 | p1_end = para1.rstrip() 198 | p2_start = para2.lstrip() 199 | 200 | # Hyphen or open parenthesis 201 | if p1_end.endswith('-') or p1_end.endswith('('): 202 | return True 203 | 204 | end_char_1 = p1_end[-1] if p1_end else '' 205 | start_char_2 = p2_start[0] if p2_start else '' 206 | 207 | # e.g. ends with lower, next starts with lower => likely a single sentence 208 | if end_char_1.islower() and start_char_2.islower(): 209 | return True 210 | # ends with punctuation and next starts with lower 211 | if end_char_1 in ',:·' and start_char_2.islower(): 212 | return True 213 | # ends with digit, next starts with '°' 214 | if end_char_1.isdigit() and start_char_2 == '°': 215 | return True 216 | 217 | return False 218 | 219 | def _is_table_line(self, line: str) -> bool: 220 | """Check if the line (stripped) starts & ends with '|' => table line.""" 221 | ls = line.strip() 222 | return ls.startswith("|") and ls.endswith("|") if ls else False 223 | 224 | def _looks_like_table_block(self, paragraph: str) -> bool: 225 | """ 226 | If every non-blank line in paragraph starts & ends with '|', treat as a table block. 227 | """ 228 | lines = paragraph.splitlines() 229 | for ln in lines: 230 | ln_str = ln.strip() 231 | if ln_str and (not (ln_str.startswith("|") and ln_str.endswith("|"))): 232 | return False 233 | return True 234 | 235 | def _is_header(self, line: str) -> bool: 236 | """Check if line is a markdown header (#...).""" 237 | return line.strip().startswith('#') 238 | 239 | def _extract_section_level(self, line: str) -> Tuple[int, str]: 240 | """Extract header level and title from a markdown header line.""" 241 | match = re.match(r'^(#+)\s*(.+)$', line.strip()) 242 | if match: 243 | level = len(match.group(1)) # Count the number of # symbols 244 | title = match.group(2) 245 | return level, title 246 | return 0, line 247 | 248 | def _process_sections(self, lines: List[str]) -> List[Section]: 249 | """ 250 | Process text to identify sections based on headers. 251 | Text between two headers becomes a section with the preceding header as title. 252 | This only divides the document into sections based on headers - content 253 | categorization happens in _process_section_content. 254 | 255 | Enhanced to handle: 256 | 1. Documents that start with content before the first header 257 | 2. Documents with no headers at all 258 | 3. 
Content after the last header 259 | 260 | Parameters: 261 | - lines: List of text lines from the document 262 | 263 | Returns: 264 | List of Section objects representing the document structure 265 | """ 266 | sections = [] 267 | current_section = None 268 | n = len(lines) 269 | found_any_headers = False 270 | 271 | # Store raw lines between headers 272 | raw_section_lines = [] 273 | 274 | # Handle case 1: Document starts with content before any header 275 | # Create an initial section if the first line is not a header 276 | if n > 0 and not self._is_header(lines[0].strip()): 277 | # Use first line as title if it's not empty, otherwise use "Document" 278 | first_line = lines[0].strip() if lines[0].strip() else "Document" 279 | current_section = Section(title=first_line, start_line=0) 280 | 281 | i = 0 282 | while i < n: 283 | raw_line = lines[i].rstrip('\n') 284 | 285 | # Markdown heading - start of a new section 286 | if self._is_header(raw_line.strip()): 287 | found_any_headers = True 288 | 289 | # If we had a previous section, finalize it 290 | if current_section is not None: 291 | current_section.end_line = i - 1 292 | 293 | # Store raw section content 294 | current_section.raw_content = "\n".join(raw_section_lines) 295 | raw_section_lines = [] 296 | 297 | sections.append(current_section) 298 | 299 | # Create a new section based on the header 300 | _, title = self._extract_section_level(raw_line) 301 | current_section = Section(title=title, start_line=i) 302 | i += 1 303 | continue 304 | 305 | # Just store the raw line - content processing happens later 306 | if current_section is not None: 307 | raw_section_lines.append(raw_line) 308 | else: 309 | # This should generally not happen since we create an initial section if needed, 310 | # but in case first_line logic changes, keep this safety check 311 | raw_section_lines.append(raw_line) 312 | 313 | i += 1 314 | 315 | # Handle case 2 & 3: Document has no headers or content after the last header 316 | # Finalize the last section if there is one 317 | if current_section: 318 | current_section.end_line = n - 1 319 | current_section.raw_content = "\n".join(raw_section_lines) 320 | sections.append(current_section) 321 | elif raw_section_lines: # Handle case where no section was created but we collected content 322 | first_line = raw_section_lines[0].strip() if raw_section_lines and raw_section_lines[0].strip() else "Document" 323 | default_section = Section(title=first_line, start_line=0) 324 | default_section.end_line = n - 1 325 | default_section.raw_content = "\n".join(raw_section_lines[1:] if len(raw_section_lines) > 1 else raw_section_lines) 326 | sections.append(default_section) 327 | 328 | # Handle case 2: If no headers were found and we have no sections yet, create a default section 329 | if not found_any_headers and not sections and n > 0: 330 | title = lines[0].strip() if lines[0].strip() else "Document" 331 | content = "\n".join(lines[1:] if len(lines) > 1 else lines) 332 | default_section = Section(title=title, start_line=0) 333 | default_section.end_line = n - 1 334 | default_section.raw_content = content 335 | sections.append(default_section) 336 | 337 | return sections 338 | 339 | def _process_section_content(self, sections: List[Section]): 340 | """ 341 | Process the raw content of each section to categorize it into appropriate content types: 342 | 1. Tables: Identified by markdown table formatting (|) 343 | 2. Lists: Identified by bullet points or numbered items 344 | 3. Other: Standalone references, image placeholders, etc. 
345 | 4. Text: All remaining content, including empty lines 346 | 347 | The original structure with line breaks is preserved within each content block. 348 | 349 | Parameters: 350 | - sections: List of Section objects to process 351 | """ 352 | for section in sections: 353 | # Clear existing content and start fresh from raw content 354 | section.content = [] 355 | 356 | # Split raw content into lines for processing 357 | if not section.raw_content: 358 | continue 359 | 360 | raw_lines = section.raw_content.split('\n') 361 | i = 0 362 | n = len(raw_lines) 363 | 364 | # Buffer to collect text content including empty lines 365 | text_buffer = [] 366 | 367 | while i < n: 368 | line = raw_lines[i] 369 | 370 | # 1. Check for tables (lines with | at start and end) 371 | if self._is_table_line(line): 372 | # Flush any text buffer first 373 | if text_buffer: 374 | section.add_content("text", "\n".join(text_buffer)) 375 | text_buffer = [] 376 | 377 | # Collect all table lines 378 | table_lines = [line] 379 | i += 1 380 | while i < n and self._is_table_line(raw_lines[i]): 381 | table_lines.append(raw_lines[i]) 382 | i += 1 383 | # Add as table content 384 | section.add_content("table", "\n".join(table_lines)) 385 | continue 386 | 387 | # 2. Check for list items 388 | elif self._is_list_bullet_line(line): 389 | # Flush any text buffer first 390 | if text_buffer: 391 | section.add_content("text", "\n".join(text_buffer)) 392 | text_buffer = [] 393 | 394 | # Collect the list item and any continuation lines 395 | list_item = [line] 396 | i += 1 397 | while i < n: 398 | next_line = raw_lines[i] 399 | if (not next_line.strip() or 400 | self._is_list_bullet_line(next_line) or 401 | self._is_table_line(next_line) or 402 | self._is_header(next_line)): 403 | break 404 | # Add continuation line preserving its formatting 405 | list_item.append(next_line) 406 | i += 1 407 | # Add as list content, preserving line breaks 408 | section.add_content("list", "\n".join(list_item)) 409 | continue 410 | 411 | # 3. Check for 'other' content (standalone refs, image placeholders, etc) 412 | elif self._detect_other_lines([line])[0].get('other'): 413 | # Flush any text buffer first 414 | if text_buffer: 415 | section.add_content("text", "\n".join(text_buffer)) 416 | text_buffer = [] 417 | 418 | section.add_content("other", line) 419 | i += 1 420 | continue 421 | 422 | # 4. Regular text content and empty lines - add to buffer 423 | else: 424 | # Add to text buffer (preserves empty lines and formatting) 425 | text_buffer.append(line) 426 | i += 1 427 | 428 | # Don't forget to add any remaining text in buffer 429 | if text_buffer: 430 | section.add_content("text", "\n".join(text_buffer)) 431 | 432 | # Update section flags based on content 433 | section.has_table = any("table" in item for item in section.content) 434 | section.has_list = any("list" in item for item in section.content) 435 | section.has_text = any("text" in item for item in section.content) 436 | section.has_other = any("other" in item for item in section.content) 437 | 438 | 439 | 440 | def _format_academic_document(self, text: str, filename: str) -> List[Dict[str, Any]]: 441 | """ 442 | Process a document and format it into structured data for output. 
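Each element of the returned list is a plain dict; an illustrative (hypothetical) row:

    {"id": 0, "filename": "doc_0001", "has_table": False, "has_list": True,
     "has_other": False, "has_text": True, "header": "Εισαγωγή",
     "place": "0.00-0.12", "section": "<raw section text>", "json_section": "[...]"}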
443 | 444 | Parameters: 445 | - text: The text content of the document 446 | - filename: The filename of the document 447 | 448 | Returns: 449 | A list of dictionaries with structured section data for Parquet output 450 | """ 451 | lines = text.splitlines() 452 | 453 | # 1) Identify sections in the document based on markdown headers 454 | sections = self._process_sections(lines) 455 | 456 | # 2) Process section content - categorize each line appropriately 457 | self._process_section_content(sections) 458 | 459 | # 3) Format the data for output 460 | rows = [] 461 | for section in sections: 462 | # Calculate section position as fraction of total document 463 | start_frac = section.start_line / max(1, len(lines)) 464 | end_frac = section.end_line / max(1, len(lines)) 465 | place_str = f"{start_frac:.2f}-{end_frac:.2f}" 466 | 467 | # Create a list of dictionaries for JSON serialization 468 | json_items = [] 469 | for item in section.content: 470 | # Each item is a dict with a single key (content type) and value 471 | content_type = list(item.keys())[0] 472 | content_value = item[content_type] 473 | 474 | # Create proper dictionary object for JSON serialization 475 | json_items.append({content_type: content_value}) 476 | 477 | # Use Python's json module for proper JSON serialization with all escaping handled 478 | json_content = json.dumps(json_items, ensure_ascii=False, indent=2) 479 | 480 | row = { 481 | "id": len(rows), 482 | "filename": filename, 483 | "has_table": section.has_table, 484 | "has_list": section.has_list, 485 | "has_other": section.has_other, 486 | "has_text": section.has_text, 487 | "header": section.title.strip(), 488 | "place": place_str, 489 | "section": section.raw_content, # Store the original unprocessed section text 490 | "json_section": json_content # Store the formatted JSON content 491 | } 492 | rows.append(row) 493 | 494 | return rows 495 | 496 | def to_parquet(self, input_dir, output_dir, filenames_to_process): 497 | """ 498 | Process Markdown files from input_dir and write structured data to a Parquet file. 499 | 500 | Args: 501 | input_dir (str): Directory containing Markdown files to process 502 | output_dir (str): Directory where the output Parquet file will be written 503 | filenames_to_process (list): List of filenames (without extensions) to process. 504 | Only files matching these names will be processed. 505 | This should be a list of base filenames without extensions. 506 | 507 | The output Parquet file will contain structured data about sections from all documents, 508 | including information about tables, lists, footnotes, and regular text. 
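An illustrative call (paths and basenames are hypothetical):

    GlossSection().to_parquet(
        input_dir="output/markdown",
        output_dir="output/sections",
        filenames_to_process=["doc_0001", "doc_0002"],
    )
    # writes output/sections/sections_for_annotation.parquet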
509 | """ 510 | os.makedirs(output_dir, exist_ok=True) 511 | 512 | parquet_path = os.path.join(output_dir, "sections_for_annotation.parquet") 513 | schema = pa.schema([ 514 | pa.field("id", pa.int64()), 515 | pa.field("row_id", pa.string()), 516 | pa.field("filename", pa.string()), 517 | pa.field("has_table", pa.bool_()), 518 | pa.field("has_list", pa.bool_()), 519 | pa.field("has_other", pa.bool_()), 520 | pa.field("has_text", pa.bool_()), 521 | pa.field("header", pa.string()), 522 | pa.field("place", pa.string()), 523 | pa.field("section", pa.string()), # Raw section text 524 | pa.field("json_section", pa.string()), # Formatted JSON content 525 | pa.field("section_length", pa.int64()), # Number of non-empty lines in section 526 | pa.field("section_propo", pa.int64()), # Proportion of document (0-1000) 527 | ]) 528 | 529 | writer = pq.ParquetWriter(parquet_path, schema=schema) 530 | row_counter = 1 # global row id counter 531 | 532 | # Process each Markdown file individually to keep memory usage low. 533 | processed_files_count = 0 534 | skipped_files = [] 535 | print(f"\n===== SECTIONING PHASE =====") 536 | print(f"Input directory: {input_dir}") 537 | print(f"Output directory: {output_dir}") 538 | print(f"Good files list (length {len(filenames_to_process)}): {filenames_to_process}") 539 | print(f"Available files in directory:") 540 | md_files = [f for f in os.listdir(input_dir) if f.endswith(".md")] 541 | for i, md_file in enumerate(md_files): 542 | print(f" {i+1}. {md_file} (basename: {os.path.splitext(md_file)[0]})") 543 | 544 | for filename in os.listdir(input_dir): 545 | if filename.endswith(".md"): 546 | # Get the base name without extension for filtering 547 | base_name = os.path.splitext(filename)[0] 548 | 549 | # Only process files that are in our whitelist 550 | if base_name not in filenames_to_process: 551 | skipped_files.append(base_name) 552 | print(f"⚠️ SKIPPED: {base_name} - not in the good files list") 553 | continue # Skip this file as it's not in our list of good files 554 | 555 | processed_files_count += 1 556 | print(f"✅ PROCESSING: {base_name} - in good files list") 557 | input_path = os.path.join(input_dir, filename) 558 | with open(input_path, 'r', encoding='utf-8') as f: 559 | text = f.read() 560 | 561 | short_name = os.path.splitext(filename)[0] 562 | doc_rows = self._format_academic_document(text, short_name) 563 | 564 | # Calculate section_length for each row (number of non-empty lines) 565 | for row in doc_rows: 566 | section_lines = row.get("section", "").splitlines() 567 | section_length = sum(1 for line in section_lines if line.strip()) 568 | row['section_length'] = section_length 569 | 570 | # Calculate the total article length (sum of all section lengths) 571 | article_length = sum(row.get("section_length", 0) for row in doc_rows) 572 | 573 | # Calculate section_propo for each row (proportion * 1000, rounded) 574 | for row in doc_rows: 575 | if article_length > 0: 576 | section_propo = round((row.get("section_length", 0) / article_length) * 1000) 577 | else: 578 | section_propo = 0 579 | row['section_propo'] = section_propo 580 | 581 | # Add id and row_id to each row 582 | for row in doc_rows: 583 | row['id'] = row_counter 584 | row['row_id'] = f'row_{row_counter}' 585 | row_counter += 1 586 | 587 | if doc_rows: 588 | df = pd.DataFrame(doc_rows) 589 | table = pa.Table.from_pandas(df, schema=schema, preserve_index=False) 590 | writer.write_table(table) 591 | 592 | writer.close() 593 | 594 | # More informative logging 595 | print(f"\nSection processing 
summary:") 596 | print(f" - Good files list contained {len(filenames_to_process)} files: {filenames_to_process}") 597 | print(f" - Found {processed_files_count} markdown files matching good files list") 598 | if skipped_files: 599 | print(f" - Skipped {len(skipped_files)} files that weren't in good list: {skipped_files}") 600 | print(f" - Saved {row_counter - 1} total sections to {parquet_path}") 601 | -------------------------------------------------------------------------------- /pipeline/src/glossapi/models/kmeans_weights.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/pipeline/src/glossapi/models/kmeans_weights.joblib -------------------------------------------------------------------------------- /pipeline/src/glossapi/models/section_classifier.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/pipeline/src/glossapi/models/section_classifier.joblib -------------------------------------------------------------------------------- /pipeline/src/glossapi/parquet_schema.py: -------------------------------------------------------------------------------- 1 | """ 2 | Standardized Parquet Schema definitions for GlossAPI pipeline. 3 | 4 | This module defines standard schemas for parquet files used throughout the GlossAPI 5 | pipeline, ensuring consistency between different pipeline stages. 6 | """ 7 | 8 | import os 9 | import pandas as pd 10 | import pyarrow as pa 11 | import pyarrow.parquet as pq 12 | from pathlib import Path 13 | from typing import List, Dict, Any, Optional, Union, Tuple 14 | 15 | 16 | class ParquetSchema: 17 | """ 18 | Defines standardized schema for parquet files in the GlossAPI pipeline. 19 | 20 | This class provides methods to validate, read, and write parquet files 21 | with consistent schemas for different pipeline stages. 22 | 23 | The pipeline uses two distinct types of parquet files: 24 | 25 | 1. Metadata Parquet: 26 | - Each row represents a file (one-to-one relationship with files) 27 | - Essential columns: filename, URL column (configurable), extraction quality 28 | - Used by: downloader, extractor, and filter stages 29 | - Example: download_results.parquet 30 | - Typical location: {output_dir}/download_results/ 31 | - Schema: METADATA_SCHEMA, DOWNLOAD_SCHEMA 32 | 33 | 2. Sections Parquet: 34 | - Each row represents a section from a file (many-to-one relationship with files) 35 | - Essential columns: filename, title, content, section, predicted_section 36 | - Used by: section and annotation stages 37 | - Examples: sections_for_annotation.parquet, classified_sections.parquet 38 | - Typical location: {output_dir}/sections/ 39 | - Schema: SECTION_SCHEMA, CLASSIFIED_SCHEMA 40 | 41 | When the pipeline runs, it first creates and populates a metadata parquet, 42 | then uses it to filter files, and finally creates section parquets from the 43 | filtered files. 44 | """ 45 | 46 | def __init__(self, pipeline_config: Optional[Dict[str, Any]] = None): 47 | """ 48 | Initialize the ParquetSchema with optional pipeline configuration. 
49 | 50 | Args: 51 | pipeline_config: Configuration dictionary with settings such as 52 | url_column, which will be used throughout the pipeline 53 | """ 54 | # TODO: Add more robust configuration options for each parquet type from input metadata and downloder, to section, and two phases of annotaiton. 55 | # TODO: Add support for consolidated sections parquet handling 56 | # TODO: Add methods to find the latest sections parquet in a pipeline 57 | self.config = pipeline_config or {} 58 | self.url_column = self.config.get('url_column', 'url') 59 | 60 | # Basic schema with common fields used across all parquet files 61 | COMMON_SCHEMA = pa.schema([ 62 | ('id', pa.string()), 63 | ('row_id', pa.int64()), 64 | ('filename', pa.string()), 65 | ]) 66 | 67 | # Metadata schema for files used by downloader and quality assessment 68 | METADATA_SCHEMA = pa.schema([ 69 | ('filename', pa.string()), 70 | ('url', pa.string()), # Can be customized with url_column parameter 71 | ('download_success', pa.bool_()), 72 | ('download_error', pa.string()), 73 | ('extraction_quality', pa.string()), # Values: "good", "bad", "unknown" 74 | ('processing_stage', pa.string()), # Tracks progress through pipeline 75 | ]) 76 | 77 | # Additional schemas for specific pipeline stages 78 | DOWNLOAD_SCHEMA = pa.schema([ 79 | ('url', pa.string()), # Will be replaced with the actual url_column 80 | ('download_success', pa.bool_()), 81 | ('download_error', pa.string()), 82 | ('download_retry_count', pa.int32()), 83 | ('filename', pa.string()), 84 | ]) 85 | 86 | SECTION_SCHEMA = pa.schema([ 87 | ('id', pa.string()), 88 | ('row_id', pa.int64()), 89 | ('filename', pa.string()), 90 | ('title', pa.string()), 91 | ('content', pa.string()), 92 | ('section', pa.string()), 93 | ]) 94 | 95 | CLASSIFIED_SCHEMA = pa.schema([ 96 | ('id', pa.string()), 97 | ('row_id', pa.int64()), 98 | ('filename', pa.string()), 99 | ('title', pa.string()), 100 | ('content', pa.string()), 101 | ('section', pa.string()), 102 | ('predicted_section', pa.string()), 103 | ('probability', pa.float64()), 104 | ]) 105 | 106 | def get_required_metadata(self) -> Dict[str, str]: 107 | """ 108 | Get required metadata fields for GlossAPI parquet files. 109 | 110 | Returns: 111 | Dict[str, str]: Dictionary of required metadata fields and their descriptions 112 | """ 113 | return { 114 | 'pipeline_version': 'GlossAPI pipeline version', 115 | 'created_at': 'ISO format timestamp when the file was created', 116 | 'source_file': 'Original source file that generated this parquet', 117 | 'processing_stage': 'Pipeline processing stage (download, extract, section, etc)' 118 | } 119 | 120 | def validate_schema(self, df: pd.DataFrame, schema_type: str = 'common') -> Tuple[bool, List[str]]: 121 | """ 122 | Validate that a DataFrame conforms to the specified schema. 
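A minimal sketch of a call (the DataFrame contents are hypothetical):

    import pandas as pd
    is_valid, missing = ParquetSchema().validate_schema(
        pd.DataFrame({'filename': ['a.pdf']}), schema_type='metadata'
    )
    # -> (False, ['url']) with the default url_column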
123 | 124 | Args: 125 | df: DataFrame to validate 126 | schema_type: Type of schema to validate against ('common', 'download', 'section', 'classified', 'metadata') 127 | 128 | Returns: 129 | Tuple[bool, List[str]]: (is_valid, missing_columns) 130 | """ 131 | if schema_type.lower() == 'download': 132 | required_columns = [field.name for field in self.DOWNLOAD_SCHEMA] 133 | # Make sure to use the configured url_column 134 | if self.url_column != 'url' and 'url' in required_columns: 135 | required_columns.remove('url') 136 | required_columns.append(self.url_column) 137 | elif schema_type.lower() == 'section': 138 | required_columns = [field.name for field in self.SECTION_SCHEMA] 139 | elif schema_type.lower() == 'classified': 140 | required_columns = [field.name for field in self.CLASSIFIED_SCHEMA] 141 | elif schema_type.lower() == 'metadata': 142 | required_columns = ['filename'] 143 | # Make sure to use the configured url_column 144 | required_columns.append(self.url_column) 145 | else: # Default to common schema 146 | required_columns = [field.name for field in self.COMMON_SCHEMA] 147 | 148 | # Check for missing columns 149 | missing_columns = [col for col in required_columns if col not in df.columns] 150 | 151 | return len(missing_columns) == 0, missing_columns 152 | 153 | def add_metadata(self, table: pa.Table, metadata: Dict[str, str]) -> pa.Table: 154 | """ 155 | Add metadata to a PyArrow Table. 156 | 157 | Args: 158 | table: PyArrow Table to add metadata to 159 | metadata: Dictionary of metadata to add 160 | 161 | Returns: 162 | pa.Table: Table with added metadata 163 | """ 164 | # Add pipeline configuration to metadata 165 | if self.config: 166 | for key, value in self.config.items(): 167 | if key not in metadata: 168 | metadata[f'config_{key}'] = str(value) 169 | # Convert all metadata values to strings 170 | metadata_bytes = {k.encode(): str(v).encode() for k, v in metadata.items()} 171 | 172 | # Add required metadata if missing 173 | required_metadata = self.get_required_metadata() 174 | for key in required_metadata: 175 | if key not in metadata: 176 | metadata_bytes[key.encode()] = f"MISSING: {required_metadata[key]}".encode() 177 | 178 | return table.replace_schema_metadata(metadata_bytes) 179 | 180 | def read_parquet(self, file_path: Union[str, Path], validate: bool = True, schema_type: str = 'common') -> pd.DataFrame: 181 | """ 182 | Read a parquet file with validation. 183 | 184 | Args: 185 | file_path: Path to parquet file 186 | validate: Whether to validate the schema 187 | schema_type: Type of schema to validate against 188 | 189 | Returns: 190 | pd.DataFrame: DataFrame from parquet file 191 | """ 192 | df = pd.read_parquet(file_path) 193 | 194 | if validate: 195 | is_valid, missing_columns = self.validate_schema(df, schema_type) 196 | if not is_valid: 197 | print(f"Warning: Parquet file {file_path} is missing required columns: {missing_columns}") 198 | 199 | # Add missing columns with default values 200 | for col in missing_columns: 201 | if col in ['id', 'filename', 'title', 'section', 'predicted_section', 'download_error']: 202 | df[col] = '' 203 | elif col in ['row_id', 'download_retry_count']: 204 | df[col] = 0 205 | elif col == 'download_success': 206 | df[col] = False 207 | elif col == 'probability': 208 | df[col] = 0.0 209 | 210 | return df 211 | 212 | def find_metadata_parquet(self, directory: Union[str, Path], require_url_column: bool = False) -> Optional[Path]: 213 | """ 214 | Find the first valid metadata parquet file in a directory. 
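Illustrative usage (the directory path is hypothetical):

    path = ParquetSchema().find_metadata_parquet('output/download_results', require_url_column=False)
    # -> a pathlib.Path to the first suitable parquet, or None if nothing is found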
215 | 216 | Looks for parquet files that don't have section-specific columns 217 | like 'title' and 'header', and prioritizes files with the url_column. 218 | 219 | Args: 220 | directory: Directory to search for parquet files 221 | require_url_column: If True, require the URL column to be present; if False, only require filename column 222 | 223 | Returns: 224 | Optional[Path]: Path to the first valid metadata parquet, or None if not found 225 | """ 226 | import logging 227 | logger = logging.getLogger(__name__) 228 | 229 | directory = Path(directory) 230 | if not directory.exists(): 231 | logger.debug(f"Directory {directory} does not exist") 232 | return None 233 | 234 | # Get all parquet files in the directory 235 | parquet_files = list(directory.glob('**/*.parquet')) 236 | if not parquet_files: 237 | logger.debug(f"No parquet files found in {directory}") 238 | return None 239 | 240 | # Check for download_results files first 241 | download_files = [f for f in parquet_files if 'download_results' in str(f)] 242 | if download_files: 243 | logger.debug(f"Found {len(download_files)} download_results files") 244 | 245 | # Examine all files 246 | for file_path in parquet_files: 247 | try: 248 | df = pd.read_parquet(file_path) 249 | columns = df.columns.tolist() 250 | 251 | # Skip section parquets - they have title/header columns 252 | if 'title' in columns or 'header' in columns or 'section' in columns: 253 | logger.debug(f"Skipping sections parquet: {file_path}") 254 | continue 255 | 256 | # For metadata parquets - they don't have title/header but have filename 257 | if 'filename' in columns: 258 | if require_url_column: 259 | # Check if required URL column exists 260 | if self.url_column in columns: 261 | logger.info(f"Found metadata parquet with filename and {self.url_column}: {file_path}") 262 | return file_path 263 | else: 264 | # Missing URL column 265 | logger.warning(f"Found parquet with filename column but no {self.url_column} column: {file_path}") 266 | logger.debug(f"Available columns: {columns}") 267 | else: 268 | # URL not required, filename is enough 269 | logger.info(f"Found metadata parquet with filename (URL not required): {file_path}") 270 | return file_path 271 | else: 272 | logger.debug(f"Found parquet without filename column: {file_path}") 273 | except Exception as e: 274 | logger.debug(f"Error reading parquet {file_path}: {e}") 275 | continue 276 | 277 | logger.warning(f"No suitable metadata parquet found in {directory}") 278 | return None 279 | 280 | def is_valid_metadata_parquet(self, filepath: Union[str, Path]) -> bool: 281 | """ 282 | Check if a parquet file conforms to the metadata schema used by downloader. 283 | 284 | Args: 285 | filepath: Path to the parquet file to check 286 | 287 | Returns: 288 | bool: True if the file has the required metadata fields 289 | """ 290 | try: 291 | schema = pq.read_schema(filepath) 292 | # Check for url_column (which might be custom) and filename 293 | required_fields = [self.url_column, 'filename'] 294 | return all(field in schema.names for field in required_fields) 295 | except Exception: 296 | return False 297 | 298 | def create_basic_metadata_parquet(self, markdown_dir: Union[str, Path], output_dir: Union[str, Path]) -> Union[Path, None]: 299 | """ 300 | Create a simple metadata parquet file from a directory of markdown files. 301 | This is used when there is no existing parquet file to update. 
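Illustrative usage (paths are hypothetical):

    ParquetSchema().create_basic_metadata_parquet('output/markdown', 'output')
    # -> Path('output/download_results/download_results.parquet') containing a
    #    'filename' column and an empty URL column, or None if no markdown files exist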
302 | 303 | Args: 304 | markdown_dir: Directory containing markdown files 305 | output_dir: Directory where to create the parquet file 306 | 307 | Returns: 308 | Path: Path to the created parquet file, or None if creation failed 309 | """ 310 | try: 311 | markdown_dir = Path(markdown_dir) 312 | output_dir = Path(output_dir) 313 | 314 | # Create output directory if it doesn't exist 315 | download_results_dir = output_dir / "download_results" 316 | os.makedirs(download_results_dir, exist_ok=True) 317 | 318 | # Get all markdown files in the input directory 319 | markdown_files = list(markdown_dir.glob("*.md")) 320 | if not markdown_files: 321 | print(f"No markdown files found in {markdown_dir}") 322 | return None 323 | 324 | # Create a DataFrame with just filenames 325 | data = [] 326 | for md_file in markdown_files: 327 | entry = { 328 | 'filename': md_file.name, 329 | self.url_column: "" # Minimal URL placeholder 330 | } 331 | data.append(entry) 332 | 333 | # Create DataFrame 334 | df = pd.DataFrame(data) 335 | 336 | # Set output path for the parquet file 337 | output_path = download_results_dir / "download_results.parquet" 338 | 339 | # Write to parquet without adding complex metadata 340 | pq.write_table(pa.Table.from_pandas(df), output_path) 341 | 342 | print(f"Created new metadata parquet file at {output_path}") 343 | return output_path 344 | 345 | except Exception as e: 346 | print(f"Error creating metadata parquet file: {e}") 347 | return None 348 | 349 | def is_download_result_parquet(self, filepath: Union[str, Path]) -> bool: 350 | """ 351 | Check if a parquet file contains download results with success/error information. 352 | 353 | Args: 354 | filepath: Path to the parquet file to check 355 | 356 | Returns: 357 | bool: True if the file has download result fields 358 | """ 359 | try: 360 | schema = pq.read_schema(filepath) 361 | # Check for download result fields 362 | required_fields = ['download_success', 'filename'] 363 | return all(field in schema.names for field in required_fields) 364 | except Exception: 365 | return False 366 | 367 | def is_sections_parquet(self, filepath: Union[str, Path]) -> bool: 368 | """ 369 | Check if a parquet file contains section data from extracted files. 370 | This identifies the second type of parquet in the pipeline - the sections parquet. 371 | 372 | Args: 373 | filepath: Path to the parquet file to check 374 | 375 | Returns: 376 | bool: True if the file has section data fields 377 | """ 378 | try: 379 | schema = pq.read_schema(filepath) 380 | # Check for required section fields 381 | required_fields = ['filename', 'title', 'content', 'section'] 382 | return all(field in schema.names for field in required_fields) 383 | except Exception: 384 | return False 385 | 386 | def add_processing_stage(self, df: pd.DataFrame, stage: str) -> pd.DataFrame: 387 | """ 388 | Add or update processing stage column in a DataFrame. 389 | 390 | Args: 391 | df: Input DataFrame to update 392 | stage: Processing stage value to set (e.g., 'downloaded', 'extracted', 'classified') 393 | 394 | Returns: 395 | pd.DataFrame: Updated DataFrame with processing_stage column 396 | """ 397 | df['processing_stage'] = stage 398 | return df 399 | 400 | def verify_required_columns(self, df: pd.DataFrame, required_columns: List[str]) -> Tuple[bool, List[str]]: 401 | """ 402 | Check if a DataFrame contains all required columns and return missing ones. 
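A short sketch (hypothetical DataFrame):

    import pandas as pd
    ok, missing = ParquetSchema().verify_required_columns(
        pd.DataFrame({'filename': ['a.md']}), ['filename', 'section']
    )
    # -> (False, ['section'])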
403 | 404 | Args: 405 | df: DataFrame to check 406 | required_columns: List of column names that should be present 407 | 408 | Returns: 409 | Tuple containing: 410 | - bool: True if all required columns are present 411 | - List[str]: List of missing columns (empty if all present) 412 | """ 413 | missing_columns = [col for col in required_columns if col not in df.columns] 414 | return (len(missing_columns) == 0, missing_columns) 415 | 416 | def write_parquet( 417 | self, 418 | df: pd.DataFrame, 419 | file_path: Union[str, Path], 420 | metadata: Optional[Dict[str, str]] = None, 421 | schema_type: str = 'common', 422 | validate: bool = True 423 | ) -> None: 424 | """ 425 | Write a DataFrame to parquet with standard schema and metadata. 426 | 427 | Args: 428 | df: DataFrame to write 429 | file_path: Path to write parquet file 430 | metadata: Dictionary of metadata to include 431 | schema_type: Type of schema to use 432 | validate: Whether to validate the schema before writing 433 | """ 434 | # Create a copy to avoid modifying the original 435 | df_copy = df.copy() 436 | 437 | # Validate and fix schema if needed 438 | if validate: 439 | is_valid, missing_columns = self.validate_schema(df_copy, schema_type) 440 | if not is_valid: 441 | print(f"Adding missing columns to DataFrame: {missing_columns}") 442 | 443 | # Add missing columns with default values 444 | for col in missing_columns: 445 | if col in ['id', 'filename', 'title', 'section', 'predicted_section', 'download_error']: 446 | df_copy[col] = '' 447 | elif col in ['row_id', 'download_retry_count']: 448 | df_copy[col] = 0 449 | elif col == 'download_success': 450 | df_copy[col] = False 451 | elif col == 'probability': 452 | df_copy[col] = 0.0 453 | 454 | # Convert to PyArrow Table 455 | table = pa.Table.from_pandas(df_copy) 456 | 457 | # Add metadata if provided 458 | if metadata: 459 | table = self.add_metadata(table, metadata) 460 | 461 | # Write to parquet 462 | pq.write_table(table, file_path) 463 | print(f"Parquet file written to {file_path} with schema type '{schema_type}'") 464 | -------------------------------------------------------------------------------- /pipeline/src/glossapi/sampler.py: -------------------------------------------------------------------------------- 1 | """ 2 | Sampler module for extracting samples from processed corpus data. 3 | 4 | This module provides functionality for sampling documents from processed 5 | parquet files, with options for filtering by column values and splitting 6 | into parts for cross-validation. 7 | """ 8 | 9 | import logging 10 | import os 11 | import pandas as pd 12 | import random 13 | from pathlib import Path 14 | from typing import Dict, Optional, Union, List, Any, Tuple 15 | 16 | class Sampler: 17 | """ 18 | A class for sampling documents from parquet files with flexible filtering options. 19 | 20 | This class allows sampling unique filenames based on specific criteria and 21 | extracting all their rows for analysis or further processing. 
22 | 23 | Example: 24 | sampler = Sampler("/path/to/processed_data") 25 | 26 | # Sample 200 files where 'document_type' is 'Κεφάλαιο' 27 | sample_df = sampler.sample(sample_from={'document_type': 'Κεφάλαιο'}, n=200) 28 | 29 | # Sample 200 files from everything except where 'document_type' is 'Κεφάλαιο' 30 | sample_df = sampler.sample(sample_from_all_except={'document_type': 'Κεφάλαιο'}, n=200) 31 | 32 | # Sample and split into 2 equal parts for cross-validation 33 | sample_df = sampler.sample(n=200, parts=2) 34 | """ 35 | 36 | def __init__( 37 | self, 38 | base_dir: Union[str, Path], 39 | parquet_file: Optional[Union[str, Path]] = None, 40 | project_dir: Optional[Union[str, Path]] = None, 41 | log_level: int = logging.INFO 42 | ): 43 | """ 44 | Initialize the Sampler. 45 | 46 | Args: 47 | base_dir: Base directory where processed data is stored 48 | parquet_file: Optional specific parquet file to sample from 49 | (default: fully_annotated_sections.parquet in base_dir) 50 | project_dir: Optional project directory for text outputs 51 | (default: v2 directory in parent of base_dir) 52 | log_level: Logging level (default: INFO) 53 | """ 54 | self.base_dir = Path(base_dir) 55 | 56 | # Set up logging 57 | self.logger = logging.getLogger(__name__) 58 | self.logger.setLevel(log_level) 59 | 60 | if not self.logger.handlers: 61 | handler = logging.StreamHandler() 62 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 63 | handler.setFormatter(formatter) 64 | self.logger.addHandler(handler) 65 | 66 | # Set the default parquet file if not specified 67 | if parquet_file is None: 68 | self.parquet_file = self.base_dir / "fully_annotated_sections.parquet" 69 | else: 70 | self.parquet_file = Path(parquet_file) 71 | 72 | # Set up datasets directory in the base directory 73 | self.datasets_dir = self.base_dir / "datasets" 74 | os.makedirs(self.datasets_dir, exist_ok=True) 75 | 76 | # Set up project directory for text outputs 77 | if project_dir is None: 78 | try: 79 | # Try to find 'v2' directory in parent of base_dir 80 | parent_dir = self.base_dir.parent 81 | if (parent_dir / "v2").exists(): 82 | self.project_dir = parent_dir / "v2" 83 | else: 84 | # Fall back to base_dir if v2 not found 85 | self.project_dir = self.base_dir 86 | except Exception: 87 | self.project_dir = self.base_dir 88 | else: 89 | self.project_dir = Path(project_dir) 90 | 91 | # Set up text samples directory in the project directory 92 | self.text_dir = self.project_dir / "text_samples" 93 | os.makedirs(self.text_dir, exist_ok=True) 94 | 95 | def sample( 96 | self, 97 | n: int = 100, 98 | parts: int = 1, 99 | output_csv: Optional[Union[str, Path]] = None, 100 | sample_from: Optional[Dict[str, Any]] = None, 101 | sample_from_all_except: Optional[Dict[str, Any]] = None, 102 | output_name: Optional[str] = None 103 | ) -> Union[pd.DataFrame, List[pd.DataFrame]]: 104 | """ 105 | Sample a specified number of unique filenames and extract all their rows. 
106 | 107 | Args: 108 | n: Number of unique filenames to sample 109 | parts: Number of even parts to split the sample into (default: 1) 110 | output_csv: Optional path to save the sampled data as CSV 111 | If not specified, will use output_name with default location 112 | sample_from: Optional dictionary {column: value} to sample only from rows 113 | where column has the specified value 114 | sample_from_all_except: Optional dictionary {column: value} to sample only 115 | from rows where column does NOT have the specified value 116 | output_name: Base name for output files (without extension) 117 | If not specified, will generate based on sampling criteria 118 | 119 | Returns: 120 | If parts=1: DataFrame containing all rows for the sampled filenames 121 | If parts>1: List of DataFrames, each containing rows for a part of the sampled filenames 122 | 123 | Raises: 124 | ValueError: If the specified column or label doesn't exist in the data 125 | """ 126 | if not self.parquet_file.exists(): 127 | self.logger.error(f"Parquet file not found: {self.parquet_file}") 128 | return pd.DataFrame() 129 | 130 | self.logger.info(f"Reading data from {self.parquet_file}...") 131 | 132 | # Read the parquet file 133 | df = pd.read_parquet(self.parquet_file,engine='fastparquet') 134 | 135 | # Check if filtering criteria are valid 136 | if sample_from: 137 | for column, value in sample_from.items(): 138 | if column not in df.columns: 139 | raise ValueError(f"Column '{column}' not found in the parquet file") 140 | if value not in df[column].values: 141 | if not value.startswith('regex') : 142 | raise ValueError(f"Value '{value}' not found in column '{column}'") 143 | 144 | if sample_from_all_except: 145 | for column, value in sample_from_all_except.items(): 146 | if column not in df.columns: 147 | raise ValueError(f"Column '{column}' not found in the parquet file") 148 | if value not in df[column].values: 149 | if not value.startswith('regex') : 150 | raise ValueError(f"Value '{value}' not found in column '{column}'") 151 | 152 | # Apply filters to the DataFrame 153 | filtered_df = df.copy() 154 | 155 | # Generate default output name if not provided 156 | if output_name is None: 157 | if sample_from and len(sample_from) == 1: 158 | col, val = next(iter(sample_from.items())) 159 | output_name = f"{val.lower().replace(' ', '_')}_samples" 160 | elif sample_from_all_except and len(sample_from_all_except) == 1: 161 | col, val = next(iter(sample_from_all_except.items())) 162 | output_name = f"non_{val.lower().replace(' ', '_')}_samples" 163 | else: 164 | output_name = "samples" 165 | 166 | # Apply filters to the DataFrame 167 | if sample_from: 168 | for column, value in sample_from.items(): 169 | if value.startswith('regex(') and value.endswith(')') : 170 | filter = (filtered_df[column].str.contains(value[6:-1])) 171 | filtered_df = filtered_df[filter] 172 | else : 173 | filtered_df = filtered_df[filtered_df[column] == value] 174 | self.logger.info(f"Filtered to rows where {column} = '{value}' ({len(filtered_df)} rows)") 175 | 176 | if sample_from_all_except: 177 | for column, value in sample_from_all_except.items(): 178 | if value.startswith('regex(') and value.endswith(')') : 179 | filter = (filtered_df[column].str.contains(value[6:-1])) 180 | filtered_df = filtered_df[~filter] 181 | else : 182 | filtered_df = filtered_df[filtered_df[column] != value] 183 | self.logger.info(f"Filtered to rows where {column} != '{value}' ({len(filtered_df)} rows)") 184 | 185 | # Get unique filenames from the filtered data 186 | 
unique_filenames = filtered_df['filename'].unique() 187 | total_unique = len(unique_filenames) 188 | 189 | if total_unique == 0: 190 | self.logger.error("No matching filenames found after applying filters") 191 | return pd.DataFrame() 192 | 193 | self.logger.info(f"Found {total_unique} unique filenames after filtering") 194 | 195 | if total_unique <= n: 196 | self.logger.warning(f"Requested sample size ({n}) is greater than or equal to the number of unique filenames ({total_unique}). Using all available filenames.") 197 | sampled_filenames = unique_filenames 198 | else: 199 | # Randomly sample unique filenames 200 | sampled_filenames = random.sample(list(unique_filenames), n) 201 | 202 | # Extract all rows for the sampled filenames 203 | sampled_df = df[df['filename'].isin(sampled_filenames)] 204 | 205 | self.logger.info(f"Sampled {len(sampled_filenames)} unique filenames with {len(sampled_df)} total rows") 206 | 207 | # Set up default output CSV path if not provided 208 | if output_csv is None and parts == 1: 209 | output_csv = self.datasets_dir / f"{output_name}.csv" 210 | 211 | # Save to CSV if output path is provided 212 | if output_csv and parts == 1: 213 | output_path = Path(output_csv) 214 | os.makedirs(output_path.parent, exist_ok=True) 215 | sampled_df.to_csv(output_path, index=False) 216 | self.logger.info(f"Saved sampled data to {output_path}") 217 | 218 | # Split into parts if requested 219 | if parts > 1: 220 | self.logger.info(f"Splitting sample into {parts} equal parts") 221 | 222 | # Split the sampled filenames into equal parts 223 | random.shuffle(sampled_filenames) 224 | filename_parts = [sampled_filenames[i::parts] for i in range(parts)] 225 | 226 | # Create a DataFrame for each part 227 | result_parts = [] 228 | for i, filenames in enumerate(filename_parts): 229 | part_df = df[df['filename'].isin(filenames)] 230 | result_parts.append(part_df) 231 | self.logger.info(f"Part {i+1}: {len(filenames)} filenames, {len(part_df)} rows") 232 | 233 | # Set up default output CSV path for each part if not provided 234 | if output_csv is None: 235 | part_output = self.datasets_dir / f"{output_name}_{i+1}.csv" 236 | else: 237 | # If output_csv is provided, create part-specific paths 238 | output_stem = Path(output_csv).stem 239 | output_suffix = Path(output_csv).suffix 240 | output_dir = Path(output_csv).parent 241 | part_output = output_dir / f"{output_stem}_{i+1}{output_suffix}" 242 | 243 | # Save each part 244 | os.makedirs(Path(part_output).parent, exist_ok=True) 245 | part_df.to_csv(part_output, index=False) 246 | self.logger.info(f"Saved part {i+1} to {part_output}") 247 | 248 | return result_parts 249 | 250 | return sampled_df 251 | 252 | def to_text( 253 | self, 254 | input_data: Union[str, Path, pd.DataFrame], 255 | output_dir: Optional[Union[str, Path]] = None, 256 | folder_name: Optional[str] = None 257 | ) -> None: 258 | """ 259 | Convert parquet or CSV data to formatted text files. 
260 | 261 | Args: 262 | input_data: Path to parquet/CSV file or DataFrame containing the data 263 | output_dir: Directory to save the output text files 264 | If None, creates a directory in text_samples based on folder_name 265 | folder_name: Name for the output directory if output_dir is None 266 | If None, uses a default name based on timestamp 267 | """ 268 | # Set up output directory 269 | if output_dir is None: 270 | if folder_name is None: 271 | # Generate a timestamp-based name if no folder name provided 272 | if isinstance(input_data, pd.DataFrame): 273 | # Try to infer a good name from the DataFrame if available 274 | if 'document_type' in input_data.columns and len(input_data['document_type'].unique()) == 1: 275 | folder_name = f"{input_data['document_type'].iloc[0].lower().replace(' ', '_')}_samples" 276 | else: 277 | folder_name = f"samples_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}" 278 | else: 279 | # Use the input filename if it's a file 280 | if isinstance(input_data, (str, Path)): 281 | folder_name = Path(input_data).stem 282 | else: 283 | folder_name = f"samples_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}" 284 | 285 | output_dir = self.text_dir / folder_name 286 | 287 | self.logger.info(f"Converting data to formatted text files in {output_dir}...") 288 | 289 | # Create output directory 290 | output_dir = Path(output_dir) 291 | os.makedirs(output_dir, exist_ok=True) 292 | 293 | # Load data if input is a file path 294 | if isinstance(input_data, (str, Path)): 295 | input_path = Path(input_data) 296 | if input_path.suffix.lower() == '.csv': 297 | df = pd.read_csv(input_path) 298 | elif input_path.suffix.lower() == '.parquet': 299 | df = pd.read_parquet(input_path, engine='fastparquet') 300 | else: 301 | self.logger.error(f"Unsupported file format: {input_path.suffix}") 302 | return 303 | else: 304 | # Assume input_data is a DataFrame 305 | df = input_data 306 | 307 | # Group by filename and sort by id 308 | self.logger.info("Grouping data by filename...") 309 | grouped = df.groupby('filename') 310 | 311 | # Process each unique filename 312 | for filename, group in grouped: 313 | # Sort by id to maintain the correct order of sections 314 | if 'id' in group.columns: 315 | group = group.sort_values('id') 316 | 317 | # Create output file path 318 | output_file_path = output_dir / f"{filename}.txt" 319 | 320 | # Write formatted content 321 | with open(output_file_path, 'w', encoding='utf-8') as f: 322 | # Write filename at the top 323 | f.write(f"# Document: {filename}\n\n") 324 | 325 | for _, row in group.iterrows(): 326 | # Write row with formatting 327 | section_type = row.get('predicted_section', '') 328 | row_id = row.get('row_id', '') 329 | header = row.get('header', '') 330 | section = row.get('section', '') 331 | 332 | f.write(f"{{{row_id}, {section_type}}} {header}\n\n") 333 | f.write(f"{section}\n\n") 334 | 335 | self.logger.info(f"Processed file: {output_file_path}") 336 | 337 | self.logger.info(f"Conversion complete. Text files saved to {output_dir}") 338 | -------------------------------------------------------------------------------- /refactoring_plan.md: -------------------------------------------------------------------------------- 1 | # GlossAPI Refactoring Plan 2 | 3 | ## Overview 4 | This document outlines the planned changes to the GlossAPI section classification pipeline, focusing on simplifying the section processing logic and changing the output structure. 5 | 6 | ## Key Changes 7 | 8 | ### 1.
Simplification of Section Processing 9 | - Rename `_process_academic_text_with_positions` to `_process_sections` everywhere in the code 10 | - Replace hierarchical section processing with flat processing: 11 | - Find text between two headers and define it as a section 12 | - Use the header above as the section's header 13 | - Process all markdown headers flatly instead of maintaining a hierarchical structure 14 | - **Important**: Maintain the existing functionality that protects lists and tables from cleaning and reformatting by detecting them and processing them differently 15 | 16 | ### 2. Changes to Output Schema 17 | - Remove the following columns from the to_parquet output: 18 | - `label` (string) 19 | - `section_propo` (int64) 20 | - `section_length` (int64) 21 | - Remove all related functionality for calculating `section_propo` and `section_length` 22 | 23 | ### 3. Section Content Structure Changes 24 | - Modify the logic in both `academic_section.py` and `gloss_section.py` 25 | - Return sections as JSON objects that contain, in the order they appear in the text, entries with keys: 26 | - "text" - for regular text content 27 | - "table" - for table content 28 | - "list" - for list content 29 | - "footnote" - for footnote content 30 | - Instead of deleting footnotes, annotate them appropriately 31 | - **Keep** the existing flags (`has_table`, `has_list`) in the output schema 32 | - **Add** new flags `has_footnote` and `has_text` to indicate presence of those content types 33 | - Implement detection logic to identify if a section contains non-empty lines that don't belong to tables, lists, or footnotes (for the `has_text` flag); a sketch of the resulting payload follows the implementation plan below 34 | 35 | ### 4. Implementation Plan 36 | 1. First, create new versions of the modules with the updated functionality 37 | 2. Ensure all dependencies and references are updated 38 | 3. Make sure the section processing works with these simplified changes 39 | 4. Test the pipeline with sample documents
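To make the target structure concrete, here is a hedged sketch of one possible shape for a single section row after the change. It is illustrative only: the `header` and `section` column names follow the existing sampler output, the flag names follow the plan above, and the example values are invented rather than taken from the codebase.

```python
import json

# Ordered entries, one dict per block, keyed by its content type as planned
# ("text", "table", "list", "footnote"). Example values are invented.
section_contents = [
    {"text": "Εισαγωγική παράγραφος της ενότητας."},
    {"list": "- πρώτο στοιχείο\n- δεύτερο στοιχείο"},
    {"table": "| στήλη Α | στήλη Β |\n|---|---|\n| 1 | 2 |"},
    {"footnote": "1. Υποσημείωση που διατηρείται αντί να διαγραφεί."},
]

# One output row: in this sketch the ordered entries are serialized as a JSON
# string so the parquet schema stays flat, and the has_* flags summarise
# which content types occur in the section.
section_row = {
    "header": "## Παράδειγμα ενότητας",
    "section": json.dumps(section_contents, ensure_ascii=False),
    "has_text": any("text" in entry for entry in section_contents),
    "has_list": any("list" in entry for entry in section_contents),
    "has_table": any("table" in entry for entry in section_contents),
    "has_footnote": any("footnote" in entry for entry in section_contents),
}

print(json.dumps(section_row, ensure_ascii=False, indent=2))
```

Whether the entries keep raw markdown (as sketched here) or already-cleaned text is an implementation detail to settle while rewriting `gloss_section.py`.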
40 | 41 | ## Files to be Changed 42 | - `/mnt/data/glossAPI/pipeline/src/glossapi/gloss_section.py` 43 | - `/mnt/data/glossAPI/pipeline/src/glossapi/academic_section.py` 44 | - Any other files that reference the renamed functions or changed outputs 45 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # GlossAPI Requirements 2 | # Automatically generated based on package imports 3 | 4 | # Core dependencies 5 | pandas>=1.3.0 6 | numpy>=1.20.0 7 | scikit-learn>=1.0.0 8 | joblib>=1.0.0 9 | dask>=2022.1.0 10 | pyarrow>=7.0.0 11 | 12 | # Document processing 13 | docling>=1.0.0 14 | 15 | # Python standard libraries (included for reference) 16 | # logging 17 | # os 18 | # pathlib 19 | # typing 20 | # re 21 | # random 22 | # shutil 23 | -------------------------------------------------------------------------------- /scraping/download_and_extract_scripts/__pycache__/downloader_app.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/scraping/download_and_extract_scripts/__pycache__/downloader_app.cpython-310.pyc -------------------------------------------------------------------------------- /scraping/download_and_extract_scripts/__pycache__/extractor_app.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eellak/glossAPI/02ba170f69681c6bb1ad0a52b48fb2309c8354f1/scraping/download_and_extract_scripts/__pycache__/extractor_app.cpython-310.pyc -------------------------------------------------------------------------------- /scraping/download_and_extract_scripts/downloader.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import asyncio 3 | import os 4 | import argparse 5 | from urllib.parse import urlparse 6 | import random 7 | import aiofiles 8 | import logging 9 | import json 10 | import time 11 | 12 | 13 | #Configure logging for behavior tracking and errors 14 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 15 | 16 | #Function that finds the highest index among already downloaded papers so numbering can continue 17 | def get_indexes(papers): 18 | if papers: 19 | nums = [] 20 | for p in papers: 21 | num = p.split("_")[-1] 22 | nums.append(int(num)) 23 | return sorted(nums)[-1:] 24 | return [] 25 | 26 | #Function that downloads PDFs concurrently and can retry failed downloads 27 | async def download_pdfs(metadata_dict, semaphore, visited, indexes, args, progress_report, retry=1): 28 | 29 | #Prepares tasks for download_pdf function and stores association of "paper_name.pdf" with original metadata.
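    #
    # Args:
    #   metadata_dict: {metadata: url} pairs loaded from the JSON siteguide
    #   semaphore: asyncio.Semaphore limiting how many downloads run at once
    #   visited: metadata entries already recorded in progress_report.json (skipped)
    #   indexes: numeric indexes already used for file naming; the next file continues from the highest one
    #   args: parsed command-line arguments (batch, sleep, type, req, output, ...)
    #   progress_report: dict updated in place with {"paper_<n>": metadata} entries for successful downloads
    #   retry: remaining retry rounds for URLs that fail in this pass
    # Returns True when the end of metadata_dict is reached (fewer than args.batch new entries were left to schedule).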
30 | 31 | retry -= 1 32 | retries = {} #Dictionary holding files for download retrial 33 | tasks = [] #List to hold the tasks to be executed 34 | ordered_metadata = list(metadata_dict.items()) 35 | user_agent_gen = user_agent_generator() 36 | i = 0 37 | reached_end_of_file = True #flag: if all metadata are in "visited" 38 | 39 | #Process metadata urls and schedule downloads 40 | for metadata, url in ordered_metadata: 41 | if i < args.batch and metadata not in visited: 42 | reached_end_of_file = False 43 | if indexes: 44 | index = indexes[-1] + 1 45 | else: 46 | index = 1 47 | indexes.append(index) 48 | task = asyncio.create_task( 49 | download_pdf(index, metadata, url, semaphore, args, next(user_agent_gen)) 50 | ) 51 | tasks.append(task) 52 | i += 1 53 | results = await asyncio.gather(*tasks) 54 | for r in results: 55 | if r: 56 | has_downloaded_file, metadata, pdf_file_name = r 57 | if has_downloaded_file: 58 | progress_report[pdf_file_name[:-4]] = metadata 59 | else: 60 | logging.warning(f"Failed to download file for metadata: {metadata}") 61 | if retry > 0: 62 | retries[metadata] = url 63 | if retries and retry > 0: 64 | logging.info(f"Retrying download for {len(retries)} files") 65 | await download_pdfs(retries, semaphore, visited, indexes, args, progress_report, retry-1) 66 | if i < args.batch: reached_end_of_file = True 67 | return reached_end_of_file 68 | 69 | #Function to extract base URL from a given full URL 70 | async def get_base_url(url): 71 | if not url.startswith("http"): 72 | url = f"http://{url}" 73 | parsed_url = urlparse(url) 74 | base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" 75 | return base_url 76 | 77 | #Function for the initialization of session headers 78 | async def setup_session(session, url, headers): 79 | """ Initialize the session with base headers. """ 80 | base_url = await get_base_url(url) 81 | initial_url = f"{base_url}" 82 | async with session.get(initial_url, headers=headers) as response: 83 | await response.text() 84 | return headers 85 | 86 | #Function that arranges concurrent download of a PDF given pdf_url, then returns download status, metadata and filename as a tuple. 87 | async def download_pdf(index, metadata, pdf_url, semaphore, args, user_agent, referer=None): 88 | 89 | if not referer: 90 | base_url = await get_base_url(pdf_url) 91 | else: 92 | base_url = referer 93 | headers = { 94 | 'User-Agent': user_agent, 95 | 'Referer': base_url 96 | } 97 | if not pdf_url.startswith("http"): 98 | pdf_url = f"http://{pdf_url}" 99 | sleep_time, file_type, request_type = args.sleep, args.type, args.req 100 | async with semaphore: 101 | timeout = aiohttp.ClientTimeout(total=60) 102 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False), timeout=timeout) as session: 103 | # Randomized sleep time between args.sleep and args.sleep + 2 (better for passing bot detection) 104 | await asyncio.sleep(random.uniform(sleep_time, sleep_time + 2)) 105 | 106 | file_name = f'paper_{index}.{file_type}' # Names file by order of appearance 107 | try: 108 | await setup_session(session, pdf_url, headers) 109 | requester = getattr(session, request_type) # sets session type as either session.get or session.post 110 | async with requester(pdf_url, headers=headers, allow_redirects=False) as response: 111 | if response.status in (301, 302): 112 | logging.error(f"Redirected: {pdf_url} to {response.headers['Location']}.
Status code: {response.status}") 113 | return (False, metadata, file_name) 114 | elif response.status == 200: 115 | content = await response.read() 116 | if args.output: output_path = args.output 117 | await write_file(file_name, content, output_path) 118 | logging.info(f"Downloaded {file_name}") 119 | return (True, metadata, file_name) 120 | else: 121 | logging.error(f"Failed to download {pdf_url}. Status code: {response.status}") 122 | except aiohttp.ClientError as e: 123 | logging.error(f"ClientError while downloading {pdf_url}: {e}") 124 | except aiohttp.http_exceptions.HttpProcessingError as e: 125 | logging.error(f"HTTP processing error while downloading {pdf_url}: {e}") 126 | except asyncio.TimeoutError: 127 | logging.error(f"Timeout error while downloading {pdf_url}") 128 | except Exception as e: 129 | logging.error(f"Unexpected error while downloading {pdf_url}: {e}") 130 | return (False, metadata, file_name) 131 | 132 | #Function that writes downloaded content to a file 133 | async def write_file(filename, content, output_path = "./"): 134 | path_to_file = os.path.join(output_path, filename) 135 | async with aiofiles.open(path_to_file, 'wb') as file: 136 | await file.write(content) 137 | 138 | #Function to generate random user-agents for avoiding bot detection 139 | #to add proxy rotation option 140 | def user_agent_generator(): 141 | 142 | templates = [ 143 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) {browser}/{version} Safari/537.36", 144 | "Mozilla/5.0 ({os}) Gecko/20100101 {browser}/{version}", 145 | "Mozilla/5.0 ({os}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{version} Safari/537.36" 146 | ] 147 | operating_systems = [ 148 | "Windows NT 10.0; Win64; x64", 149 | "Macintosh; Intel Mac OS X 10_15_7", 150 | "X11; Linux x86_64", 151 | "Windows NT 6.1; Win64; x64", 152 | "Android 9; Mobile; rv:40.0" 153 | ] 154 | browsers = [ 155 | ("Chrome", random.randint(70, 90)), 156 | ("Firefox", random.randint(50, 80)), 157 | ("Edge", random.randint(80, 90)) 158 | ] 159 | while True: 160 | template = random.choice(templates) 161 | os = random.choice(operating_systems) 162 | browser, version = random.choice(browsers) 163 | full_version = f"{version}.0.{random.randint(1000, 9999)}" 164 | user_agent = template.format(os=os, browser=browser, version=full_version) 165 | yield user_agent 166 | 167 | #Function for overall program executon 168 | async def run(args): 169 | current_working_directory = os.getcwd() 170 | path_to_url_siteguide = os.path.join(current_working_directory, args.filename) 171 | with open(path_to_url_siteguide, 'r') as file: 172 | metadata_dict = json.load(file) 173 | 174 | semaphore = asyncio.Semaphore(3) #if you get flagged by bot detection try adjusting value 175 | try: 176 | try: 177 | with open('progress_report.json', 'r') as file: 178 | progress_report = json.load(file) 179 | logging.info("Existing progress report found and loaded") 180 | indexes = get_indexes(list(progress_report.keys())) 181 | except FileNotFoundError: 182 | progress_report = {} 183 | indexes = [] 184 | logging.info("No existing progress report found") 185 | visited = list(progress_report.values()) 186 | # Download PDFs and update progress report 187 | logging.info(f"Starting download from {args.filename}") 188 | finished = await download_pdfs(metadata_dict, semaphore, visited, indexes, args, progress_report) 189 | logging.info(f"Finished download from {args.filename}") 190 | 191 | except Exception as e: 192 | logging.error(f"An error occurred: {e}") 193 | raise 194 | finally: 195 
| if finished: 196 | logging.info("All available have been downloaded - Finished!") 197 | # still write to progress_report.json in case it finished because of i < args.batch 198 | with open('progress_report.json', 'w') as file: 199 | json.dump(progress_report, file, ensure_ascii=False, indent=4) 200 | return True 201 | else: 202 | logging.info("PDF downloads completed") 203 | with open('progress_report.json', 'w') as file: 204 | json.dump(progress_report, file, ensure_ascii=False, indent=4) 205 | logging.info("Progress report written to progress_report.json") 206 | return False 207 | 208 | #Function for handling command-line arguments 209 | def parse_input(): 210 | parser = argparse.ArgumentParser(description="Gets PDFs through URLs given as value entries in a JSON.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 211 | parser.add_argument("--json", help="Add path to JSON file with URLs siteguide", required=True) 212 | parser.add_argument("--sleep", type=int, default=1, help="Set delay before new request is made (in seconds)") 213 | parser.add_argument("--type", help="Select file type to be downloaded e.g., 'pdf', 'doc'", required=True) 214 | parser.add_argument("--req", choices=['get', 'post'], default='get', help="Set request type 'get' or 'post'") 215 | parser.add_argument("-o", "--output", default="./", help="Set download directory") 216 | parser.add_argument("--little_potato", help="Set directory for progress_report.json (previously little_potato), default value is set to --output") 217 | parser.add_argument("--batch", type=int, default=10, help="Set number of files to download per run") 218 | args = parser.parse_args() 219 | 220 | if not args.little_potato: 221 | args.little_potato = args.output 222 | logging.info(f"Arguments received: JSON file: {args.json}, Sleep time: {args.sleep}, File type: {args.type}, Request type: {args.req}, Output path: {args.output}, 'progress_report.json' path: {args.little_potato}") 223 | return args 224 | 225 | #The main function to parse input arguments, load URL metadata from a JSON file, manage download progress with semaphores for concurrency, and save the download progress to a JSON report file 226 | async def main(): 227 | args = parse_input() 228 | with open(args.json, 'r') as file: 229 | metadata_dict = json.load(file) 230 | #Semaphore that limits concurrent downloads 231 | semaphore = asyncio.Semaphore(3) # Adjust the value as needed 232 | 233 | try: 234 | #Read existing progress report if any 235 | try: 236 | progress_report_path = os.path.join(args.little_potato, 'progress_report.json') 237 | with open(progress_report_path, 'r') as file: 238 | progress_report = json.load(file) 239 | logging.info("Existing progress report found and loaded") 240 | indexes = get_indexes(list(progress_report.keys())) 241 | except FileNotFoundError: 242 | progress_report = {} 243 | indexes = [] 244 | logging.info("No existing progress report found") 245 | visited = list(progress_report.values()) 246 | logging.info("Starting PDF downloads") 247 | finished = await download_pdfs(metadata_dict, semaphore, visited, indexes, args, progress_report) 248 | if finished: 249 | logging.info("All available files are in progress_report.json - Finished!") 250 | else: 251 | logging.info("PDF downloads completed") 252 | except Exception as e: 253 | logging.error(f"An error occurred: {e}") 254 | raise 255 | finally: 256 | #Write progress report to a JSON file 257 | progress_report_path = os.path.join(args.little_potato, 'progress_report.json') 258 | with 
open(progress_report_path, 'w') as file: 259 | json.dump(progress_report, file, ensure_ascii=False, indent=4) 260 | logging.info("Progress report written to progress_report.json") 261 | 262 | #Entry point of Downloader 263 | if __name__ == "__main__": 264 | asyncio.run(main()) -------------------------------------------------------------------------------- /scraping/json_sitemaps/boithimata-glossas-G-Lyk_pdf.json: -------------------------------------------------------------------------------- 1 | { 2 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 1η Εκδοχή - Θέματα": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_THEMATA_1h_ekdoxh.pdf", 3 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 1η Εκδοχή - Ενδεικτικές απαντήσεις": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_APANTHSEIS_1h_ekdoxh.pdf", 4 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 2η Εκδοχή - Θέματα": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_THEMATA_2h_ekdoxh.pdf", 5 | "Προβολή σημειώσεων 1ο Διαγώνισμα - 2η Εκδοχή - Ενδεικτικές απαντήσεις": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D1_APANTHSEIS_2h_ekdoxh.pdf", 6 | "Προβολή σημειώσεων 2ο Διαγώνισμα - Θέματα (ΝΕΟ_2020)": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D2_THEMATA.pdf", 7 | "Προβολή σημειώσεων 2ο Διαγώνισμα - Ενδεικτικές απαντήσεις": "http://www.study4exams.gr/mod_greek/pdf/NG_D/NG_D2_APANTHSEIS.pdf" 8 | } 9 | -------------------------------------------------------------------------------- /scraping/json_sitemaps/greek-language_pdf.json: -------------------------------------------------------------------------------- 1 | { 2 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 01": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 01.pdf", 3 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 02": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 02.pdf", 4 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 03": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 03.pdf", 5 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 04": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 04.pdf", 6 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 05": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 05.pdf", 7 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 06": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 06.pdf", 8 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 07": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 07.pdf", 9 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 08": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 08.pdf", 10 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 09": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 09.pdf", 11 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 10": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 10.pdf", 12 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 11": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 11.pdf", 13 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 12": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 12.pdf", 14 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 13": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 13.pdf", 15 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 14": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 14.pdf", 16 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 15": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 15.pdf", 17 | " 2. Ασκήσεις ΚΓΛ > Ενότητα 16": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 16.pdf", 18 | " 2. 
Ασκήσεις ΚΓΛ > Ενότητα 17": "https://www.greek-language.gr/certification/ΚΛΙΚ/2. Ασκήσεις ΚΓΛ/Ενότητα 17.pdf", 19 | " 3. Υλικό εξάσκησης > ΚΓΛ_1_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_1_Α1.pdf", 20 | " 3. Υλικό εξάσκησης > ΚΓΛ_2_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_2_Α1.pdf", 21 | " 3. Υλικό εξάσκησης > ΚΓΛ_3_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_3_Α1.pdf", 22 | " 3. Υλικό εξάσκησης > ΚΓΛ_4_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_4_Α1.pdf", 23 | " 3. Υλικό εξάσκησης > ΚΓΛ_5_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α1/ΚΓΛ_5_Α1.pdf", 24 | " 3. Υλικό εξάσκησης > ΚΓΛ_1_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_1_Α2.pdf", 25 | " 3. Υλικό εξάσκησης > ΚΓΛ_2_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_2_Α2.pdf", 26 | " 3. Υλικό εξάσκησης > ΚΓΛ_3_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_3_Α2.pdf", 27 | " 3. Υλικό εξάσκησης > ΚΓΛ_4_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_4_Α2.pdf", 28 | " 3. Υλικό εξάσκησης > ΚΓΛ_5_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΓΛ_Α2/ΚΓΛ_5_Α2.pdf", 29 | " 3. Υλικό εξάσκησης > ΚΠΛ_1_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_1_Α1.pdf", 30 | " 3. Υλικό εξάσκησης > ΚΠΛ_2_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_2_Α1.pdf", 31 | " 3. Υλικό εξάσκησης > ΚΠΛ_3_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_3_Α1.pdf", 32 | " 3. Υλικό εξάσκησης > ΚΠΛ_4_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α1/ΚΠΛ_4_Α1.pdf", 33 | " 3. Υλικό εξάσκησης > ΚΠΛ_1_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_1_Α2.pdf", 34 | " 3. Υλικό εξάσκησης > ΚΠΛ_2_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_2_Α2.pdf", 35 | " 3. Υλικό εξάσκησης > ΚΠΛ_3_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_3_Α2.pdf", 36 | " 3. Υλικό εξάσκησης > ΚΠΛ_4_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΚΠΛ_Α2/ΚΠΛ_4_Α2.pdf", 37 | " 3. Υλικό εξάσκησης > ΠΓΛ_1_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_1_Α1.pdf", 38 | " 3. Υλικό εξάσκησης > ΠΓΛ_2_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_2_Α1.pdf", 39 | " 3. Υλικό εξάσκησης > ΠΓΛ_3_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_3_Α1.pdf", 40 | " 3. Υλικό εξάσκησης > ΠΓΛ_4_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_4_Α1.pdf", 41 | " 3. Υλικό εξάσκησης > ΠΓΛ_5_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α1/ΠΓΛ_5_Α1.pdf", 42 | " 3. Υλικό εξάσκησης > ΠΓΛ_1_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_1_Α2.pdf", 43 | " 3. Υλικό εξάσκησης > ΠΓΛ_2_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_2_Α2.pdf", 44 | " 3. Υλικό εξάσκησης > ΠΓΛ_3_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_3_Α2.pdf", 45 | " 3. 
Υλικό εξάσκησης > ΠΓΛ_4_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_4_Α2.pdf", 46 | " 3. Υλικό εξάσκησης > ΠΓΛ_5_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΓΛ_Α2/ΠΓΛ_5_Α2.pdf", 47 | " 3. Υλικό εξάσκησης > ΠΠΛ_Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΠΛ_Α1/ΠΠΛ_Α1.pdf", 48 | " 3. Υλικό εξάσκησης > ΠΠΛ_Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/3. Υλικό εξάσκησης/ΠΠΛ_Α2/ΠΠΛ_Α2.pdf", 49 | " 4. Κείμενα ΚΠΛ > Κείμενα ΚΠΛ": "https://www.greek-language.gr/certification/ΚΛΙΚ/4. Κείμενα ΚΠΛ/Κείμενα ΚΠΛ.pdf", 50 | " 5. Λύσεις των ασκήσεων > Απαντήσεις Α1": "https://www.greek-language.gr/certification/ΚΛΙΚ/5. Λύσεις των ασκήσεων/Απαντήσεις Α1.pdf", 51 | " 5. Λύσεις των ασκήσεων > Απαντήσεις Α2": "https://www.greek-language.gr/certification/ΚΛΙΚ/5. Λύσεις των ασκήσεων/Απαντήσεις Α2.pdf" 52 | } -------------------------------------------------------------------------------- /scraping/json_sitemaps/kentra-ekpaideusis-enhlikwn_pdf.json: -------------------------------------------------------------------------------- 1 | { 2 | "ταξίδι στη γλώσσα": "http://repository.edulll.gr/edulll/retrieve/742/127.pdf" 3 | } 4 | -------------------------------------------------------------------------------- /scraping/json_sitemaps/sitemap_explainer.txt: -------------------------------------------------------------------------------- 1 | JSON with {metadata : file_link} pairs. 2 | 3 | Each JSON file corresponds to a website source which contains eg files of university theses, school/ university books, or school/ uni entry exams. 4 | 5 | Metadata is the native categorization of the site, each level of recurssion is split by " > ", and ends with the file title. 6 | 7 | By using downloader10.py on each of these files you get files "paper_n.pdf" or similar and another JSON file 8 | associating {filename : metadata}. By running extractor4.py you get similar result but for "paper_n.txt" or similar. 
9 | -------------------------------------------------------------------------------- /scraping/json_sitemaps/themata-lyseis-panelladikwn_pdf.json: -------------------------------------------------------------------------------- 1 | { 2 | "2023 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/Istoria120623.pdf", 3 | "2023 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/fsm_fra_istoria120623new.pdf", 4 | "2023 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/Mathimatika060623.pdf", 5 | "2023 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/fsm_fra_mathimatika_060623.pdf", 6 | "2023 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/fsm_fra_mathimatika_060623.pdf", 7 | "2023 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2023/Fysiki120623.pdf", 8 | "2022 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/them_istoria100622.pdf", 9 | "2022 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/fsm_fra_istoria100622.pdf", 10 | "2022 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/them_math_gel_220606.pdf", 11 | "2022 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/fsm_fra_math060622.pdf", 12 | "2022 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/fsm_fra_math060622.pdf", 13 | "2022 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2022/them_fysiki100622.pdf", 14 | "2021 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/Istoria220621.pdf", 15 | "2021 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/fsm_fra_istoria220621.pdf", 16 | "2021 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/mathimatika_160621.pdf", 17 | "2021 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/fsm_fra_math160621.pdf", 18 | "2021 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/fsm_fra_math160621.pdf", 19 | "2021 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2021/Fysiki220621.pdf", 20 | "2020 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/Istoria240620.pdf", 21 | "2020 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/fsm_fra_istoria240620.pdf", 22 | "2020 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/mathimatika170620neo.pdf", 23 | "2020 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/fsm_fra_math170620final.pdf", 24 | "2020 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/fsm_fra_math170620final.pdf", 25 | "2020 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2020/Fysiki220620.pdf", 26 | "2019 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/Istoria_120619.pdf", 27 | "2019 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/fsm_fra_istoria120619final.pdf", 28 | "2019 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/Mathimatika_100619.pdf", 29 | "2019 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/fsm_fra_math100619final.pdf", 30 | "2019 > Φυσική > Θέματα": 
"https://eduadvisor.gr/images/stories/pdf/Panellinies2019/fsm_fra_math100619final.pdf", 31 | "2019 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2019/Fysiki_120619.pdf", 32 | "2018 > Ιστορία > Θέματα": "https://eduadvisor.grhttp://www.minedu.gov.gr/publications/docs2018/EXETASEIS-2018/them_ist_op_c_hmer_180613.pdf", 33 | "2018 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2018/fsm_fra_ist_130618.pdf", 34 | "2018 > Μαθηματικά > Θέματα": "https://eduadvisor.grhttp://www.minedu.gov.gr/publications/docs2018/EXETASEIS-2018/them_mat_op_c_hmer_180611.pdf", 35 | "2018 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2018/fsm_fra_math_110618final.pdf", 36 | "2018 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2018/fsm_fra_math_110618final.pdf", 37 | "2018 > Φυσική > Λύσεις": "https://eduadvisor.grhttp://www.minedu.gov.gr/publications/docs2018/EXETASEIS-2018/them_fis_op_c_hmer_180613.pdf", 38 | "2017 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Neo_Genikis_themata_070617.pdf", 39 | "2017 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Neo_Genikis_lyseis_070617.pdf", 40 | "2017 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/MathimatikaKat_Themata090617.pdf", 41 | "2017 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/MathimatikaKat_Lyseis090617.pdf", 42 | "2017 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Istoria_Themata_120617.pdf", 43 | "2017 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Istoria_Lyseis_120617.pdf", 44 | "2017 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/MathimatikaKat_Lyseis090617.pdf", 45 | "2017 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2017/Fysiki_Themata_120617.pdf", 46 | "2016 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/Neoelliniki16_themata.pdf", 47 | "2016 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_neo_16052016new.pdf", 48 | "2016 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/e_math_180516.pdf", 49 | "2016 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_math_18052016new.pdf", 50 | "2016 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/e_ist_23052016.pdf", 51 | "2016 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_ist_23052016new.pdf", 52 | "2016 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/fsm_fra_math_18052016new.pdf", 53 | "2016 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2016/e_fys_23052016.pdf", 54 | "2015 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_neo_18052015.pdf", 55 | "2015 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_18052015_new.pdf", 56 | "2015 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_18052015_new.pdf", 57 | "2015 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_math_20052015.pdf", 58 | "2015 > Βιολογία > Θέματα": 
"https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_math_20052015.pdf", 59 | "2015 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_math_20052015new.pdf", 60 | "2015 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_xhm_22052015.pdf", 61 | "2015 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_xhm_22052015new.pdf", 62 | "2015 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_neo_22052015.pdf", 63 | "2015 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_22052015new.pdf", 64 | "2015 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_neo_22052015new.pdf", 65 | "2015 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_ist_29052015.pdf", 66 | "2015 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_math_25052015.pdf", 67 | "2015 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_math_25052015new.pdf", 68 | "2015 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_math_25052015new.pdf", 69 | "2015 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_xhm_22052015.pdf", 70 | "2015 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/fsm_fra_xhm_22052015new.pdf", 71 | "2015 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies2015/e_fys_29052015.pdf", 72 | "2014 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_neo_28052014.pdf", 73 | "2014 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_updated.pdf", 74 | "2014 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_updated.pdf", 75 | "2014 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_math_0530.pdf", 76 | "2014 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_math_0530.pdf", 77 | "2014 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_math_0530_updated.pdf", 78 | "2014 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_xhm_0604.pdf", 79 | "2014 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_xhm_0406_new.pdf", 80 | "2014 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_neo_0604.pdf", 81 | "2014 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_0406_updated.pdf", 82 | "2014 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_neo_0406_updated.pdf", 83 | "2014 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_ist_1006.pdf", 84 | "2014 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_math_0206.pdf", 85 | "2014 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_math_0206_updated.pdf", 86 | "2014 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_math_0206_updated.pdf", 87 | "2014 > Χημεία Βιοχημεία > 
Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_xhm_0604.pdf", 88 | "2014 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/fsm_fra_xhm_0406_new.pdf", 89 | "2014 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Panellinies_2014/e_fys_1006.pdf", 90 | "2013 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/Pan_neo_gen_17.pdf", 91 | "2013 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/fsm_fra_neo_gen_17.pdf", 92 | "2013 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/fsm_fra_neo_gen_17.pdf", 93 | "2013 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/pan_math_gen_20.pdf", 94 | "2013 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/pan_math_gen_20.pdf", 95 | "2013 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/%CE%A0%CE%91%CE%9D%CE%95%CE%9B%CE%9B%CE%97%CE%9D%CE%99%CE%95%CE%A3%202013/%CE%98%CE%95%CE%9C%CE%91%CE%A4%CE%91%20%CE%9A%CE%91%CE%99%20%CE%91%CE%A0%CE%91%CE%9D%CE%A4%CE%97%CE%A3%CE%95%CE%99%CE%A3/2013/fsm_fra_math_20_final.pdf", 96 | "2013 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_xhm_vio_kat_24.pdf", 97 | "2013 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_xhm_vio_kat_24.pdf", 98 | "2013 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_neo_22.pdf", 99 | "2013 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_neo_22.pdf", 100 | "2013 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_neo_22.pdf", 101 | "2013 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_ist_kat_24.pdf", 102 | "2013 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_math_kat_27.pdf", 103 | "2013 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_math_kat_27_final2.pdf", 104 | "2013 > Χημεία Βιοχημεία > Θέματα": 
"https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_math_kat_27_final2.pdf", 105 | "2013 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_xhm_vio_kat_24.pdf", 106 | "2013 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/fsm_fra_xhm_vio_kat_24.pdf", 107 | "2013 > Αρχές Οργάνωσης και Διοίκησης Επιχειρήσεων > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2013/pan_fys_22.pdf", 108 | "2012 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/NeoellinikiGlossa21.pdf", 109 | "2012 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_neo.pdf", 110 | "2012 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_neo.pdf", 111 | "2012 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Mathimatika23.pdf", 112 | "2012 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/Mathimatika23.pdf", 113 | "2012 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_math.pdf", 114 | "2012 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_mathEPAL.pdf", 115 | "2012 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/FysikiKAT.pdf", 116 | "2012 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/NeoellKat.pdf", 117 | "2012 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_neoKAT.pdf", 118 | "2012 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_neoKAT.pdf", 119 | "2012 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Ist_kat30.pdf", 120 | "2012 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/MathI_EPAL24.pdf", 121 | "2012 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/tmthma_mathEPAL.pdf", 122 | "2012 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/tmthma_math28.pdf", 123 | "2012 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/Biox_kat30.pdf", 124 | "2011 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα.pdf", 125 | "2011 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα%20(Λύσεις).pdf", 126 | "2011 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα%20(Λύσεις).pdf", 127 | "2011 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf", 128 | "2011 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf", 129 | "2011 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής%20(Λύσεις).pdf", 130 | "2011 > Λογοτεχνία > Θέματα": 
"https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία.pdf", 131 | "2011 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία%20(Λύσεις).pdf", 132 | "2011 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία%20(Λύσεις).pdf", 133 | "2011 > Ιστορία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Ιστορία.pdf", 134 | "2011 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά.pdf", 135 | "2011 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά%20(Λύσεις).pdf", 136 | "2011 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά%20(Λύσεις).pdf", 137 | "2011 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Φυσική.pdf", 138 | "2011 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Μαθηματικά%20(Λύσεις).pdf", 139 | "2011 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Χημεία%20-%20Βιοχημεία.pdf", 140 | "2010-2001 > Νεοελληνική Γλώσσα > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Νεοελληνική%20Γλώσσα.pdf", 141 | "2010-2001 > Νεοελληνική Γλώσσα > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf", 142 | "2010-2001 > Μαθηματικά και Στοιχεία Στατιστικής > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Μαθηματικά%20και%20Στοιχεία%20Στατιστικής.pdf", 143 | "2010-2001 > Μαθηματικά και Στοιχεία Στατιστικής > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Βιολογία%20Γενικής%20Παιδείας.pdf", 144 | "2010-2001 > Βιολογία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Βιολογία%20Γενικής%20Παιδείας.pdf", 145 | "2010-2001 > Βιολογία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΓΕΝΙΚΗ%20ΠΑΙΔΕΙΑ/Φυσική%20Γενικής%20Παιδείας.pdf", 146 | "2010-2001 > Λογοτεχνία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Λογοτεχνία.pdf", 147 | "2010-2001 > Λογοτεχνία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Ιστορία%20Κατεύθυνσης.pdf", 148 | "2010-2001 > Ιστορία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Ιστορία%20Κατεύθυνσης.pdf", 149 | "2010-2001 > Ιστορία > Λύσεις": 
"https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΘΕΩΡΗΤΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Αρχαία.pdf", 150 | "2010-2001 > Μαθηματικά > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Μαθηματικά%20Κατεύθυνσης.pdf", 151 | "2010-2001 > Μαθηματικά > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Φυσική%20Κατεύθυνσης.pdf", 152 | "2010-2001 > Φυσική > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Φυσική%20Κατεύθυνσης.pdf", 153 | "2010-2001 > Φυσική > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ%20ΙΙ/Αρχές%20Οργάνωσης%20&%20Διοίκησης.pdf", 154 | "2010-2001 > Χημεία Βιοχημεία > Θέματα": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Χημεία%20-%20Βιοχημεία.pdf", 155 | "2010-2001 > Χημεία Βιοχημεία > Λύσεις": "https://eduadvisor.gr/images/stories/pdf/ΠΑΝΕΛΛΗΝΙΕΣ%202013/ΘΕΜΑΤΑ%20ΚΑΙ%20ΑΠΑΝΤΗΣΕΙΣ/2001-2011/ΤΕΧΝΟΛΟΓΙΚΗ%20ΚΑΤΕΥΘΥΝΣΗ/Φυσική%20Κατεύθυνσης.pdf" 156 | } -------------------------------------------------------------------------------- /test_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Simple test of the GlossAPI Corpus functionality with the refactored pipeline 4 | """ 5 | import logging 6 | from pathlib import Path 7 | from glossapi.corpus import Corpus 8 | 9 | # Configure logging 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger("simple_test") 12 | 13 | # Test directory - using the directory where we downloaded the paper 14 | TEST_DIR = Path("/home/fivos/CascadeProjects/glossAPI/corpus_test") 15 | 16 | def main(): 17 | # Create a basic corpus object - using same directory for input and output 18 | logger.info("Creating Corpus object") 19 | corpus = Corpus( 20 | input_dir=TEST_DIR, 21 | output_dir=TEST_DIR 22 | ) 23 | 24 | # Skipping download step since we already have the PDF file 25 | logger.info("Skipping download step (already have the PDF file)") 26 | 27 | # 2. Extract 28 | logger.info("Running extract step") 29 | # Specify the formats we know are in the downloads directory 30 | corpus.extract() 31 | 32 | # 4. Section - now uses files marked as 'good' quality 33 | logger.info("Running section step") 34 | corpus.section() 35 | 36 | # 5. Annotate 37 | logger.info("Running annotate step") 38 | corpus.annotate(annotation_type="chapter") 39 | 40 | # Check results 41 | logger.info("Pipeline completed") 42 | 43 | if __name__ == "__main__": 44 | main() 45 | --------------------------------------------------------------------------------