├── .gitignore ├── LICENSE ├── README.md ├── arxiv-tools ├── README.md ├── download.py ├── eda_manifest.py └── extract_pdfs.py ├── job_status_server.py ├── postprocess.py ├── requirements.txt ├── run_nougat.py └── utils ├── .DS_Store ├── check_complete_results.py └── get_metadata.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Neural Work 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Arxiver 2 | 3 | A toolkit for downloading and converting arXiv papers to multi markdown (.mmd) format with Nougat - a neural OCR. Our pipeline can extract LaTeX equations and includes post-processing tools to clean up and merge extracted data. See the [arxiver](https://huggingface.co/datasets/neuralwork/arxiver) dataset on Hugging Face Hub for sample results. 4 | 5 | ## Project Structure 6 | ``` 7 | arxiver/ 8 | arxiv-tools/ # Tools for downloading arXiv papers 9 | utils/ # Utility files to check processed data, get article metadata, etc. 
10 | run_nougat.py # Batch PDF processing script to extract text in .mmd format 11 | job_status_server.py # Web server to monitor extraction progress 12 | postprocess.py # Post-processing script to clean and merge Nougat outputs 13 | ``` 14 | 15 | ## Downloading arXiv 16 | 17 | The `arxiv-tools` folder contains scripts for downloading arXiv papers and computing useful statistics about the arXiv dataset. For detailed instructions, see the [arxiv-tools README](arxiv-tools/README.md). Downloading and extracting the dataset creates a hierarchical folder structure organized by publication year and month as follows: 18 | 19 | ``` 20 | output_dir/ 21 | 2310/ # October 2023 22 | paper1.pdf 23 | paper2.pdf 24 | 2311/ # November 2023 25 | paper3.pdf 26 | paper4.pdf 27 | ``` 28 | 29 | ## Nougat Processing 30 | 31 | The `run_nougat.py` script processes PDF files in batches using the [Nougat](https://arxiv.org/abs/2308.13418) neural OCR model: 32 | 33 | ```bash 34 | python run_nougat.py \ 35 | --input_dir /path/to/datadir \ 36 | --output_dir /path/to/output \ 37 | --gpu_id 0 \ 38 | --batch_size 8 39 | ``` 40 | 41 | You can run Nougat using the extracted data directory from the previous step as the input argument. The script processes PDFs in batches on the specified GPU and logs successful and failed jobs (Nougat is not 100% stable). The output maintains the same year-month subdirectory structure but saves each page separately: 42 | ``` 43 | output_dir/ 44 | 2310/ 45 | paper1_1.mmd # Paper 1, page 1 46 | paper1_2.mmd # Paper 1, page 2 47 | paper2_1.mmd 48 | 2311/ 49 | paper3_1.mmd 50 | paper3_2.mmd 51 | paper4_1.mmd 52 | ``` 53 | 54 | #### Progress Monitoring 55 | We provide an optional script, `job_status_server.py`, that serves a web interface for monitoring processing progress: 56 | 57 | ```bash 58 | python job_status_server.py \ 59 | --input_dir /path/to/pdf/files \ 60 | --output_dir /path/to/output \ 61 | --port 8005 62 | ``` 63 | 64 | 65 | ## Post-Processing 66 | The post-processing pipeline includes several steps to validate and clean up the Nougat output. You can optionally check how many of the papers have been fully processed (all pages successfully extracted) by running: 67 | ```bash 68 | cd utils 69 | python check_complete_results.py --pdf-dir /path/to/pdf/root/dir --mmd-dir /path/to/mmd/root/dir 70 | ``` 71 | 72 | You can use the output .mmd files as they are, or run post-processing to remove headers and references and merge multi-page MMD files into single documents. To do this, run the post-processing script: 73 | ```bash 74 | cd .. 75 | python postprocess.py --input-dir /path/to/processed-data --output-dir /path/to/output 76 | ``` 77 | 78 | Note that this script preserves the original hierarchical folder structure organized by publication year and month.
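For downstream use, the merged per-article files can be read back with a few lines of Python. A minimal sketch, assuming `/path/to/output` is a placeholder for the `--output-dir` passed to `postprocess.py` (which keeps the YYMM layout):

```python
from pathlib import Path

output_dir = Path("/path/to/output")  # placeholder: the postprocess.py --output-dir value

# walk the YYMM month folders and read each merged article
for month_dir in sorted(p for p in output_dir.iterdir() if p.is_dir()):
    for mmd_file in sorted(month_dir.glob("*.mmd")):
        text = mmd_file.read_text(encoding="utf-8")
        print(month_dir.name, mmd_file.stem, len(text))
```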
79 | 80 | #### Metadata Extraction 81 | You can optionally get article metadata by running: 82 | ```bash 83 | cd utils 84 | python get_metadata.py --input-dir /path/to/merged-mmd-folder 85 | ``` 86 | 87 | ## Notes 88 | - GPU with CUDA support is required for efficient processing 89 | - Tested on an NVIDIA T4 GPU, processing speed depends on GPU memory and batch size 90 | - arxiv-tools/ is adapted from the original [repo](https://github.com/armancohan/arxiv-tools) 91 | 92 | Buy Me a Coffee at ko-fi.com 93 | 94 | From [neuralwork](https://neuralwork.ai/) with :heart: 95 | -------------------------------------------------------------------------------- /arxiv-tools/README.md: -------------------------------------------------------------------------------- 1 | # arXiv-tools 2 | 3 | A tool for downloading and managing arXiv documents in bulk using Amazon S3. 4 | 5 | ## Prerequisites 6 | 7 | - [Amazon AWS Account](https://aws.amazon.com/free) - required for accessing arXiv's bulk data on [Amazon S3](https://aws.amazon.com/s3) 8 | - Python 2.x to use the `s3cmd` package 9 | - Python 3.x for manifest file analysis 10 | 11 | ## Installation 12 | 13 | 1. Install s3cmd, a command-line tool for interacting with Amazon S3: 14 | ```bash 15 | pip install s3cmd # Python 2 only 16 | ``` 17 | 18 | 2. Configure s3cmd with your AWS credentials: 19 | ```bash 20 | s3cmd --configure 21 | ``` 22 | > Note: You'll need your AWS credentials from the Account Management tab on the AWS website. 23 | 24 | 3. Install required Python packages for manifest file analysis: 25 | ```bash 26 | pip install pandas # For Python 3.x 27 | ``` 28 | 29 | ## Usage 30 | 31 | ### 1. Download Manifest Files 32 | 33 | First, download the manifest files containing the complete list of available arXiv files: 34 | 35 | **For PDF documents:** 36 | ```bash 37 | s3cmd get --requester-pays \ 38 | s3://arxiv/pdf/arXiv_pdf_manifest.xml \ 39 | local-directory/arXiv_pdf_manifest.xml 40 | ``` 41 | 42 | **For source documents:** 43 | ```bash 44 | s3cmd get --requester-pays \ 45 | s3://arxiv/src/arXiv_src_manifest.xml \ 46 | local-directory/arXiv_src_manifest.xml 47 | ``` 48 | 49 | ### 2. Analyze Manifest Files (Optional) 50 | 51 | Use the `eda_manifest.py` script to analyze the manifest files: 52 | 53 | ```bash 54 | python eda_manifest.py 55 | ``` 56 | 57 | This script provides useful statistics about the arXiv dataset: 58 | - Total size of the dataset in bytes, MB, and GB 59 | - Total number of articles 60 | - Average file sizes 61 | - Number of files per time period 62 | - Detailed statistics for recent years (2022-2023) 63 | 64 | ### 3. Download arXiv Files 65 | 66 | Use the `download.py` script to fetch the actual files: 67 | 68 | **For PDF files:** 69 | ```bash 70 | python download.py \ 71 | --manifest_file /path/to/pdf-manifest \ 72 | --mode pdf \ 73 | --output_dir /path/to/output 74 | ``` 75 | 76 | **For source files:** 77 | ```bash 78 | python download.py \ 79 | --manifest_file /path/to/src-manifest \ 80 | --mode src \ 81 | --output_dir /path/to/output 82 | ``` 83 | 84 | The files will be downloaded to your specified output directory. Each file is in `.tar` format and approximately 500MB in size. 85 | 86 | ### 4.
Extract PDFs from Tar Files 87 | 88 | After downloading the tar files, use `extract_pdfs.py` to extract PDFs into organized directories: 89 | 90 | ```bash 91 | python extract_pdfs.py \ 92 | --data_dir /path/to/tar/files \ 93 | --output_dir /path/to/output \ 94 | [--keep_tars] # Optional: keep original tar files 95 | ``` 96 | 97 | The script will create and extract pdf files to year-month subdirectories (e.g., "2310" for October 2023). Example output structure: 98 | ``` 99 | output_dir/ 100 | 2310/ # October 2023 101 | paper1.pdf 102 | paper2.pdf 103 | 2311/ # November 2023 104 | paper3.pdf 105 | paper4.pdf 106 | ``` 107 | 108 | 109 | ## Additional Resources 110 | - For metadata downloads, consider using [metha](https://github.com/miku/metha) 111 | 112 | ## Notes 113 | - The arXiv files are stored in requester-pays buckets on Amazon S3 114 | - Each archive file is approximately 500MB in size and uses the `.tar` format -------------------------------------------------------------------------------- /arxiv-tools/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import subprocess 4 | from argparse import ArgumentParser 5 | import xml.etree.ElementTree as ET 6 | 7 | 8 | # set up logging configuration 9 | log_file = os.path.join("logs", "preprocessing_logs.log") 10 | os.makedirs("logs", exist_ok=True) 11 | 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format="%(asctime)s - %(levelname)s - %(message)s", 15 | handlers=[ 16 | logging.FileHandler(log_file), 17 | logging.StreamHandler() 18 | ] 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | def download_files(**args): 23 | manifest_file = args["manifest_file"] 24 | mode = args["mode"] 25 | out_dir = args["output_dir"] 26 | 27 | if mode != "pdf" and mode != "src": 28 | logger.error("Invalid mode: %s. 
Mode should be 'pdf' or 'src'.", mode) 29 | 30 | def get_file(fname, out_dir): 31 | cmd = ["s3cmd", "get", "--requester-pays", "s3://arxiv/%s" % fname, "./%s" % out_dir] 32 | logger.info("Downloading file: %s to %s", fname, out_dir) 33 | subprocess.call(' '.join(cmd), shell=True) 34 | 35 | try: 36 | for file in ET.parse(manifest_file).getroot().findall("file")[:1]: 37 | filename = file.find("filename").text 38 | logger.info("Processing file: %s", filename) 39 | 40 | get_file(filename, out_dir='%s/%s/' % (out_dir, mode)) 41 | logger.debug("Successfully downloaded: %s", filename) 42 | except Exception as e: 43 | logger.error("Failed to process manifest file: %s", str(e), exc_info=True) 44 | 45 | logger.info("Download process completed") 46 | 47 | 48 | if __name__ == "__main__": 49 | argparser = ArgumentParser() 50 | argparser.add_argument("--manifest_file", "-m", type=str, help="The manifest file to download files from arXiv.", required=True) 51 | argparser.add_argument("--output_dir", "-o", type=str, default="data", help="Output directory to save files to.") 52 | argparser.add_argument("--mode", type=str, default="src", choices=set(("pdf", "src")), help="Can be 'pdf' or 'src'.") 53 | args = argparser.parse_args() 54 | download_files(**vars(args)) 55 | -------------------------------------------------------------------------------- /arxiv-tools/eda_manifest.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3 3 | arXiv Manifest Analysis Script 4 | 5 | This script analyzes XML manifest files from arXiv's bulk data access, 6 | providing insights about file sizes, article counts, and temporal distribution. 7 | """ 8 | 9 | import xml.etree.ElementTree as ET 10 | import pandas as pd 11 | from typing import List, Dict 12 | 13 | 14 | def parse_manifest(filepath: str) -> pd.DataFrame: 15 | """ 16 | Parse the arXiv manifest XML file into a pandas DataFrame. 17 | 18 | Args: 19 | filepath: Path to the manifest XML file 20 | 21 | Returns: 22 | DataFrame containing parsed manifest data 23 | """ 24 | tree = ET.parse(filepath) 25 | root = tree.getroot() 26 | 27 | data = [{ 28 | "Filename": element.find("filename").text, 29 | "Number of Items": int(element.find("num_items").text), 30 | "Size": int(element.find("size").text), 31 | "Timestamp": element.find("timestamp").text, 32 | "YYMM": element.find("yymm").text, 33 | } for element in root.findall("file")] 34 | 35 | return pd.DataFrame(data) 36 | 37 | 38 | def analyze_total_statistics(df: pd.DataFrame) -> Dict: 39 | """Calculate and return overall statistics from the manifest data.""" 40 | total_size = df["Size"].sum() 41 | total_articles = df["Number of Items"].sum() 42 | total_tars = len(df) 43 | 44 | return { 45 | "total_size_bytes": total_size, 46 | "total_size_mb": total_size / 1e6, 47 | "total_size_gb": total_size / 1e9, 48 | "total_articles": total_articles, 49 | "total_tar_files": total_tars, 50 | "avg_article_size_mb": (total_size / total_articles) / 1e6, 51 | "avg_tar_size_mb": (total_size / total_tars) / 1e6, 52 | "avg_items_per_tar": total_articles / total_tars 53 | } 54 | 55 | 56 | def analyze_yearly_data(df: pd.DataFrame, year: str) -> Dict: 57 | """ 58 | Analyze manifest data for a specific year. 
59 | 60 | Args: 61 | df: Full manifest DataFrame 62 | year: Two-digit year string (e.g., "22" for 2022) 63 | 64 | Returns: 65 | Dictionary containing year-specific statistics 66 | """ 67 | year_df = df[df["YYMM"].str.startswith(year)] 68 | total_size = year_df["Size"].sum() 69 | total_articles = year_df["Number of Items"].sum() 70 | 71 | return { 72 | "total_articles": total_articles, 73 | "total_size_gb": total_size / 1e9, 74 | "data": year_df 75 | } 76 | 77 | 78 | def print_statistics(stats: Dict) -> None: 79 | """Print formatted statistics.""" 80 | print("\n=== Overall Statistics ===") 81 | print(f"Total Size: {stats['total_size_gb']:.2f} GB ({stats['total_size_bytes']:,} bytes)") 82 | print(f"Total Articles: {stats['total_articles']:,}") 83 | print(f"Total TAR Files: {stats['total_tar_files']:,}") 84 | print(f"\nAverages:") 85 | print(f"- Article Size: {stats['avg_article_size_mb']:.2f} MB") 86 | print(f"- TAR File Size: {stats['avg_tar_size_mb']:.2f} MB") 87 | print(f"- Items per TAR: {stats['avg_items_per_tar']:.1f}") 88 | 89 | 90 | def main(): 91 | # path to input manifest file 92 | MANIFEST_PATH = "arXiv_pdf_manifest.xml" 93 | 94 | # load and parse manifest 95 | print("Parsing manifest file...") 96 | df = parse_manifest(MANIFEST_PATH) 97 | 98 | # calculate overall statistics 99 | stats = analyze_total_statistics(df) 100 | print_statistics(stats) 101 | 102 | # analyze recent years 103 | print("\n=== Recent Years Analysis ===") 104 | for year in ["22", "23"]: 105 | year_stats = analyze_yearly_data(df, year) 106 | print(f"\nYear 20{year}:") 107 | print(f"- Articles: {year_stats['total_articles']:,}") 108 | print(f"- Size: {year_stats['total_size_gb']:.2f} GB") 109 | 110 | # export year-specific data if needed 111 | year_stats["data"].to_csv(f"df_{year}.csv", index=False) 112 | 113 | # print unique YYMM values for reference 114 | print("\n=== Time Coverage ===") 115 | unique_yymm = sorted(df["YYMM"].unique()) 116 | print(f"Coverage period: {unique_yymm[0]} to {unique_yymm[-1]}") 117 | 118 | 119 | if __name__ == "__main__": 120 | main() -------------------------------------------------------------------------------- /arxiv-tools/extract_pdfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | arXiv Tar Extraction Script 4 | 5 | Extracts PDF files from arXiv tar archives into organized directories. 6 | The script processes tar files named like 'arXiv_pdf_YY_MM_N.tar' and organizes PDFs into 7 | YYMM directories. 
8 | 9 | Usage: 10 | python extract_arxiv.py --data_dir /path/to/tar/files 11 | """ 12 | 13 | import os 14 | import tarfile 15 | import logging 16 | import argparse 17 | from typing import List 18 | from pathlib import Path 19 | 20 | 21 | def setup_logging() -> logging.Logger: 22 | """Configure and return logger.""" 23 | logging.basicConfig( 24 | level=logging.INFO, 25 | format="%(asctime)s - %(levelname)s - %(message)s", 26 | handlers=[ 27 | logging.FileHandler("arxiv_extract.log"), 28 | logging.StreamHandler() 29 | ] 30 | ) 31 | return logging.getLogger("arxiv_extractor") 32 | 33 | 34 | def parse_args() -> argparse.Namespace: 35 | """Parse command line arguments.""" 36 | parser = argparse.ArgumentParser( 37 | description="Extract PDFs from arXiv tar archives", 38 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 39 | ) 40 | parser.add_argument( 41 | "--data_dir", 42 | type=str, 43 | required=True, 44 | help="Directory containing arXiv tar files" 45 | ) 46 | parser.add_argument( 47 | "--output_dir", 48 | type=str, 49 | default="extracted", 50 | help="Base directory for extracted PDFs" 51 | ) 52 | parser.add_argument( 53 | "--keep_tars", 54 | action="store_true", 55 | help="Keep tar files after extraction (default: delete)" 56 | ) 57 | 58 | return parser.parse_args() 59 | 60 | 61 | def get_tar_files(data_dir: Path) -> List[Path]: 62 | """ 63 | Find all tar files in the specified directory. 64 | 65 | Args: 66 | data_dir: Directory to search for tar files 67 | 68 | Returns: 69 | List of paths to tar files 70 | """ 71 | return list(data_dir.glob("*.tar")) 72 | 73 | 74 | def extract_pdfs_from_tar( 75 | tar_path: Path, 76 | output_base: Path, 77 | keep_tar: bool = False 78 | ) -> bool: 79 | """ 80 | Extract PDF files from a tar archive into a year-month directory. 
81 | 82 | Args: 83 | tar_path: Path to the tar file 84 | output_base: Base directory for extracted files 85 | keep_tar: Whether to keep the tar file after extraction 86 | 87 | Returns: 88 | bool: True if extraction successful, False otherwise 89 | """ 90 | logger = logging.getLogger("arxiv_extractor") 91 | 92 | try: 93 | # extract directory name (YYMM) from tar file name 94 | # example: "arXiv_pdf_23_10_1.tar" -> "2310" 95 | parts = tar_path.stem.split("_") 96 | if len(parts) != 5 or not parts[2].isdigit() or not parts[3].isdigit(): 97 | logger.warning("Invalid tar file name format: %s", tar_path.name) 98 | return False 99 | 100 | dir_name = parts[2] + parts[3] # e.g., "2310" for year 23, month 10 101 | output_dir = output_base / dir_name 102 | output_dir.mkdir(parents=True, exist_ok=True) 103 | 104 | logger.info("Extracting %s to directory %s", tar_path.name, output_dir) 105 | 106 | # Extract PDF files 107 | with tarfile.open(tar_path, "r") as tar: 108 | pdf_members = [m for m in tar.getmembers() 109 | if m.isreg() and m.name.endswith(".pdf")] 110 | 111 | total_pdfs = len(pdf_members) 112 | for i, member in enumerate(pdf_members, 1): 113 | tar.extract(member, output_dir) 114 | if i % 100 == 0: # log progress every 100 files 115 | logger.info("Extracted %d/%d PDFs (%.1f%%)", 116 | i, total_pdfs, (i/total_pdfs)*100) 117 | 118 | logger.info("Successfully extracted %d PDF files to %s", 119 | total_pdfs, output_dir) 120 | 121 | # clean up tar file if requested 122 | if not keep_tar: 123 | tar_path.unlink() 124 | logger.info("Deleted tar file: %s", tar_path.name) 125 | 126 | return True 127 | 128 | except Exception as e: 129 | logger.error("Error processing %s: %s", tar_path.name, str(e)) 130 | return False 131 | 132 | 133 | def main(): 134 | args = parse_args() 135 | logger = setup_logging() 136 | 137 | data_dir = Path(args.data_dir) 138 | output_base = Path(args.output_dir) 139 | 140 | if not data_dir.is_dir(): 141 | logger.error("Data directory does not exist: %s", data_dir) 142 | return 143 | 144 | # create output base directory 145 | output_base.mkdir(parents=True, exist_ok=True) 146 | 147 | # get list of tar files 148 | tar_files = get_tar_files(data_dir) 149 | if not tar_files: 150 | logger.error("No tar files found in %s", data_dir) 151 | return 152 | 153 | logger.info("Found %d tar files to process", len(tar_files)) 154 | 155 | # process each tar file 156 | successful = 0 157 | failed = 0 158 | 159 | for tar_path in tar_files: 160 | if extract_pdfs_from_tar(tar_path, output_base, args.keep_tars): 161 | successful += 1 162 | else: 163 | failed += 1 164 | 165 | # log final statistics 166 | logger.info("\nExtraction complete:") 167 | logger.info("- Successful: %d", successful) 168 | logger.info("- Failed: %d", failed) 169 | logger.info("- Total processed: %d", len(tar_files)) 170 | 171 | 172 | if __name__ == "__main__": 173 | main() 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /job_status_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3 3 | Job Status Server 4 | 5 | A FastAPI server that monitors and reports Nougat inference progress. 6 | Scans directories of MMD files to track processing progress. 
7 | 8 | Dependencies: 9 | - fastapi 10 | - uvicorn[standard] 11 | 12 | Usage: 13 | python job_status_server.py 14 | """ 15 | 16 | import logging 17 | import argparse 18 | from datetime import datetime 19 | from pathlib import Path 20 | from typing import Tuple 21 | 22 | from fastapi import FastAPI, HTTPException 23 | from fastapi.responses import HTMLResponse 24 | import uvicorn 25 | 26 | 27 | def parse_args() -> argparse.Namespace: 28 | """Parse command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description="Monitor Nougat inference progress", 31 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 32 | ) 33 | parser.add_argument( 34 | "--input_dir", 35 | type=str, 36 | required=True, 37 | help="Directory containing source PDF files" 38 | ) 39 | parser.add_argument( 40 | "--output_dir", 41 | type=str, 42 | required=True, 43 | help="Directory containing output MMD files" 44 | ) 45 | parser.add_argument( 46 | "--port", 47 | type=int, 48 | default=8005, 49 | help="Port number for the server" 50 | ) 51 | return parser.parse_args() 52 | 53 | 54 | # Configure logging 55 | logging.basicConfig( 56 | level=logging.INFO, 57 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 58 | handlers=[ 59 | logging.FileHandler("job_server.log"), 60 | logging.StreamHandler() 61 | ] 62 | ) 63 | logger = logging.getLogger("job_status_server") 64 | 65 | # initialize FastAPI app 66 | app = FastAPI( 67 | title="Nougat Job Status Server", 68 | description="Monitor Nougat inference progress", 69 | version="1.0.0" 70 | ) 71 | 72 | input_dir: Path = None 73 | output_dir: Path = None 74 | start_time: datetime = None 75 | 76 | 77 | def calculate_time_difference(start: datetime, end: datetime) -> str: 78 | """Calculate and format the time difference between two timestamps.""" 79 | time_difference = abs(end - start) 80 | days = time_difference.days 81 | hours, remainder = divmod(time_difference.seconds, 3600) 82 | minutes, _ = divmod(remainder, 60) 83 | 84 | return f"{days} days, {hours} hours, and {minutes} minutes" 85 | 86 | 87 | def count_pdf_files() -> int: 88 | """Count total number of PDF files in input directory.""" 89 | total = 0 90 | for month_dir in input_dir.iterdir(): 91 | if month_dir.is_dir(): 92 | total += len(list(month_dir.glob("*.pdf"))) 93 | return total 94 | 95 | 96 | def get_processed_files() -> dict: 97 | """Get count of processed files per month directory.""" 98 | processed = {} 99 | if output_dir.exists(): 100 | for month_dir in output_dir.iterdir(): 101 | if month_dir.is_dir(): 102 | processed[month_dir.name] = len(list(month_dir.glob("*.mmd"))) 103 | return processed 104 | 105 | 106 | def get_job_stats() -> Tuple[int, int, float, dict]: 107 | """Calculate current job statistics.""" 108 | try: 109 | total_pdfs = count_pdf_files() 110 | processed_files = get_processed_files() 111 | total_processed = sum(processed_files.values()) 112 | remaining = total_pdfs - total_processed 113 | percentage = (total_processed / total_pdfs * 100) if total_pdfs > 0 else 0 114 | 115 | logger.info( 116 | "Stats - Total PDFs: %d, Processed: %d, Remaining: %d, Percentage: %.2f%%", 117 | total_pdfs, total_processed, remaining, percentage 118 | ) 119 | 120 | return total_pdfs, total_processed, remaining, percentage, processed_files 121 | 122 | except Exception as e: 123 | logger.error("Error calculating job stats: %s", str(e)) 124 | raise HTTPException(status_code=500, detail="Error calculating job statistics") 125 | 126 | 127 | @app.get("/", response_class=HTMLResponse) 128 | def 
status() -> HTMLResponse: 129 | """Generate HTML status page showing current job statistics.""" 130 | try: 131 | total_pdfs, processed, remaining, percentage, processed_files = get_job_stats() 132 | elapsed_time = calculate_time_difference(start_time, datetime.now()) 133 | 134 | # generate month-wise progress HTML 135 | month_progress = "" 136 | for month, count in sorted(processed_files.items()): 137 | month_progress += f"<p>Month {month}: {count:,} files processed</p>\n" 138 | 139 | return f""" 140 | <html> 141 | <head> 142 | <title>Nougat Inference Status</title> 143 | <style> 169 | </style> 170 | </head> 171 | <body> <h1>Nougat Inference Progress</h1> 172 | 173 | <div class="stats"> 174 | 175 | <p>Total PDF files: {total_pdfs:,}</p> 176 | <p>Processed files: {processed:,}</p> 177 | <p>Remaining files: {remaining:,}</p> 178 | <p>Completion: {percentage:.2f}%</p> 179 | <p>Time elapsed: {elapsed_time}</p> 180 | </div> 181 | <h2>Progress by Month</h2>
182 | {month_progress} 183 | 184 | 185 | """ 186 | 187 | except Exception as e: 188 | logger.error("Error generating status page: %s", str(e)) 189 | raise HTTPException(status_code=500, detail="Error generating status page") 190 | 191 | 192 | @app.on_event("startup") 193 | async def startup_event(): 194 | """Initialize server state and log startup.""" 195 | global start_time 196 | start_time = datetime.now() 197 | 198 | logger.info("Job Status Server starting up on port %d", args.port) 199 | logger.info("Monitoring input directory: %s", input_dir) 200 | logger.info("Monitoring output directory: %s", output_dir) 201 | total_pdfs = count_pdf_files() 202 | logger.info("Total PDF files to process: %d", total_pdfs) 203 | 204 | 205 | if __name__ == "__main__": 206 | args = parse_args() 207 | input_dir = Path(args.input_dir) 208 | output_dir = Path(args.output_dir) 209 | uvicorn.run(app, host="0.0.0.0", port=args.port) -------------------------------------------------------------------------------- /postprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import argparse 5 | from pathlib import Path 6 | from typing import Set, List, Tuple 7 | 8 | 9 | def read_mmd(file_path: str) -> str: 10 | """Read MMD file content.""" 11 | with open(file_path, "r", encoding="utf-8") as f: 12 | return f.read() 13 | 14 | 15 | def parse_filename(filename: str) -> Tuple[str, str]: 16 | """Extract article ID and page number from filename.""" 17 | base_name = filename[:-4] if filename.endswith(".mmd") else filename 18 | paper_id, page_num = base_name.rsplit("_", 1) 19 | return paper_id, page_num 20 | 21 | 22 | def detect_headers(mmd: str) -> List[Tuple[int, str]]: 23 | """Detect headers in MMD content.""" 24 | return [(i, line) for i, line in enumerate(mmd.splitlines()) if line.startswith("#")] 25 | 26 | 27 | def has_abstract(mmd: str) -> bool: 28 | """Check if MMD content contains an abstract.""" 29 | return any("abstract" in line.lower() for line in mmd.splitlines()) 30 | 31 | 32 | def find_references(mmd: str) -> bool: 33 | """Find references section in MMD content.""" 34 | for line in mmd.splitlines(): 35 | if line.startswith("#") and "references" in line.lower(): 36 | return True 37 | return False 38 | 39 | 40 | def remove_authors(mmd: str) -> str: 41 | """Remove author names while preserving layout.""" 42 | lines = mmd.splitlines() 43 | abstract_line = 0 44 | for i, line in enumerate(lines): 45 | if line.startswith("#") and "abstract" in line.lower(): 46 | abstract_line = i 47 | break 48 | return "\n".join([lines[0], ""] + lines[abstract_line:]) 49 | 50 | 51 | def remove_references(mmd: str) -> str: 52 | """Remove content after references section.""" 53 | lines = mmd.splitlines() 54 | for i, line in enumerate(lines): 55 | if line.startswith("#") and "references" in line.lower(): 56 | return '\n'.join(lines[:i]) 57 | return mmd 58 | 59 | class ArticleProcessor: 60 | def __init__(self): 61 | self.headers_detected = set() 62 | self.abstract_detected = set() 63 | self.reference_pages = {} 64 | self.article_pages = {} 65 | # track which month directory each article belongs to 66 | self.article_months = {} 67 | 68 | def process_month_directory(self, month_dir: Path): 69 | """Process all MMD files in a month directory.""" 70 | if not month_dir.is_dir(): 71 | return 72 | 73 | month_name = month_dir.name 74 | for mmd_path in month_dir.glob("*.mmd"): 75 | paper_id, page_num = parse_filename(mmd_path.name) 76 | 77 | # store month 
information 78 | self.article_months[paper_id] = month_name 79 | 80 | # store page information 81 | if paper_id not in self.article_pages: 82 | self.article_pages[paper_id] = [] 83 | self.article_pages[paper_id].append(page_num) 84 | 85 | try: 86 | mmd_content = read_mmd(str(mmd_path)) 87 | 88 | # only process first page for headers and abstract 89 | if page_num == '1': 90 | if has_abstract(mmd_content): 91 | self.abstract_detected.add(paper_id) 92 | if len(detect_headers(mmd_content)) > 1: 93 | self.headers_detected.add(paper_id) 94 | 95 | # check for references 96 | if find_references(mmd_content): 97 | self.reference_pages[paper_id] = page_num 98 | 99 | except Exception as e: 100 | print(f"Error processing {mmd_path}: {e}") 101 | 102 | def get_valid_articles(self) -> Set[str]: 103 | """Return articles with both headers and abstract.""" 104 | return self.headers_detected.intersection(self.abstract_detected) 105 | 106 | def postprocess_articles(input_dir: Path, output_dir: Path, processor: ArticleProcessor): 107 | """Postprocess articles by removing authors and references.""" 108 | valid_articles = processor.get_valid_articles() 109 | 110 | for article_id in valid_articles: 111 | if article_id not in processor.reference_pages or article_id not in processor.article_pages: 112 | continue 113 | 114 | # get the year-month directory for this article 115 | month = processor.article_months[article_id] 116 | month_output_dir = output_dir / month 117 | month_output_dir.mkdir(parents=True, exist_ok=True) 118 | 119 | pages = sorted([int(p) for p in processor.article_pages[article_id]]) 120 | ref_page = int(processor.reference_pages[article_id]) 121 | processed_content = [] 122 | 123 | for page_num in pages: 124 | mmd_path = input_dir / month / f"{article_id}_{page_num}.mmd" 125 | if not mmd_path.exists(): 126 | continue 127 | 128 | content = read_mmd(str(mmd_path)) 129 | 130 | if page_num == 1: 131 | content = remove_authors(content) 132 | elif page_num == ref_page: 133 | if not content.splitlines()[0].lower().startswith("# reference"): 134 | content = remove_references(content) 135 | else: 136 | continue 137 | elif page_num > ref_page: 138 | continue 139 | 140 | processed_content.append(content) 141 | 142 | if processed_content: 143 | output_path = month_output_dir / f"{article_id}.mmd" 144 | with open(output_path, "w", encoding="utf-8") as f: 145 | f.write('\n'.join(processed_content)) 146 | 147 | def main(args): 148 | input_dir = Path(args.input_dir) 149 | output_dir = Path(args.output_dir) 150 | 151 | print("Processing MMD files...") 152 | start_time = time.time() 153 | 154 | # initialize processor 155 | processor = ArticleProcessor() 156 | 157 | # process each month directory 158 | for month_dir in input_dir.iterdir(): 159 | if month_dir.is_dir(): 160 | print(f"Processing directory: {month_dir.name}") 161 | processor.process_month_directory(month_dir) 162 | 163 | valid_articles = processor.get_valid_articles() 164 | 165 | print(f"\nFound:") 166 | print(f"- Articles with headers and abstract: {len(valid_articles)}") 167 | print(f"- Articles with references: {len(processor.reference_pages)}") 168 | print(f"- Total articles: {len(processor.article_pages)}") 169 | 170 | # postprocess valid articles 171 | print("\nPostprocessing articles...") 172 | postprocess_articles(input_dir, output_dir, processor) 173 | 174 | processing_time = time.time() - start_time 175 | print(f"\nProcessing completed in {processing_time:.2f} seconds") 176 | print(f"Processed files saved to: {output_dir}") 177 | 178 | 179 | 
if __name__ == "__main__": 180 | parser = argparse.ArgumentParser( 181 | description="Process arXiv MMD files: detect headers, abstracts, references, and postprocess" 182 | ) 183 | parser.add_argument( 184 | "--input-dir", 185 | type=str, 186 | required=True, 187 | help="Input directory containing MMD files organized by month (YYMM)" 188 | ) 189 | parser.add_argument( 190 | "--output-dir", 191 | type=str, 192 | required=True, 193 | help="Output directory for processed MMD files (will maintain month structure)" 194 | ) 195 | args = parser.parse_args() 196 | main(args) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | transformers==4.25.1 4 | timm==0.5.4 5 | ojson 6 | opencv-python-headless 7 | datasets[vision] 8 | lightning==2.0.0 9 | nltk 10 | python-Levenshtein 11 | tqdm 12 | sentencepiece 13 | sconf==0.2.3 14 | albumentations==1.0.0 15 | pypdf==3.1.0 16 | pypdfium2 17 | Pillow 18 | PyPDF2 -------------------------------------------------------------------------------- /run_nougat.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3 3 | Nougat batch inference script 4 | 5 | Processes PDFs with Nougat model in batches and extracts text to multi markdown files. 6 | Works with a directory structure organized by year-month (YYMM). 7 | 8 | Usage: 9 | python nougat_inference.py --input_dir /path/to/pdfs --output_dir /path/to/output --gpu_id 0 10 | """ 11 | 12 | import time 13 | import logging 14 | import argparse 15 | from functools import partial 16 | from pathlib import Path 17 | from typing import List 18 | 19 | import torch 20 | import pypdf 21 | from tqdm import tqdm 22 | from nougat import NougatModel 23 | from nougat.utils.dataset import LazyDataset 24 | from nougat.utils.checkpoint import get_checkpoint 25 | from nougat.postprocessing import markdown_compatible 26 | 27 | 28 | def setup_logging(output_dir: Path) -> None: 29 | """Configure logging to file and console.""" 30 | logging.basicConfig( 31 | level=logging.INFO, 32 | format="%(asctime)s - %(levelname)s - %(message)s", 33 | handlers=[ 34 | logging.FileHandler(output_dir / "nougat_inference.log"), 35 | logging.StreamHandler() 36 | ] 37 | ) 38 | 39 | 40 | def parse_args() -> argparse.Namespace: 41 | """Parse command line arguments.""" 42 | parser = argparse.ArgumentParser( 43 | description="Process PDFs with Nougat model", 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 45 | ) 46 | parser.add_argument( 47 | "--input_dir", 48 | type=str, 49 | required=True, 50 | help="Directory containing PDF files organized in YYMM folders" 51 | ) 52 | parser.add_argument( 53 | "--output_dir", 54 | type=str, 55 | required=True, 56 | help="Directory for output markdown files" 57 | ) 58 | parser.add_argument( 59 | "--gpu_id", 60 | type=int, 61 | default=0, 62 | help="GPU ID to use for inference" 63 | ) 64 | parser.add_argument( 65 | "--batch_size", 66 | type=int, 67 | default=8, 68 | help="Batch size for processing pages" 69 | ) 70 | return parser.parse_args() 71 | 72 | 73 | def load_model_to_gpu(model_tag: str, gpu_id: int) -> NougatModel: 74 | """Initialize and load Nougat model to specified GPU.""" 75 | logger = logging.getLogger("nougat_inference") 76 | logger.info(f"Loading model {model_tag} to GPU {gpu_id}") 77 | checkpoint = get_checkpoint(None, model_tag=model_tag) 78 | model = 
NougatModel.from_pretrained(checkpoint) 79 | model.to(f"cuda:{gpu_id}").to(torch.bfloat16) 80 | model.eval() 81 | return model 82 | 83 | 84 | def get_pdf_files(input_dir: Path) -> List[Path]: 85 | """Get all PDF files from the input directory structure.""" 86 | pdf_files = [] 87 | for month_dir in input_dir.iterdir(): 88 | if month_dir.is_dir(): 89 | pdf_files.extend(month_dir.glob("*.pdf")) 90 | return sorted(pdf_files) 91 | 92 | 93 | def process_pdf(pdf_path: Path, output_dir: Path, model: NougatModel, batch_size: int) -> bool: 94 | """ 95 | Process all pages of a PDF document with the Nougat model. 96 | 97 | Args: 98 | pdf_path: Path to PDF file 99 | output_dir: Directory for output files 100 | model: Loaded Nougat model 101 | batch_size: Number of pages to process at once 102 | 103 | Returns: 104 | bool: True if processing was successful, False otherwise 105 | """ 106 | logger = logging.getLogger("nougat_inference") 107 | start_time = time.time() 108 | 109 | # get document ID (remove .pdf and use full path structure) 110 | document_id = pdf_path.stem 111 | month_dir = pdf_path.parent.name 112 | 113 | try: 114 | # prepare dataset for all pages 115 | full_dataset = LazyDataset( 116 | str(pdf_path), partial(model.encoder.prepare_input, random_padding=False) 117 | ) 118 | except pypdf.errors.PdfStreamError as e: 119 | logger.error(f"Failed to load PDF {document_id}: {str(e)}") 120 | return False 121 | 122 | # create dataloader 123 | dataloader = torch.utils.data.DataLoader( 124 | full_dataset, 125 | batch_size=batch_size, 126 | shuffle=False, 127 | collate_fn=LazyDataset.ignore_none_collate, 128 | ) 129 | 130 | try: 131 | # create month directory in output 132 | month_output_dir = output_dir / month_dir 133 | month_output_dir.mkdir(exist_ok=True) 134 | 135 | # process pages 136 | for batch_idx, (sample, is_last_page) in enumerate(tqdm(dataloader, desc=f"Processing {document_id}")): 137 | with torch.no_grad(): 138 | model_output = model.inference( 139 | image_tensors=sample, 140 | early_stopping=False 141 | ) 142 | 143 | # save predictions for each page 144 | for j, output in enumerate(model_output["predictions"]): 145 | page_num = batch_idx * batch_size + j + 1 146 | formatted_output = markdown_compatible(output.strip()) 147 | 148 | output_path = month_output_dir / f"{document_id}_{page_num}.mmd" 149 | output_path.write_text(formatted_output) 150 | 151 | elapsed_time = time.time() - start_time 152 | logger.info(f"Processed {document_id} in {elapsed_time:.2f} seconds") 153 | return True 154 | 155 | except Exception as e: 156 | logger.error(f"Error processing {document_id}: {str(e)}") 157 | return False 158 | 159 | 160 | def main(): 161 | """Main execution function.""" 162 | args = parse_args() 163 | 164 | # create output directory and setup logging 165 | input_dir = Path(args.input_dir) 166 | output_dir = Path(args.output_dir) 167 | 168 | output_dir.mkdir(parents=True, exist_ok=True) 169 | setup_logging(output_dir) 170 | logger = logging.getLogger("nougat_inference") 171 | 172 | # load Nougat model 173 | model = load_model_to_gpu("0.1.0-small", args.gpu_id) 174 | 175 | # get PDF files 176 | pdf_files = get_pdf_files(input_dir) 177 | logger.info(f"Found {len(pdf_files)} PDF files to process") 178 | 179 | # process PDFs 180 | processed = 0 181 | failed = 0 182 | 183 | for pdf_path in tqdm(pdf_files, desc="Overall progress"): 184 | if process_pdf(pdf_path, output_dir, model, args.batch_size): 185 | processed += 1 186 | else: 187 | failed += 1 188 | 189 | # log final summary 190 | 
logger.info("\nProcessing Summary:") 191 | logger.info(f"Successfully processed: {processed}") 192 | logger.info(f"Failed: {failed}") 193 | logger.info(f"Total files attempted: {processed + failed}") 194 | 195 | 196 | if __name__ == "__main__": 197 | main() -------------------------------------------------------------------------------- /utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuralwork/arxiver/59ce3c3cf31894911b8f623743b0fc5a4e5ec1f5/utils/.DS_Store -------------------------------------------------------------------------------- /utils/check_complete_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import PyPDF2 6 | 7 | 8 | 9 | def get_pdf_page_count(pdf_path): 10 | try: 11 | with open(pdf_path, 'rb') as f: 12 | pdf = PyPDF2.PdfReader(f) 13 | return len(pdf.pages) 14 | except Exception as e: 15 | print(f"Error reading {pdf_path}: {str(e)}") 16 | return None 17 | 18 | 19 | def collect_mmd_files(mmd_root): 20 | # dictionary to store paper_id -> list of page numbers 21 | mmd_files = defaultdict(list) 22 | 23 | # walk through all subdirectories 24 | for month_dir in os.listdir(mmd_root): 25 | month_path = os.path.join(mmd_root, month_dir) 26 | if not os.path.isdir(month_path): 27 | continue 28 | 29 | for filename in os.listdir(month_path): 30 | if not filename.endswith('.mmd'): 31 | continue 32 | 33 | # extract paper ID and page number - paper_id_page.mmd 34 | base_name = filename[:-4] # remove .mmd 35 | paper_id, page_num = base_name.rsplit('_', 1) 36 | mmd_files[paper_id].append(int(page_num)) 37 | 38 | return mmd_files 39 | 40 | def main(args): 41 | pdf_root = Path(args.pdf_dir) 42 | mmd_root = Path(args.mmd_dir) 43 | 44 | complete = [] 45 | incomplete = [] 46 | missing = [] 47 | 48 | # collect all MMD files first 49 | print("Collecting MMD files...") 50 | mmd_files = collect_mmd_files(mmd_root) 51 | 52 | # process each PDF file 53 | print("\nChecking PDFs against MMD files...") 54 | for month_dir in os.listdir(pdf_root): 55 | month_path = os.path.join(pdf_root, month_dir) 56 | if not os.path.isdir(month_path): 57 | continue 58 | 59 | for pdf_file in os.listdir(month_path): 60 | if not pdf_file.endswith('.pdf'): 61 | continue 62 | 63 | paper_id = pdf_file[:-4] # remove .pdf extension 64 | pdf_path = os.path.join(month_path, pdf_file) 65 | 66 | # get PDF page count 67 | pdf_pages = get_pdf_page_count(pdf_path) 68 | if pdf_pages is None: 69 | print(f"Skipping {pdf_file} due to error") 70 | continue 71 | 72 | # check if we have MMD files for this paper 73 | if paper_id in mmd_files: 74 | mmd_pages = len(mmd_files[paper_id]) 75 | max_page = max(mmd_files[paper_id]) 76 | 77 | # check if all pages are present (no gaps) 78 | expected_pages = set(range(1, max_page + 1)) 79 | actual_pages = set(mmd_files[paper_id]) 80 | 81 | if mmd_pages == pdf_pages and expected_pages == actual_pages: 82 | complete.append((paper_id, pdf_pages)) 83 | else: 84 | incomplete.append((paper_id, pdf_pages, mmd_pages)) 85 | if expected_pages != actual_pages: 86 | missing_pages = sorted(expected_pages - actual_pages) 87 | print(f"Paper {paper_id} has gaps: missing pages {missing_pages}") 88 | else: 89 | missing.append((paper_id, pdf_pages)) 90 | 91 | # print summary 92 | print("\nSummary:") 93 | print(f"Complete conversions: {len(complete)}") 94 | print(f"Incomplete conversions: 
{len(incomplete)}") 95 | print(f"Missing conversions: {len(missing)}") 96 | 97 | # print details of incomplete conversions 98 | if incomplete: 99 | print("\nIncomplete conversions details:") 100 | for paper_id, pdf_pages, mmd_pages in incomplete: 101 | print(f"Paper {paper_id}: Expected {pdf_pages} pages, found {mmd_pages} MMD files") 102 | 103 | # print details of missing conversions 104 | if missing: 105 | print("\nMissing conversions details:") 106 | for paper_id, pdf_pages in missing: 107 | print(f"Paper {paper_id}: No MMD files found (PDF has {pdf_pages} pages)") 108 | 109 | if __name__ == "__main__": 110 | parser = argparse.ArgumentParser(description='Check completeness of PDF to MMD conversion') 111 | parser.add_argument( 112 | '--pdf-dir', type=str, required=True, 113 | help='Root directory containing PDF files organized by month' 114 | ) 115 | parser.add_argument( 116 | '--mmd-dir', type=str, required=True, 117 | help='Root directory containing MMD files organized by month' 118 | ) 119 | args = parser.parse_args() 120 | main(args) -------------------------------------------------------------------------------- /utils/get_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import time 4 | import requests 5 | import argparse 6 | from pathlib import Path 7 | 8 | from tqdm import tqdm 9 | import xml.etree.ElementTree as ET 10 | 11 | 12 | 13 | def get_arxiv_metadata(arxiv_id): 14 | """Fetch metadata for a single arxiv paper.""" 15 | url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}" 16 | response = requests.get(url) 17 | 18 | if response.status_code == 200: 19 | # parse the XML response 20 | root = ET.fromstring(response.content) 21 | ns = {"http://www.w3.org/2005/Atom"} 22 | 23 | entry = root.find("atom:entry", ns) 24 | if entry: 25 | title = entry.find("atom:title", ns).text.strip() 26 | abstract = entry.find("atom:summary", ns).text.strip() 27 | authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)] 28 | published_date = entry.find("atom:published", ns).text.strip() 29 | link = entry.find("atom:link", ns).get("href") 30 | return { 31 | "title": title, 32 | "abstract": abstract, 33 | "authors": authors, 34 | "published_date": published_date, 35 | "link": link 36 | } 37 | return None 38 | 39 | 40 | def process_mmd_files(input_dir: Path): 41 | """Process MMD files and extract metadata.""" 42 | # ensure output file exists with headers 43 | with open("arxiv_metadata.csv", "w", newline="", encoding="utf-8") as csvfile: 44 | fieldnames = ["id", "title", "abstract", "authors", "published_date", "link"] 45 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 46 | writer.writeheader() 47 | 48 | # collect all mmd files from input directory including subdirectories 49 | mmd_files = [] 50 | for root, _, files in os.walk(input_dir): 51 | for file in files: 52 | if file.endswith(".mmd"): 53 | mmd_files.append(os.path.join(root, file)) 54 | mmd_files.sort() 55 | 56 | # process files 57 | for mmd_path in tqdm(mmd_files, desc="Processing MMD files"): 58 | arxiv_id = Path(mmd_path).stem # get filename without extension 59 | 60 | metadata = get_arxiv_metadata(arxiv_id) 61 | if metadata: 62 | # append to csv 63 | with open("arxiv_metadata.csv", "a", newline="", encoding="utf-8") as csvfile: 64 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 65 | writer.writerow({ 66 | "id": arxiv_id, 67 | "title": metadata["title"], 68 | "abstract": metadata["abstract"], 69 | 
"authors": ", ".join(metadata["authors"]), 70 | "published_date": metadata["published_date"], 71 | "link": metadata["link"] 72 | }) 73 | 74 | # respect arXiv API rate limits (3 requests per second) 75 | time.sleep(3) 76 | 77 | 78 | def main(args): 79 | input_dir = Path(args.input_dir) 80 | if not input_dir.exists(): 81 | print(f"Error: Input directory {input_dir} does not exist") 82 | return 83 | 84 | print("Starting metadata extraction...") 85 | start_time = time.time() 86 | 87 | process_mmd_files(input_dir) 88 | 89 | processing_time = time.time() - start_time 90 | print(f"\nProcessing completed in {processing_time:.2f} seconds") 91 | print("Results saved to: arxiv_metadata.csv") 92 | 93 | 94 | if __name__ == "__main__": 95 | parser = argparse.ArgumentParser( 96 | description="Extract metadata from arXiv papers using their IDs" 97 | ) 98 | parser.add_argument( 99 | "--input-dir", 100 | type=str, 101 | required=True, 102 | help="Input directory containing MMD files (can include subdirectories)" 103 | ) 104 | args = parser.parse_args() 105 | main(args) --------------------------------------------------------------------------------