├── .gitignore ├── LICENSE ├── README.md ├── arxiv-tools ├── README.md ├── download.py ├── eda_manifest.py └── extract_pdfs.py ├── job_status_server.py ├── postprocess.py ├── requirements.txt ├── run_nougat.py └── utils ├── .DS_Store ├── check_complete_results.py └── get_metadata.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Neural Work 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Arxiver 2 | 3 | A toolkit for downloading and converting arXiv papers to multi markdown (.mmd) format with Nougat - a neural OCR. Our pipeline can extract LaTeX equations and includes post-processing tools to clean up and merge extracted data. See the [arxiver](https://huggingface.co/datasets/neuralwork/arxiver) dataset on Hugging Face Hub for sample results. 4 | 5 | ## Project Structure 6 | ``` 7 | arxiver/ 8 | arxiv-tools/ # Tools for downloading arXiv papers 9 | utils/ # Utility files to check processed data, get article metadata, etc. 
10 | run_nougat.py # Batch PDF processing script to extract text in .mmd format 11 | job_status_server.py # Web server to monitor extraction progress 12 | postprocess.py # Post-processing script to clean and merge Nougat outputs 13 | ``` 14 | 15 | ## Downloading arXiv 16 | 17 | The `arxiv-tools` folder contains scripts for downloading arXiv papers and computing useful statistics about the arXiv dataset. For detailed instructions, see the [arxiv-tools README](arxiv-tools/README.md). Downloading and extracting the dataset creates a hierarchical folder structure organized by publication year and month as follows: 18 | 19 | ``` 20 | output_dir/ 21 | 2310/ # October 2023 22 | paper1.pdf 23 | paper2.pdf 24 | 2311/ # November 2023 25 | paper3.pdf 26 | paper4.pdf 27 | ``` 28 | 29 | ## Nougat Processing 30 | 31 | The `run_nougat.py` script processes PDF files in batches using the [Nougat](https://arxiv.org/abs/2308.13418) neural OCR model: 32 | 33 | ```bash 34 | python run_nougat.py \ 35 | --input_dir /path/to/datadir \ 36 | --output_dir /path/to/output \ 37 | --gpu_id 0 \ 38 | --batch_size 8 39 | ``` 40 | 41 | You can run Nougat using the extracted data directory from the previous step as the input argument. The script processes PDFs in batches on the specified GPU and logs successful and failed jobs (Nougat is not 100% stable). The output maintains the same year-month subdirectory structure but saves each page separately: 42 | ``` 43 | output_dir/ 44 | 2310/ 45 | paper1_1.mmd # Paper 1, page 1 46 | paper1_2.mmd # Paper 1, page 2 47 | paper2_1.mmd 48 | 2311/ 49 | paper3_1.mmd 50 | paper3_2.mmd 51 | paper4_1.mmd 52 | ``` 53 | 54 | #### Progress Monitoring 55 | We provide an optional script, `job_status_server.py`, that serves a web interface for monitoring processing progress: 56 | 57 | ```bash 58 | python job_status_server.py \ 59 | --input_dir /path/to/pdf/files \ 60 | --output_dir /path/to/output \ 61 | --port 8005 62 | ``` 63 | 64 | 65 | ## Post-Processing 66 | The post-processing pipeline includes several steps to validate and clean up the Nougat output. You can optionally check how many of the papers have been fully processed (all pages successfully extracted) by running: 67 | ```bash 68 | cd utils 69 | python check_complete_results.py --pdf-dir /path/to/pdf/root/dir --mmd-dir /path/to/mmd/root/dir 70 | ``` 71 | 72 | You can use the output .mmd files as they are, or run post-processing to remove headers and references and merge multi-page MMD files into single documents. To do this, run the post-processing script: 73 | ```bash 74 | cd .. 75 | python postprocess.py --input-dir /path/to/processed-data --output-dir /path/to/output 76 | ``` 77 | 78 | Note that this script preserves the original hierarchical folder structure organized by publication year and month.
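For downstream use, the merged per-article files can be read back with a few lines of Python. A minimal sketch, assuming `/path/to/output` is a placeholder for the `--output-dir` passed to `postprocess.py` (which keeps the YYMM layout):

```python
from pathlib import Path

output_dir = Path("/path/to/output")  # placeholder: the postprocess.py --output-dir value

# walk the YYMM month folders and read each merged article
for month_dir in sorted(p for p in output_dir.iterdir() if p.is_dir()):
    for mmd_file in sorted(month_dir.glob("*.mmd")):
        text = mmd_file.read_text(encoding="utf-8")
        print(month_dir.name, mmd_file.stem, len(text))
```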
79 | 80 | #### Metadata Extraction 81 | You can optionally get article metadata by running: 82 | ```bash 83 | cd utils 84 | python get_metadata.py --input-dir /path/to/merged-mmd-folder 85 | ``` 86 | 87 | ## Notes 88 | - GPU with CUDA support is required for efficient processing 89 | - Tested on an NVIDIA T4 GPU, processing speed depends on GPU memory and batch size 90 | - arxiv-tools/ is adapted from the original [repo](https://github.com/armancohan/arxiv-tools) 91 | 92 | Buy Me a Coffee at ko-fi.com 93 | 94 | From [neuralwork](https://neuralwork.ai/) with :heart: 95 | -------------------------------------------------------------------------------- /arxiv-tools/README.md: -------------------------------------------------------------------------------- 1 | # arXiv-tools 2 | 3 | A tool for downloading and managing arXiv documents in bulk using Amazon S3. 4 | 5 | ## Prerequisites 6 | 7 | - [Amazon AWS Account](https://aws.amazon.com/free) - required for accessing arXiv's bulk data on [Amazon S3](https://aws.amazon.com/s3) 8 | - Python 2.x to use the `s3cmd` package 9 | - Python 3.x for manifest file analysis 10 | 11 | ## Installation 12 | 13 | 1. Install s3cmd, a command-line tool for interacting with Amazon S3: 14 | ```bash 15 | pip install s3cmd # Python 2 only 16 | ``` 17 | 18 | 2. Configure s3cmd with your AWS credentials: 19 | ```bash 20 | s3cmd --configure 21 | ``` 22 | > Note: You'll need your AWS credentials from the Account Management tab on the AWS website. 23 | 24 | 3. Install required Python packages for manifest file analysis: 25 | ```bash 26 | pip install pandas # For Python 3.x 27 | ``` 28 | 29 | ## Usage 30 | 31 | ### 1. Download Manifest Files 32 | 33 | First, download the manifest files containing the complete list of available arXiv files: 34 | 35 | **For PDF documents:** 36 | ```bash 37 | s3cmd get --requester-pays \ 38 | s3://arxiv/pdf/arXiv_pdf_manifest.xml \ 39 | local-directory/arXiv_pdf_manifest.xml 40 | ``` 41 | 42 | **For source documents:** 43 | ```bash 44 | s3cmd get --requester-pays \ 45 | s3://arxiv/src/arXiv_src_manifest.xml \ 46 | local-directory/arXiv_src_manifest.xml 47 | ``` 48 | 49 | ### 2. Analyze Manifest Files (Optional) 50 | 51 | Use the `eda_manifest.py` script to analyze the manifest files: 52 | 53 | ```bash 54 | python eda_manifest.py 55 | ``` 56 | 57 | This script provides useful statistics about the arXiv dataset: 58 | - Total size of the dataset in bytes, MB, and GB 59 | - Total number of articles 60 | - Average file sizes 61 | - Number of files per time period 62 | - Detailed statistics for recent years (2022-2023) 63 | 64 | ### 3. Download arXiv Files 65 | 66 | Use the `download.py` script to fetch the actual files: 67 | 68 | **For PDF files:** 69 | ```bash 70 | python download.py \ 71 | --manifest_file /path/to/pdf-manifest \ 72 | --mode pdf \ 73 | --output_dir /path/to/output 74 | ``` 75 | 76 | **For source files:** 77 | ```bash 78 | python download.py \ 79 | --manifest_file /path/to/src-manifest \ 80 | --mode src \ 81 | --output_dir /path/to/output 82 | ``` 83 | 84 | The files will be downloaded to your specified output directory. Each file is in `.tar` format and approximately 500MB in size. 85 | 86 | ### 4.
Extract PDFs from Tar Files 87 | 88 | After downloading the tar files, use `extract_pdfs.py` to extract PDFs into organized directories: 89 | 90 | ```bash 91 | python extract_pdfs.py \ 92 | --data_dir /path/to/tar/files \ 93 | --output_dir /path/to/output \ 94 | [--keep_tars] # Optional: keep original tar files 95 | ``` 96 | 97 | The script will create and extract pdf files to year-month subdirectories (e.g., "2310" for October 2023). Example output structure: 98 | ``` 99 | output_dir/ 100 | 2310/ # October 2023 101 | paper1.pdf 102 | paper2.pdf 103 | 2311/ # November 2023 104 | paper3.pdf 105 | paper4.pdf 106 | ``` 107 | 108 | 109 | ## Additional Resources 110 | - For metadata downloads, consider using [metha](https://github.com/miku/metha) 111 | 112 | ## Notes 113 | - The arXiv files are stored in requester-pays buckets on Amazon S3 114 | - Each archive file is approximately 500MB in size and uses the `.tar` format -------------------------------------------------------------------------------- /arxiv-tools/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import subprocess 4 | from argparse import ArgumentParser 5 | import xml.etree.ElementTree as ET 6 | 7 | 8 | # set up logging configuration 9 | log_file = os.path.join("logs", "preprocessing_logs.log") 10 | os.makedirs("logs", exist_ok=True) 11 | 12 | logging.basicConfig( 13 | level=logging.INFO, 14 | format="%(asctime)s - %(levelname)s - %(message)s", 15 | handlers=[ 16 | logging.FileHandler(log_file), 17 | logging.StreamHandler() 18 | ] 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | def download_files(**args): 23 | manifest_file = args["manifest_file"] 24 | mode = args["mode"] 25 | out_dir = args["output_dir"] 26 | 27 | if mode != "pdf" and mode != "src": 28 | logger.error("Invalid mode: %s. 
Mode should be 'pdf' or 'src'.", mode) 29 | 30 | def get_file(fname, out_dir): 31 | cmd = ["s3cmd", "get", "--requester-pays", "s3://arxiv/%s" % fname, "./%s" % out_dir] 32 | logger.info("Downloading file: %s to %s", fname, out_dir) 33 | subprocess.call(' '.join(cmd), shell=True) 34 | 35 | try: 36 | for file in ET.parse(manifest_file).getroot().findall("file")[:1]: 37 | filename = file.find("filename").text 38 | logger.info("Processing file: %s", filename) 39 | 40 | get_file(filename, out_dir='%s/%s/' % (out_dir, mode)) 41 | logger.debug("Successfully downloaded: %s", filename) 42 | except Exception as e: 43 | logger.error("Failed to process manifest file: %s", str(e), exc_info=True) 44 | 45 | logger.info("Download process completed") 46 | 47 | 48 | if __name__ == "__main__": 49 | argparser = ArgumentParser() 50 | argparser.add_argument("--manifest_file", "-m", type=str, help="The manifest file to download files from arXiv.", required=True) 51 | argparser.add_argument("--output_dir", "-o", type=str, default="data", help="Output directory to save files to.") 52 | argparser.add_argument("--mode", type=str, default="src", choices=set(("pdf", "src")), help="Can be 'pdf' or 'src'.") 53 | args = argparser.parse_args() 54 | download_files(**vars(args)) 55 | -------------------------------------------------------------------------------- /arxiv-tools/eda_manifest.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3 3 | arXiv Manifest Analysis Script 4 | 5 | This script analyzes XML manifest files from arXiv's bulk data access, 6 | providing insights about file sizes, article counts, and temporal distribution. 7 | """ 8 | 9 | import xml.etree.ElementTree as ET 10 | import pandas as pd 11 | from typing import List, Dict 12 | 13 | 14 | def parse_manifest(filepath: str) -> pd.DataFrame: 15 | """ 16 | Parse the arXiv manifest XML file into a pandas DataFrame. 17 | 18 | Args: 19 | filepath: Path to the manifest XML file 20 | 21 | Returns: 22 | DataFrame containing parsed manifest data 23 | """ 24 | tree = ET.parse(filepath) 25 | root = tree.getroot() 26 | 27 | data = [{ 28 | "Filename": element.find("filename").text, 29 | "Number of Items": int(element.find("num_items").text), 30 | "Size": int(element.find("size").text), 31 | "Timestamp": element.find("timestamp").text, 32 | "YYMM": element.find("yymm").text, 33 | } for element in root.findall("file")] 34 | 35 | return pd.DataFrame(data) 36 | 37 | 38 | def analyze_total_statistics(df: pd.DataFrame) -> Dict: 39 | """Calculate and return overall statistics from the manifest data.""" 40 | total_size = df["Size"].sum() 41 | total_articles = df["Number of Items"].sum() 42 | total_tars = len(df) 43 | 44 | return { 45 | "total_size_bytes": total_size, 46 | "total_size_mb": total_size / 1e6, 47 | "total_size_gb": total_size / 1e9, 48 | "total_articles": total_articles, 49 | "total_tar_files": total_tars, 50 | "avg_article_size_mb": (total_size / total_articles) / 1e6, 51 | "avg_tar_size_mb": (total_size / total_tars) / 1e6, 52 | "avg_items_per_tar": total_articles / total_tars 53 | } 54 | 55 | 56 | def analyze_yearly_data(df: pd.DataFrame, year: str) -> Dict: 57 | """ 58 | Analyze manifest data for a specific year. 
59 | 60 | Args: 61 | df: Full manifest DataFrame 62 | year: Two-digit year string (e.g., "22" for 2022) 63 | 64 | Returns: 65 | Dictionary containing year-specific statistics 66 | """ 67 | year_df = df[df["YYMM"].str.startswith(year)] 68 | total_size = year_df["Size"].sum() 69 | total_articles = year_df["Number of Items"].sum() 70 | 71 | return { 72 | "total_articles": total_articles, 73 | "total_size_gb": total_size / 1e9, 74 | "data": year_df 75 | } 76 | 77 | 78 | def print_statistics(stats: Dict) -> None: 79 | """Print formatted statistics.""" 80 | print("\n=== Overall Statistics ===") 81 | print(f"Total Size: {stats['total_size_gb']:.2f} GB ({stats['total_size_bytes']:,} bytes)") 82 | print(f"Total Articles: {stats['total_articles']:,}") 83 | print(f"Total TAR Files: {stats['total_tar_files']:,}") 84 | print(f"\nAverages:") 85 | print(f"- Article Size: {stats['avg_article_size_mb']:.2f} MB") 86 | print(f"- TAR File Size: {stats['avg_tar_size_mb']:.2f} MB") 87 | print(f"- Items per TAR: {stats['avg_items_per_tar']:.1f}") 88 | 89 | 90 | def main(): 91 | # path to input manifest file 92 | MANIFEST_PATH = "arXiv_pdf_manifest.xml" 93 | 94 | # load and parse manifest 95 | print("Parsing manifest file...") 96 | df = parse_manifest(MANIFEST_PATH) 97 | 98 | # calculate overall statistics 99 | stats = analyze_total_statistics(df) 100 | print_statistics(stats) 101 | 102 | # analyze recent years 103 | print("\n=== Recent Years Analysis ===") 104 | for year in ["22", "23"]: 105 | year_stats = analyze_yearly_data(df, year) 106 | print(f"\nYear 20{year}:") 107 | print(f"- Articles: {year_stats['total_articles']:,}") 108 | print(f"- Size: {year_stats['total_size_gb']:.2f} GB") 109 | 110 | # export year-specific data if needed 111 | year_stats["data"].to_csv(f"df_{year}.csv", index=False) 112 | 113 | # print unique YYMM values for reference 114 | print("\n=== Time Coverage ===") 115 | unique_yymm = sorted(df["YYMM"].unique()) 116 | print(f"Coverage period: {unique_yymm[0]} to {unique_yymm[-1]}") 117 | 118 | 119 | if __name__ == "__main__": 120 | main() -------------------------------------------------------------------------------- /arxiv-tools/extract_pdfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | arXiv Tar Extraction Script 4 | 5 | Extracts PDF files from arXiv tar archives into organized directories. 6 | The script processes tar files named like 'arXiv_pdf_YY_MM_N.tar' and organizes PDFs into 7 | YYMM directories. 
8 | 9 | Usage: 10 | python extract_arxiv.py --data_dir /path/to/tar/files 11 | """ 12 | 13 | import os 14 | import tarfile 15 | import logging 16 | import argparse 17 | from typing import List 18 | from pathlib import Path 19 | 20 | 21 | def setup_logging() -> logging.Logger: 22 | """Configure and return logger.""" 23 | logging.basicConfig( 24 | level=logging.INFO, 25 | format="%(asctime)s - %(levelname)s - %(message)s", 26 | handlers=[ 27 | logging.FileHandler("arxiv_extract.log"), 28 | logging.StreamHandler() 29 | ] 30 | ) 31 | return logging.getLogger("arxiv_extractor") 32 | 33 | 34 | def parse_args() -> argparse.Namespace: 35 | """Parse command line arguments.""" 36 | parser = argparse.ArgumentParser( 37 | description="Extract PDFs from arXiv tar archives", 38 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 39 | ) 40 | parser.add_argument( 41 | "--data_dir", 42 | type=str, 43 | required=True, 44 | help="Directory containing arXiv tar files" 45 | ) 46 | parser.add_argument( 47 | "--output_dir", 48 | type=str, 49 | default="extracted", 50 | help="Base directory for extracted PDFs" 51 | ) 52 | parser.add_argument( 53 | "--keep_tars", 54 | action="store_true", 55 | help="Keep tar files after extraction (default: delete)" 56 | ) 57 | 58 | return parser.parse_args() 59 | 60 | 61 | def get_tar_files(data_dir: Path) -> List[Path]: 62 | """ 63 | Find all tar files in the specified directory. 64 | 65 | Args: 66 | data_dir: Directory to search for tar files 67 | 68 | Returns: 69 | List of paths to tar files 70 | """ 71 | return list(data_dir.glob("*.tar")) 72 | 73 | 74 | def extract_pdfs_from_tar( 75 | tar_path: Path, 76 | output_base: Path, 77 | keep_tar: bool = False 78 | ) -> bool: 79 | """ 80 | Extract PDF files from a tar archive into a year-month directory. 
81 | 82 | Args: 83 | tar_path: Path to the tar file 84 | output_base: Base directory for extracted files 85 | keep_tar: Whether to keep the tar file after extraction 86 | 87 | Returns: 88 | bool: True if extraction successful, False otherwise 89 | """ 90 | logger = logging.getLogger("arxiv_extractor") 91 | 92 | try: 93 | # extract directory name (YYMM) from tar file name 94 | # example: "arXiv_pdf_23_10_1.tar" -> "2310" 95 | parts = tar_path.stem.split("_") 96 | if len(parts) != 5 or not parts[2].isdigit() or not parts[3].isdigit(): 97 | logger.warning("Invalid tar file name format: %s", tar_path.name) 98 | return False 99 | 100 | dir_name = parts[2] + parts[3] # e.g., "2310" for year 23, month 10 101 | output_dir = output_base / dir_name 102 | output_dir.mkdir(parents=True, exist_ok=True) 103 | 104 | logger.info("Extracting %s to directory %s", tar_path.name, output_dir) 105 | 106 | # Extract PDF files 107 | with tarfile.open(tar_path, "r") as tar: 108 | pdf_members = [m for m in tar.getmembers() 109 | if m.isreg() and m.name.endswith(".pdf")] 110 | 111 | total_pdfs = len(pdf_members) 112 | for i, member in enumerate(pdf_members, 1): 113 | tar.extract(member, output_dir) 114 | if i % 100 == 0: # log progress every 100 files 115 | logger.info("Extracted %d/%d PDFs (%.1f%%)", 116 | i, total_pdfs, (i/total_pdfs)*100) 117 | 118 | logger.info("Successfully extracted %d PDF files to %s", 119 | total_pdfs, output_dir) 120 | 121 | # clean up tar file if requested 122 | if not keep_tar: 123 | tar_path.unlink() 124 | logger.info("Deleted tar file: %s", tar_path.name) 125 | 126 | return True 127 | 128 | except Exception as e: 129 | logger.error("Error processing %s: %s", tar_path.name, str(e)) 130 | return False 131 | 132 | 133 | def main(): 134 | args = parse_args() 135 | logger = setup_logging() 136 | 137 | data_dir = Path(args.data_dir) 138 | output_base = Path(args.output_dir) 139 | 140 | if not data_dir.is_dir(): 141 | logger.error("Data directory does not exist: %s", data_dir) 142 | return 143 | 144 | # create output base directory 145 | output_base.mkdir(parents=True, exist_ok=True) 146 | 147 | # get list of tar files 148 | tar_files = get_tar_files(data_dir) 149 | if not tar_files: 150 | logger.error("No tar files found in %s", data_dir) 151 | return 152 | 153 | logger.info("Found %d tar files to process", len(tar_files)) 154 | 155 | # process each tar file 156 | successful = 0 157 | failed = 0 158 | 159 | for tar_path in tar_files: 160 | if extract_pdfs_from_tar(tar_path, output_base, args.keep_tars): 161 | successful += 1 162 | else: 163 | failed += 1 164 | 165 | # log final statistics 166 | logger.info("\nExtraction complete:") 167 | logger.info("- Successful: %d", successful) 168 | logger.info("- Failed: %d", failed) 169 | logger.info("- Total processed: %d", len(tar_files)) 170 | 171 | 172 | if __name__ == "__main__": 173 | main() 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /job_status_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3 3 | Job Status Server 4 | 5 | A FastAPI server that monitors and reports Nougat inference progress. 6 | Scans directories of MMD files to track processing progress. 
7 | 8 | Dependencies: 9 | - fastapi 10 | - uvicorn[standard] 11 | 12 | Usage: 13 | python job_status_server.py 14 | """ 15 | 16 | import logging 17 | import argparse 18 | from datetime import datetime 19 | from pathlib import Path 20 | from typing import Tuple 21 | 22 | from fastapi import FastAPI, HTTPException 23 | from fastapi.responses import HTMLResponse 24 | import uvicorn 25 | 26 | 27 | def parse_args() -> argparse.Namespace: 28 | """Parse command line arguments.""" 29 | parser = argparse.ArgumentParser( 30 | description="Monitor Nougat inference progress", 31 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 32 | ) 33 | parser.add_argument( 34 | "--input_dir", 35 | type=str, 36 | required=True, 37 | help="Directory containing source PDF files" 38 | ) 39 | parser.add_argument( 40 | "--output_dir", 41 | type=str, 42 | required=True, 43 | help="Directory containing output MMD files" 44 | ) 45 | parser.add_argument( 46 | "--port", 47 | type=int, 48 | default=8005, 49 | help="Port number for the server" 50 | ) 51 | return parser.parse_args() 52 | 53 | 54 | # Configure logging 55 | logging.basicConfig( 56 | level=logging.INFO, 57 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 58 | handlers=[ 59 | logging.FileHandler("job_server.log"), 60 | logging.StreamHandler() 61 | ] 62 | ) 63 | logger = logging.getLogger("job_status_server") 64 | 65 | # initialize FastAPI app 66 | app = FastAPI( 67 | title="Nougat Job Status Server", 68 | description="Monitor Nougat inference progress", 69 | version="1.0.0" 70 | ) 71 | 72 | input_dir: Path = None 73 | output_dir: Path = None 74 | start_time: datetime = None 75 | 76 | 77 | def calculate_time_difference(start: datetime, end: datetime) -> str: 78 | """Calculate and format the time difference between two timestamps.""" 79 | time_difference = abs(end - start) 80 | days = time_difference.days 81 | hours, remainder = divmod(time_difference.seconds, 3600) 82 | minutes, _ = divmod(remainder, 60) 83 | 84 | return f"{days} days, {hours} hours, and {minutes} minutes" 85 | 86 | 87 | def count_pdf_files() -> int: 88 | """Count total number of PDF files in input directory.""" 89 | total = 0 90 | for month_dir in input_dir.iterdir(): 91 | if month_dir.is_dir(): 92 | total += len(list(month_dir.glob("*.pdf"))) 93 | return total 94 | 95 | 96 | def get_processed_files() -> dict: 97 | """Get count of processed files per month directory.""" 98 | processed = {} 99 | if output_dir.exists(): 100 | for month_dir in output_dir.iterdir(): 101 | if month_dir.is_dir(): 102 | processed[month_dir.name] = len(list(month_dir.glob("*.mmd"))) 103 | return processed 104 | 105 | 106 | def get_job_stats() -> Tuple[int, int, float, dict]: 107 | """Calculate current job statistics.""" 108 | try: 109 | total_pdfs = count_pdf_files() 110 | processed_files = get_processed_files() 111 | total_processed = sum(processed_files.values()) 112 | remaining = total_pdfs - total_processed 113 | percentage = (total_processed / total_pdfs * 100) if total_pdfs > 0 else 0 114 | 115 | logger.info( 116 | "Stats - Total PDFs: %d, Processed: %d, Remaining: %d, Percentage: %.2f%%", 117 | total_pdfs, total_processed, remaining, percentage 118 | ) 119 | 120 | return total_pdfs, total_processed, remaining, percentage, processed_files 121 | 122 | except Exception as e: 123 | logger.error("Error calculating job stats: %s", str(e)) 124 | raise HTTPException(status_code=500, detail="Error calculating job statistics") 125 | 126 | 127 | @app.get("/", response_class=HTMLResponse) 128 | def 
status() -> HTMLResponse: 129 | """Generate HTML status page showing current job statistics.""" 130 | try: 131 | total_pdfs, processed, remaining, percentage, processed_files = get_job_stats() 132 | elapsed_time = calculate_time_difference(start_time, datetime.now()) 133 | 134 | # generate month-wise progress HTML 135 | month_progress = "" 136 | for month, count in sorted(processed_files.items()): 137 | month_progress += f"<p>Month {month}: {count:,} files processed</p>\n" 138 | 139 | return f""" 140 | <html> 141 | <head> 142 | <title>Nougat Inference Status</title> 143 | <style> 169 | </style> 170 | </head> 171 | <body> <h1>Nougat Inference Progress</h1> 172 | 173 | <div class="stats"> 174 | 175 | <p>Total PDF files: {total_pdfs:,}</p> 176 | <p>Processed files: {processed:,}</p> 177 | <p>Remaining files: {remaining:,}</p> 178 | <p>Completion: {percentage:.2f}%</p> 179 | <p>Time elapsed: {elapsed_time}</p> 180 | </div> 181 | <h2>Progress by Month</h2>
182 | {month_progress} 183 | 184 | 185 | """ 186 | 187 | except Exception as e: 188 | logger.error("Error generating status page: %s", str(e)) 189 | raise HTTPException(status_code=500, detail="Error generating status page") 190 | 191 | 192 | @app.on_event("startup") 193 | async def startup_event(): 194 | """Initialize server state and log startup.""" 195 | global start_time 196 | start_time = datetime.now() 197 | 198 | logger.info("Job Status Server starting up on port %d", args.port) 199 | logger.info("Monitoring input directory: %s", input_dir) 200 | logger.info("Monitoring output directory: %s", output_dir) 201 | total_pdfs = count_pdf_files() 202 | logger.info("Total PDF files to process: %d", total_pdfs) 203 | 204 | 205 | if __name__ == "__main__": 206 | args = parse_args() 207 | input_dir = Path(args.input_dir) 208 | output_dir = Path(args.output_dir) 209 | uvicorn.run(app, host="0.0.0.0", port=args.port) -------------------------------------------------------------------------------- /postprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import argparse 5 | from pathlib import Path 6 | from typing import Set, List, Tuple 7 | 8 | 9 | def read_mmd(file_path: str) -> str: 10 | """Read MMD file content.""" 11 | with open(file_path, "r", encoding="utf-8") as f: 12 | return f.read() 13 | 14 | 15 | def parse_filename(filename: str) -> Tuple[str, str]: 16 | """Extract article ID and page number from filename.""" 17 | base_name = filename[:-4] if filename.endswith(".mmd") else filename 18 | paper_id, page_num = base_name.rsplit("_", 1) 19 | return paper_id, page_num 20 | 21 | 22 | def detect_headers(mmd: str) -> List[Tuple[int, str]]: 23 | """Detect headers in MMD content.""" 24 | return [(i, line) for i, line in enumerate(mmd.splitlines()) if line.startswith("#")] 25 | 26 | 27 | def has_abstract(mmd: str) -> bool: 28 | """Check if MMD content contains an abstract.""" 29 | return any("abstract" in line.lower() for line in mmd.splitlines()) 30 | 31 | 32 | def find_references(mmd: str) -> bool: 33 | """Find references section in MMD content.""" 34 | for line in mmd.splitlines(): 35 | if line.startswith("#") and "references" in line.lower(): 36 | return True 37 | return False 38 | 39 | 40 | def remove_authors(mmd: str) -> str: 41 | """Remove author names while preserving layout.""" 42 | lines = mmd.splitlines() 43 | abstract_line = 0 44 | for i, line in enumerate(lines): 45 | if line.startswith("#") and "abstract" in line.lower(): 46 | abstract_line = i 47 | break 48 | return "\n".join([lines[0], ""] + lines[abstract_line:]) 49 | 50 | 51 | def remove_references(mmd: str) -> str: 52 | """Remove content after references section.""" 53 | lines = mmd.splitlines() 54 | for i, line in enumerate(lines): 55 | if line.startswith("#") and "references" in line.lower(): 56 | return '\n'.join(lines[:i]) 57 | return mmd 58 | 59 | class ArticleProcessor: 60 | def __init__(self): 61 | self.headers_detected = set() 62 | self.abstract_detected = set() 63 | self.reference_pages = {} 64 | self.article_pages = {} 65 | # track which month directory each article belongs to 66 | self.article_months = {} 67 | 68 | def process_month_directory(self, month_dir: Path): 69 | """Process all MMD files in a month directory.""" 70 | if not month_dir.is_dir(): 71 | return 72 | 73 | month_name = month_dir.name 74 | for mmd_path in month_dir.glob("*.mmd"): 75 | paper_id, page_num = parse_filename(mmd_path.name) 76 | 77 | # store month 
information 78 | self.article_months[paper_id] = month_name 79 | 80 | # store page information 81 | if paper_id not in self.article_pages: 82 | self.article_pages[paper_id] = [] 83 | self.article_pages[paper_id].append(page_num) 84 | 85 | try: 86 | mmd_content = read_mmd(str(mmd_path)) 87 | 88 | # only process first page for headers and abstract 89 | if page_num == '1': 90 | if has_abstract(mmd_content): 91 | self.abstract_detected.add(paper_id) 92 | if len(detect_headers(mmd_content)) > 1: 93 | self.headers_detected.add(paper_id) 94 | 95 | # check for references 96 | if find_references(mmd_content): 97 | self.reference_pages[paper_id] = page_num 98 | 99 | except Exception as e: 100 | print(f"Error processing {mmd_path}: {e}") 101 | 102 | def get_valid_articles(self) -> Set[str]: 103 | """Return articles with both headers and abstract.""" 104 | return self.headers_detected.intersection(self.abstract_detected) 105 | 106 | def postprocess_articles(input_dir: Path, output_dir: Path, processor: ArticleProcessor): 107 | """Postprocess articles by removing authors and references.""" 108 | valid_articles = processor.get_valid_articles() 109 | 110 | for article_id in valid_articles: 111 | if article_id not in processor.reference_pages or article_id not in processor.article_pages: 112 | continue 113 | 114 | # get the year-month directory for this article 115 | month = processor.article_months[article_id] 116 | month_output_dir = output_dir / month 117 | month_output_dir.mkdir(parents=True, exist_ok=True) 118 | 119 | pages = sorted([int(p) for p in processor.article_pages[article_id]]) 120 | ref_page = int(processor.reference_pages[article_id]) 121 | processed_content = [] 122 | 123 | for page_num in pages: 124 | mmd_path = input_dir / month / f"{article_id}_{page_num}.mmd" 125 | if not mmd_path.exists(): 126 | continue 127 | 128 | content = read_mmd(str(mmd_path)) 129 | 130 | if page_num == 1: 131 | content = remove_authors(content) 132 | elif page_num == ref_page: 133 | if not content.splitlines()[0].lower().startswith("# reference"): 134 | content = remove_references(content) 135 | else: 136 | continue 137 | elif page_num > ref_page: 138 | continue 139 | 140 | processed_content.append(content) 141 | 142 | if processed_content: 143 | output_path = month_output_dir / f"{article_id}.mmd" 144 | with open(output_path, "w", encoding="utf-8") as f: 145 | f.write('\n'.join(processed_content)) 146 | 147 | def main(args): 148 | input_dir = Path(args.input_dir) 149 | output_dir = Path(args.output_dir) 150 | 151 | print("Processing MMD files...") 152 | start_time = time.time() 153 | 154 | # initialize processor 155 | processor = ArticleProcessor() 156 | 157 | # process each month directory 158 | for month_dir in input_dir.iterdir(): 159 | if month_dir.is_dir(): 160 | print(f"Processing directory: {month_dir.name}") 161 | processor.process_month_directory(month_dir) 162 | 163 | valid_articles = processor.get_valid_articles() 164 | 165 | print(f"\nFound:") 166 | print(f"- Articles with headers and abstract: {len(valid_articles)}") 167 | print(f"- Articles with references: {len(processor.reference_pages)}") 168 | print(f"- Total articles: {len(processor.article_pages)}") 169 | 170 | # postprocess valid articles 171 | print("\nPostprocessing articles...") 172 | postprocess_articles(input_dir, output_dir, processor) 173 | 174 | processing_time = time.time() - start_time 175 | print(f"\nProcessing completed in {processing_time:.2f} seconds") 176 | print(f"Processed files saved to: {output_dir}") 177 | 178 | 179 | 
if __name__ == "__main__": 180 | parser = argparse.ArgumentParser( 181 | description="Process arXiv MMD files: detect headers, abstracts, references, and postprocess" 182 | ) 183 | parser.add_argument( 184 | "--input-dir", 185 | type=str, 186 | required=True, 187 | help="Input directory containing MMD files organized by month (YYMM)" 188 | ) 189 | parser.add_argument( 190 | "--output-dir", 191 | type=str, 192 | required=True, 193 | help="Output directory for processed MMD files (will maintain month structure)" 194 | ) 195 | args = parser.parse_args() 196 | main(args) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | torchvision 3 | transformers==4.25.1 4 | timm==0.5.4 5 | ojson 6 | opencv-python-headless 7 | datasets[vision] 8 | lightning==2.0.0 9 | nltk 10 | python-Levenshtein 11 | tqdm 12 | sentencepiece 13 | sconf==0.2.3 14 | albumentations==1.0.0 15 | pypdf==3.1.0 16 | pypdfium2 17 | Pillow 18 | PyPDF2 -------------------------------------------------------------------------------- /run_nougat.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3 3 | Nougat batch inference script 4 | 5 | Processes PDFs with Nougat model in batches and extracts text to multi markdown files. 6 | Works with a directory structure organized by year-month (YYMM). 7 | 8 | Usage: 9 | python nougat_inference.py --input_dir /path/to/pdfs --output_dir /path/to/output --gpu_id 0 10 | """ 11 | 12 | import time 13 | import logging 14 | import argparse 15 | from functools import partial 16 | from pathlib import Path 17 | from typing import List 18 | 19 | import torch 20 | import pypdf 21 | from tqdm import tqdm 22 | from nougat import NougatModel 23 | from nougat.utils.dataset import LazyDataset 24 | from nougat.utils.checkpoint import get_checkpoint 25 | from nougat.postprocessing import markdown_compatible 26 | 27 | 28 | def setup_logging(output_dir: Path) -> None: 29 | """Configure logging to file and console.""" 30 | logging.basicConfig( 31 | level=logging.INFO, 32 | format="%(asctime)s - %(levelname)s - %(message)s", 33 | handlers=[ 34 | logging.FileHandler(output_dir / "nougat_inference.log"), 35 | logging.StreamHandler() 36 | ] 37 | ) 38 | 39 | 40 | def parse_args() -> argparse.Namespace: 41 | """Parse command line arguments.""" 42 | parser = argparse.ArgumentParser( 43 | description="Process PDFs with Nougat model", 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 45 | ) 46 | parser.add_argument( 47 | "--input_dir", 48 | type=str, 49 | required=True, 50 | help="Directory containing PDF files organized in YYMM folders" 51 | ) 52 | parser.add_argument( 53 | "--output_dir", 54 | type=str, 55 | required=True, 56 | help="Directory for output markdown files" 57 | ) 58 | parser.add_argument( 59 | "--gpu_id", 60 | type=int, 61 | default=0, 62 | help="GPU ID to use for inference" 63 | ) 64 | parser.add_argument( 65 | "--batch_size", 66 | type=int, 67 | default=8, 68 | help="Batch size for processing pages" 69 | ) 70 | return parser.parse_args() 71 | 72 | 73 | def load_model_to_gpu(model_tag: str, gpu_id: int) -> NougatModel: 74 | """Initialize and load Nougat model to specified GPU.""" 75 | logger = logging.getLogger("nougat_inference") 76 | logger.info(f"Loading model {model_tag} to GPU {gpu_id}") 77 | checkpoint = get_checkpoint(None, model_tag=model_tag) 78 | model = 
NougatModel.from_pretrained(checkpoint) 79 | model.to(f"cuda:{gpu_id}").to(torch.bfloat16) 80 | model.eval() 81 | return model 82 | 83 | 84 | def get_pdf_files(input_dir: Path) -> List[Path]: 85 | """Get all PDF files from the input directory structure.""" 86 | pdf_files = [] 87 | for month_dir in input_dir.iterdir(): 88 | if month_dir.is_dir(): 89 | pdf_files.extend(month_dir.glob("*.pdf")) 90 | return sorted(pdf_files) 91 | 92 | 93 | def process_pdf(pdf_path: Path, output_dir: Path, model: NougatModel, batch_size: int) -> bool: 94 | """ 95 | Process all pages of a PDF document with the Nougat model. 96 | 97 | Args: 98 | pdf_path: Path to PDF file 99 | output_dir: Directory for output files 100 | model: Loaded Nougat model 101 | batch_size: Number of pages to process at once 102 | 103 | Returns: 104 | bool: True if processing was successful, False otherwise 105 | """ 106 | logger = logging.getLogger("nougat_inference") 107 | start_time = time.time() 108 | 109 | # get document ID (remove .pdf and use full path structure) 110 | document_id = pdf_path.stem 111 | month_dir = pdf_path.parent.name 112 | 113 | try: 114 | # prepare dataset for all pages 115 | full_dataset = LazyDataset( 116 | str(pdf_path), partial(model.encoder.prepare_input, random_padding=False) 117 | ) 118 | except pypdf.errors.PdfStreamError as e: 119 | logger.error(f"Failed to load PDF {document_id}: {str(e)}") 120 | return False 121 | 122 | # create dataloader 123 | dataloader = torch.utils.data.DataLoader( 124 | full_dataset, 125 | batch_size=batch_size, 126 | shuffle=False, 127 | collate_fn=LazyDataset.ignore_none_collate, 128 | ) 129 | 130 | try: 131 | # create month directory in output 132 | month_output_dir = output_dir / month_dir 133 | month_output_dir.mkdir(exist_ok=True) 134 | 135 | # process pages 136 | for batch_idx, (sample, is_last_page) in enumerate(tqdm(dataloader, desc=f"Processing {document_id}")): 137 | with torch.no_grad(): 138 | model_output = model.inference( 139 | image_tensors=sample, 140 | early_stopping=False 141 | ) 142 | 143 | # save predictions for each page 144 | for j, output in enumerate(model_output["predictions"]): 145 | page_num = batch_idx * batch_size + j + 1 146 | formatted_output = markdown_compatible(output.strip()) 147 | 148 | output_path = month_output_dir / f"{document_id}_{page_num}.mmd" 149 | output_path.write_text(formatted_output) 150 | 151 | elapsed_time = time.time() - start_time 152 | logger.info(f"Processed {document_id} in {elapsed_time:.2f} seconds") 153 | return True 154 | 155 | except Exception as e: 156 | logger.error(f"Error processing {document_id}: {str(e)}") 157 | return False 158 | 159 | 160 | def main(): 161 | """Main execution function.""" 162 | args = parse_args() 163 | 164 | # create output directory and setup logging 165 | input_dir = Path(args.input_dir) 166 | output_dir = Path(args.output_dir) 167 | 168 | output_dir.mkdir(parents=True, exist_ok=True) 169 | setup_logging(output_dir) 170 | logger = logging.getLogger("nougat_inference") 171 | 172 | # load Nougat model 173 | model = load_model_to_gpu("0.1.0-small", args.gpu_id) 174 | 175 | # get PDF files 176 | pdf_files = get_pdf_files(input_dir) 177 | logger.info(f"Found {len(pdf_files)} PDF files to process") 178 | 179 | # process PDFs 180 | processed = 0 181 | failed = 0 182 | 183 | for pdf_path in tqdm(pdf_files, desc="Overall progress"): 184 | if process_pdf(pdf_path, output_dir, model, args.batch_size): 185 | processed += 1 186 | else: 187 | failed += 1 188 | 189 | # log final summary 190 | 
logger.info("\nProcessing Summary:") 191 | logger.info(f"Successfully processed: {processed}") 192 | logger.info(f"Failed: {failed}") 193 | logger.info(f"Total files attempted: {processed + failed}") 194 | 195 | 196 | if __name__ == "__main__": 197 | main() -------------------------------------------------------------------------------- /utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuralwork/arxiver/59ce3c3cf31894911b8f623743b0fc5a4e5ec1f5/utils/.DS_Store -------------------------------------------------------------------------------- /utils/check_complete_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from pathlib import Path 4 | from collections import defaultdict 5 | import PyPDF2 6 | 7 | 8 | 9 | def get_pdf_page_count(pdf_path): 10 | try: 11 | with open(pdf_path, 'rb') as f: 12 | pdf = PyPDF2.PdfReader(f) 13 | return len(pdf.pages) 14 | except Exception as e: 15 | print(f"Error reading {pdf_path}: {str(e)}") 16 | return None 17 | 18 | 19 | def collect_mmd_files(mmd_root): 20 | # dictionary to store paper_id -> list of page numbers 21 | mmd_files = defaultdict(list) 22 | 23 | # walk through all subdirectories 24 | for month_dir in os.listdir(mmd_root): 25 | month_path = os.path.join(mmd_root, month_dir) 26 | if not os.path.isdir(month_path): 27 | continue 28 | 29 | for filename in os.listdir(month_path): 30 | if not filename.endswith('.mmd'): 31 | continue 32 | 33 | # extract paper ID and page number - paper_id_page.mmd 34 | base_name = filename[:-4] # remove .mmd 35 | paper_id, page_num = base_name.rsplit('_', 1) 36 | mmd_files[paper_id].append(int(page_num)) 37 | 38 | return mmd_files 39 | 40 | def main(args): 41 | pdf_root = Path(args.pdf_dir) 42 | mmd_root = Path(args.mmd_dir) 43 | 44 | complete = [] 45 | incomplete = [] 46 | missing = [] 47 | 48 | # collect all MMD files first 49 | print("Collecting MMD files...") 50 | mmd_files = collect_mmd_files(mmd_root) 51 | 52 | # process each PDF file 53 | print("\nChecking PDFs against MMD files...") 54 | for month_dir in os.listdir(pdf_root): 55 | month_path = os.path.join(pdf_root, month_dir) 56 | if not os.path.isdir(month_path): 57 | continue 58 | 59 | for pdf_file in os.listdir(month_path): 60 | if not pdf_file.endswith('.pdf'): 61 | continue 62 | 63 | paper_id = pdf_file[:-4] # remove .pdf extension 64 | pdf_path = os.path.join(month_path, pdf_file) 65 | 66 | # get PDF page count 67 | pdf_pages = get_pdf_page_count(pdf_path) 68 | if pdf_pages is None: 69 | print(f"Skipping {pdf_file} due to error") 70 | continue 71 | 72 | # check if we have MMD files for this paper 73 | if paper_id in mmd_files: 74 | mmd_pages = len(mmd_files[paper_id]) 75 | max_page = max(mmd_files[paper_id]) 76 | 77 | # check if all pages are present (no gaps) 78 | expected_pages = set(range(1, max_page + 1)) 79 | actual_pages = set(mmd_files[paper_id]) 80 | 81 | if mmd_pages == pdf_pages and expected_pages == actual_pages: 82 | complete.append((paper_id, pdf_pages)) 83 | else: 84 | incomplete.append((paper_id, pdf_pages, mmd_pages)) 85 | if expected_pages != actual_pages: 86 | missing_pages = sorted(expected_pages - actual_pages) 87 | print(f"Paper {paper_id} has gaps: missing pages {missing_pages}") 88 | else: 89 | missing.append((paper_id, pdf_pages)) 90 | 91 | # print summary 92 | print("\nSummary:") 93 | print(f"Complete conversions: {len(complete)}") 94 | print(f"Incomplete conversions: 
{len(incomplete)}") 95 | print(f"Missing conversions: {len(missing)}") 96 | 97 | # print details of incomplete conversions 98 | if incomplete: 99 | print("\nIncomplete conversions details:") 100 | for paper_id, pdf_pages, mmd_pages in incomplete: 101 | print(f"Paper {paper_id}: Expected {pdf_pages} pages, found {mmd_pages} MMD files") 102 | 103 | # print details of missing conversions 104 | if missing: 105 | print("\nMissing conversions details:") 106 | for paper_id, pdf_pages in missing: 107 | print(f"Paper {paper_id}: No MMD files found (PDF has {pdf_pages} pages)") 108 | 109 | if __name__ == "__main__": 110 | parser = argparse.ArgumentParser(description='Check completeness of PDF to MMD conversion') 111 | parser.add_argument( 112 | '--pdf-dir', type=str, required=True, 113 | help='Root directory containing PDF files organized by month' 114 | ) 115 | parser.add_argument( 116 | '--mmd-dir', type=str, required=True, 117 | help='Root directory containing MMD files organized by month' 118 | ) 119 | args = parser.parse_args() 120 | main(args) -------------------------------------------------------------------------------- /utils/get_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import time 4 | import requests 5 | import argparse 6 | from pathlib import Path 7 | 8 | from tqdm import tqdm 9 | import xml.etree.ElementTree as ET 10 | 11 | 12 | 13 | def get_arxiv_metadata(arxiv_id): 14 | """Fetch metadata for a single arxiv paper.""" 15 | url = f"http://export.arxiv.org/api/query?id_list={arxiv_id}" 16 | response = requests.get(url) 17 | 18 | if response.status_code == 200: 19 | # parse the XML response 20 | root = ET.fromstring(response.content) 21 | ns = {"http://www.w3.org/2005/Atom"} 22 | 23 | entry = root.find("atom:entry", ns) 24 | if entry: 25 | title = entry.find("atom:title", ns).text.strip() 26 | abstract = entry.find("atom:summary", ns).text.strip() 27 | authors = [author.find("atom:name", ns).text for author in entry.findall("atom:author", ns)] 28 | published_date = entry.find("atom:published", ns).text.strip() 29 | link = entry.find("atom:link", ns).get("href") 30 | return { 31 | "title": title, 32 | "abstract": abstract, 33 | "authors": authors, 34 | "published_date": published_date, 35 | "link": link 36 | } 37 | return None 38 | 39 | 40 | def process_mmd_files(input_dir: Path): 41 | """Process MMD files and extract metadata.""" 42 | # ensure output file exists with headers 43 | with open("arxiv_metadata.csv", "w", newline="", encoding="utf-8") as csvfile: 44 | fieldnames = ["id", "title", "abstract", "authors", "published_date", "link"] 45 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 46 | writer.writeheader() 47 | 48 | # collect all mmd files from input directory including subdirectories 49 | mmd_files = [] 50 | for root, _, files in os.walk(input_dir): 51 | for file in files: 52 | if file.endswith(".mmd"): 53 | mmd_files.append(os.path.join(root, file)) 54 | mmd_files.sort() 55 | 56 | # process files 57 | for mmd_path in tqdm(mmd_files, desc="Processing MMD files"): 58 | arxiv_id = Path(mmd_path).stem # get filename without extension 59 | 60 | metadata = get_arxiv_metadata(arxiv_id) 61 | if metadata: 62 | # append to csv 63 | with open("arxiv_metadata.csv", "a", newline="", encoding="utf-8") as csvfile: 64 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 65 | writer.writerow({ 66 | "id": arxiv_id, 67 | "title": metadata["title"], 68 | "abstract": metadata["abstract"], 69 | 
"authors": ", ".join(metadata["authors"]), 70 | "published_date": metadata["published_date"], 71 | "link": metadata["link"] 72 | }) 73 | 74 | # respect arXiv API rate limits (3 requests per second) 75 | time.sleep(3) 76 | 77 | 78 | def main(args): 79 | input_dir = Path(args.input_dir) 80 | if not input_dir.exists(): 81 | print(f"Error: Input directory {input_dir} does not exist") 82 | return 83 | 84 | print("Starting metadata extraction...") 85 | start_time = time.time() 86 | 87 | process_mmd_files(input_dir) 88 | 89 | processing_time = time.time() - start_time 90 | print(f"\nProcessing completed in {processing_time:.2f} seconds") 91 | print("Results saved to: arxiv_metadata.csv") 92 | 93 | 94 | if __name__ == "__main__": 95 | parser = argparse.ArgumentParser( 96 | description="Extract metadata from arXiv papers using their IDs" 97 | ) 98 | parser.add_argument( 99 | "--input-dir", 100 | type=str, 101 | required=True, 102 | help="Input directory containing MMD files (can include subdirectories)" 103 | ) 104 | args = parser.parse_args() 105 | main(args) --------------------------------------------------------------------------------