├── .gitignore
├── LICENSE
├── README.md
├── arxiv-tools
│   ├── README.md
│   ├── download.py
│   ├── eda_manifest.py
│   └── extract_pdfs.py
├── job_status_server.py
├── postprocess.py
├── requirements.txt
├── run_nougat.py
└── utils
    ├── .DS_Store
    ├── check_complete_results.py
    └── get_metadata.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Neural Work
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Arxiver
2 |
3 | A toolkit for downloading arXiv papers and converting them to multi-markdown (.mmd) format with Nougat, a neural OCR model. Our pipeline can extract LaTeX equations and includes post-processing tools to clean up and merge the extracted data. See the [arxiver](https://huggingface.co/datasets/neuralwork/arxiver) dataset on the Hugging Face Hub for sample results.
4 |
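If you only want to inspect the released data rather than rebuild it, the snippet below is a minimal sketch using the Hugging Face `datasets` library (install it separately if it is not already in your environment):

```python
from datasets import load_dataset

# pull the published arxiver dataset from the Hugging Face Hub
dataset = load_dataset("neuralwork/arxiver")
print(dataset)  # shows the available splits and column names
```
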
5 | ## Project Structure
6 | ```
7 | arxiver/
8 | arxiv-tools/ # Tools for downloading arXiv papers
9 | utils/ # Utility files to check processed data, get article metadata, etc.
10 | run_nougat.py # Batch PDF processing script to extract text in .mmd format
11 | job_status_server.py # Web server to monitor extraction progress
12 |     postprocess.py        # Post-processing script to clean and merge Nougat outputs
13 | ```
14 |
15 | ## Downloading arXiv
16 |
17 | The `arxiv-tools` folder contains scripts for downloading arXiv papers and computing useful statistics about the arXiv dataset. For detailed instructions, see the [arxiv-tools README](arxiv-tools/README.md). Downloading and extracting the dataset creates a hierarchical folder structure organized by publication year and month as follows:
18 |
19 | ```
20 | output_dir/
21 | 2310/ # October 2023
22 | paper1.pdf
23 | paper2.pdf
24 | 2311/ # November 2023
25 | paper3.pdf
26 | paper4.pdf
27 | ```
28 |
29 | ## Nougat Processing
30 |
31 | The `run_nougat.py` script processes PDF files in batches using the [Nougat](https://arxiv.org/abs/2308.13418) neural OCR model:
32 |
33 | ```bash
34 | python run_nougat.py \
35 | --input_dir /path/to/datadir \
36 | --output_dir /path/to/output \
37 | --gpu_id 0 \
38 | --batch_size 8
39 | ```
40 |
41 | You can run Nougat using the extracted PDF directory as the input argument. The script processes PDFs in batches on the specified GPU and logs successful and failed jobs (Nougat is not 100% stable). The output keeps the same year-month subdirectory structure but saves each page as a separate file:
42 | ```
43 | output_dir/
44 | 2310/
45 | paper1_1.mmd # Paper 1, page 1
46 | paper1_2.mmd # Paper 1, page 2
47 | paper2_1.mmd
48 | 2311/
49 | paper3_1.mmd
50 | paper3_2.mmd
51 | paper4_1.mmd
52 | ```
53 |
54 | #### Progress Monitoring
55 | We provide an optional script, `job_status_server.py`, that serves a web interface for monitoring processing progress:
56 |
57 | ```bash
58 | python job_status_server.py \
59 | --input_dir /path/to/pdf/files \
60 | --output_dir /path/to/output \
61 | --port 8005
62 | ```
63 |
64 |
65 | ## Post-Processing
66 | The post-processing pipeline includes several steps to validate and clean up the Nougat output. You can optionally check how many of the papers have been fully processed (all pages successfully extracted) by running:
67 | ```bash
68 | cd utils
69 | python check_complete_results.py --pdf-dir /path/to/pdf/root/dir --mmd-dir /path/to/mmd/root/dir
70 | ```
71 |
72 | You can use the output .mmd files as they are, or run post-processing to remove headers and references and merge multi-page MMD files into single documents. To do this, run the post-processing script:
73 | ```bash
74 | cd ..
75 | python postprocess.py --input-dir /path/to/processed-data --output-dir /path/to/output
76 | ```
77 |
78 | Note that this script preserves the original hierarchical folder structure organized by publication year and month.
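
For example, the page-level files produced by Nougat are merged into one document per paper; the layout below is illustrative:

```
output_dir/
    2310/
        paper1.mmd    # pages 1..N merged into a single cleaned document
        paper2.mmd
    2311/
        paper3.mmd
        paper4.mmd
```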
79 |
80 | #### Metadata Extraction
81 | You can optionally get article metadata by running:
82 | ```bash
83 | cd utils
84 | python get_metadata.py --input-dir /path/to/merged-mmd-folder
85 | ```
86 |
87 | ## Notes
88 | - A GPU with CUDA support is required for efficient processing
89 | - Tested on an NVIDIA T4 GPU; processing speed depends on GPU memory and batch size
90 | - arxiv-tools/ is adapted from the original [repo](https://github.com/armancohan/arxiv-tools)
91 |
92 |
93 |
94 | From [neuralwork](https://neuralwork.ai/) with :heart:
95 |
--------------------------------------------------------------------------------
/arxiv-tools/README.md:
--------------------------------------------------------------------------------
1 | # arXiv-tools
2 |
3 | A tool for downloading and managing arXiv documents in bulk using Amazon S3.
4 |
5 | ## Prerequisites
6 |
7 | - [Amazon AWS Account](https://aws.amazon.com/free) - required for accessing arXiv's bulk data on [Amazon S3](https://aws.amazon.com/s3)
8 | - Python 2.x to use the `s3cmd` package
9 | - Python 3.x for manifest file analysis
10 |
11 | ## Installation
12 |
13 | 1. Install s3cmd, a command-line tool for interacting with Amazon S3:
14 | ```bash
15 | pip install s3cmd # Python 2 only
16 | ```
17 |
18 | 2. Configure s3cmd with your AWS credentials:
19 | ```bash
20 | s3cmd --configure
21 | ```
22 | > Note: You'll need your AWS credentials from the Account Management tab on the AWS website.
23 |
24 | 3. Install required Python packages for manifest file analysis:
25 | ```bash
26 | pip install pandas # For Python 3.x
27 | ```
28 |
29 | ## Usage
30 |
31 | ### 1. Download Manifest Files
32 |
33 | First, download the manifest files containing the complete list of available arXiv files:
34 |
35 | **For PDF documents:**
36 | ```bash
37 | s3cmd get --requester-pays \
38 | s3://arxiv/pdf/arXiv_pdf_manifest.xml \
39 | local-directory/arXiv_pdf_manifest.xml
40 | ```
41 |
42 | **For source documents:**
43 | ```bash
44 | s3cmd get --requester-pays \
45 | s3://arxiv/src/arXiv_src_manifest.xml \
46 | local-directory/arXiv_src_manifest.xml
47 | ```
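
Each `<file>` entry in a manifest describes one tar archive. The fields below are the ones `eda_manifest.py` parses; the values, including the file name, are purely illustrative:

```xml
<file>
  <filename>pdf/arXiv_pdf_23_10_1.tar</filename>
  <num_items>500</num_items>
  <size>524288000</size>
  <timestamp>2023-10-05 03:00:00</timestamp>
  <yymm>2310</yymm>
</file>
```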
48 |
49 | ### 2. Analyze Manifest Files (Optional)
50 |
51 | Use the `eda_manifest.py` script to analyze the manifest files:
52 |
53 | ```bash
54 | python eda_manifest.py
55 | ```
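
Note that the manifest path is hardcoded as `arXiv_pdf_manifest.xml` (the `MANIFEST_PATH` constant in `main()`), so run the script from the directory containing the manifest or adjust the constant accordingly.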
56 |
57 | This script provides useful statistics about the arXiv dataset:
58 | - Total size of the dataset in bytes, MB, and GB
59 | - Total number of articles
60 | - Average file sizes
61 | - Number of files per time period
62 | - Detailed statistics for recent years (2022-2023)
63 |
64 | ### 3. Download arXiv Files
65 |
66 | Use the `download.py` script to fetch the actual files:
67 |
68 | **For PDF files:**
69 | ```bash
70 | python download.py \
71 | --manifest_file /path/to/pdf-manifest \
72 | --mode pdf \
73 | --output_dir /path/to/output
74 | ```
75 |
76 | **For source files:**
77 | ```bash
78 | python download.py \
79 | --manifest_file /path/to/src-manifest \
80 | --mode src \
81 | --output_dir /path/to/output
82 | ```
83 |
84 | The files will be downloaded to your specified output directory. Each file is in `.tar` format and approximately 500MB in size.
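
With `--mode pdf`, the archives are saved under a `pdf/` subfolder of the output directory (file names below are illustrative):

```
output_dir/
    pdf/
        arXiv_pdf_23_10_1.tar
        arXiv_pdf_23_10_2.tar
```

Point the `--data_dir` argument of the extraction step (below) at this folder.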
85 |
86 | ### 4. Extract PDFs from Tar Files
87 |
88 | After downloading the tar files, use `extract_pdfs.py` to extract PDFs into organized directories:
89 |
90 | ```bash
91 | python extract_pdfs.py \
92 | --data_dir /path/to/tar/files \
93 | --output_dir /path/to/output \
94 | [--keep_tars] # Optional: keep original tar files
95 | ```
96 |
97 | The script creates year-month subdirectories (e.g., "2310" for October 2023) and extracts the PDF files into them. Example output structure:
98 | ```
99 | output_dir/
100 | 2310/ # October 2023
101 | paper1.pdf
102 | paper2.pdf
103 | 2311/ # November 2023
104 | paper3.pdf
105 | paper4.pdf
106 | ```
107 |
108 |
109 | ## Additional Resources
110 | - For metadata downloads, consider using [metha](https://github.com/miku/metha)
111 |
112 | ## Notes
113 | - The arXiv files are stored in requester-pays buckets on Amazon S3
114 | - Each archive file is approximately 500MB in size and uses the `.tar` format
--------------------------------------------------------------------------------
/arxiv-tools/download.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import subprocess
4 | from argparse import ArgumentParser
5 | import xml.etree.ElementTree as ET
6 |
7 |
8 | # set up logging configuration
9 | log_file = os.path.join("logs", "preprocessing_logs.log")
10 | os.makedirs("logs", exist_ok=True)
11 |
12 | logging.basicConfig(
13 | level=logging.INFO,
14 | format="%(asctime)s - %(levelname)s - %(message)s",
15 | handlers=[
16 | logging.FileHandler(log_file),
17 | logging.StreamHandler()
18 | ]
19 | )
20 | logger = logging.getLogger(__name__)
21 |
22 | def download_files(**args):
23 | manifest_file = args["manifest_file"]
24 | mode = args["mode"]
25 | out_dir = args["output_dir"]
26 |
27 |     if mode not in ("pdf", "src"):
28 |         logger.error("Invalid mode: %s. Mode should be 'pdf' or 'src'.", mode)
29 |         return
30 | def get_file(fname, out_dir):
31 | cmd = ["s3cmd", "get", "--requester-pays", "s3://arxiv/%s" % fname, "./%s" % out_dir]
32 | logger.info("Downloading file: %s to %s", fname, out_dir)
33 |         subprocess.call(cmd)
34 |     os.makedirs(os.path.join(out_dir, mode), exist_ok=True)  # ensure the local download directory exists
35 |     try:
36 |         for file in ET.parse(manifest_file).getroot().findall("file"):
37 | filename = file.find("filename").text
38 | logger.info("Processing file: %s", filename)
39 |
40 | get_file(filename, out_dir='%s/%s/' % (out_dir, mode))
41 | logger.debug("Successfully downloaded: %s", filename)
42 | except Exception as e:
43 | logger.error("Failed to process manifest file: %s", str(e), exc_info=True)
44 |
45 | logger.info("Download process completed")
46 |
47 |
48 | if __name__ == "__main__":
49 | argparser = ArgumentParser()
50 | argparser.add_argument("--manifest_file", "-m", type=str, help="The manifest file to download files from arXiv.", required=True)
51 | argparser.add_argument("--output_dir", "-o", type=str, default="data", help="Output directory to save files to.")
52 | argparser.add_argument("--mode", type=str, default="src", choices=set(("pdf", "src")), help="Can be 'pdf' or 'src'.")
53 | args = argparser.parse_args()
54 | download_files(**vars(args))
55 |
--------------------------------------------------------------------------------
/arxiv-tools/eda_manifest.py:
--------------------------------------------------------------------------------
1 | """
2 | !/usr/bin/env python3
3 | arXiv Manifest Analysis Script
4 |
5 | This script analyzes XML manifest files from arXiv's bulk data access,
6 | providing insights about file sizes, article counts, and temporal distribution.
7 | """
8 |
9 | import xml.etree.ElementTree as ET
10 | import pandas as pd
11 | from typing import List, Dict
12 |
13 |
14 | def parse_manifest(filepath: str) -> pd.DataFrame:
15 | """
16 | Parse the arXiv manifest XML file into a pandas DataFrame.
17 |
18 | Args:
19 | filepath: Path to the manifest XML file
20 |
21 | Returns:
22 | DataFrame containing parsed manifest data
23 | """
24 | tree = ET.parse(filepath)
25 | root = tree.getroot()
26 |
27 | data = [{
28 | "Filename": element.find("filename").text,
29 | "Number of Items": int(element.find("num_items").text),
30 | "Size": int(element.find("size").text),
31 | "Timestamp": element.find("timestamp").text,
32 | "YYMM": element.find("yymm").text,
33 | } for element in root.findall("file")]
34 |
35 | return pd.DataFrame(data)
36 |
37 |
38 | def analyze_total_statistics(df: pd.DataFrame) -> Dict:
39 | """Calculate and return overall statistics from the manifest data."""
40 | total_size = df["Size"].sum()
41 | total_articles = df["Number of Items"].sum()
42 | total_tars = len(df)
43 |
44 | return {
45 | "total_size_bytes": total_size,
46 | "total_size_mb": total_size / 1e6,
47 | "total_size_gb": total_size / 1e9,
48 | "total_articles": total_articles,
49 | "total_tar_files": total_tars,
50 | "avg_article_size_mb": (total_size / total_articles) / 1e6,
51 | "avg_tar_size_mb": (total_size / total_tars) / 1e6,
52 | "avg_items_per_tar": total_articles / total_tars
53 | }
54 |
55 |
56 | def analyze_yearly_data(df: pd.DataFrame, year: str) -> Dict:
57 | """
58 | Analyze manifest data for a specific year.
59 |
60 | Args:
61 | df: Full manifest DataFrame
62 | year: Two-digit year string (e.g., "22" for 2022)
63 |
64 | Returns:
65 | Dictionary containing year-specific statistics
66 | """
67 | year_df = df[df["YYMM"].str.startswith(year)]
68 | total_size = year_df["Size"].sum()
69 | total_articles = year_df["Number of Items"].sum()
70 |
71 | return {
72 | "total_articles": total_articles,
73 | "total_size_gb": total_size / 1e9,
74 | "data": year_df
75 | }
76 |
77 |
78 | def print_statistics(stats: Dict) -> None:
79 | """Print formatted statistics."""
80 | print("\n=== Overall Statistics ===")
81 | print(f"Total Size: {stats['total_size_gb']:.2f} GB ({stats['total_size_bytes']:,} bytes)")
82 | print(f"Total Articles: {stats['total_articles']:,}")
83 | print(f"Total TAR Files: {stats['total_tar_files']:,}")
84 | print(f"\nAverages:")
85 | print(f"- Article Size: {stats['avg_article_size_mb']:.2f} MB")
86 | print(f"- TAR File Size: {stats['avg_tar_size_mb']:.2f} MB")
87 | print(f"- Items per TAR: {stats['avg_items_per_tar']:.1f}")
88 |
89 |
90 | def main():
91 | # path to input manifest file
92 | MANIFEST_PATH = "arXiv_pdf_manifest.xml"
93 |
94 | # load and parse manifest
95 | print("Parsing manifest file...")
96 | df = parse_manifest(MANIFEST_PATH)
97 |
98 | # calculate overall statistics
99 | stats = analyze_total_statistics(df)
100 | print_statistics(stats)
101 |
102 | # analyze recent years
103 | print("\n=== Recent Years Analysis ===")
104 | for year in ["22", "23"]:
105 | year_stats = analyze_yearly_data(df, year)
106 | print(f"\nYear 20{year}:")
107 | print(f"- Articles: {year_stats['total_articles']:,}")
108 | print(f"- Size: {year_stats['total_size_gb']:.2f} GB")
109 |
110 | # export year-specific data if needed
111 | year_stats["data"].to_csv(f"df_{year}.csv", index=False)
112 |
113 | # print unique YYMM values for reference
114 | print("\n=== Time Coverage ===")
115 | unique_yymm = sorted(df["YYMM"].unique())
116 | print(f"Coverage period: {unique_yymm[0]} to {unique_yymm[-1]}")
117 |
118 |
119 | if __name__ == "__main__":
120 | main()
--------------------------------------------------------------------------------
/arxiv-tools/extract_pdfs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | arXiv Tar Extraction Script
4 |
5 | Extracts PDF files from arXiv tar archives into organized directories.
6 | The script processes tar files named like 'arXiv_pdf_YY_MM_N.tar' and organizes PDFs into
7 | YYMM directories.
8 |
9 | Usage:
10 |     python extract_pdfs.py --data_dir /path/to/tar/files
11 | """
12 |
13 | import os
14 | import tarfile
15 | import logging
16 | import argparse
17 | from typing import List
18 | from pathlib import Path
19 |
20 |
21 | def setup_logging() -> logging.Logger:
22 | """Configure and return logger."""
23 | logging.basicConfig(
24 | level=logging.INFO,
25 | format="%(asctime)s - %(levelname)s - %(message)s",
26 | handlers=[
27 | logging.FileHandler("arxiv_extract.log"),
28 | logging.StreamHandler()
29 | ]
30 | )
31 | return logging.getLogger("arxiv_extractor")
32 |
33 |
34 | def parse_args() -> argparse.Namespace:
35 | """Parse command line arguments."""
36 | parser = argparse.ArgumentParser(
37 | description="Extract PDFs from arXiv tar archives",
38 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
39 | )
40 | parser.add_argument(
41 | "--data_dir",
42 | type=str,
43 | required=True,
44 | help="Directory containing arXiv tar files"
45 | )
46 | parser.add_argument(
47 | "--output_dir",
48 | type=str,
49 | default="extracted",
50 | help="Base directory for extracted PDFs"
51 | )
52 | parser.add_argument(
53 | "--keep_tars",
54 | action="store_true",
55 | help="Keep tar files after extraction (default: delete)"
56 | )
57 |
58 | return parser.parse_args()
59 |
60 |
61 | def get_tar_files(data_dir: Path) -> List[Path]:
62 | """
63 | Find all tar files in the specified directory.
64 |
65 | Args:
66 | data_dir: Directory to search for tar files
67 |
68 | Returns:
69 | List of paths to tar files
70 | """
71 | return list(data_dir.glob("*.tar"))
72 |
73 |
74 | def extract_pdfs_from_tar(
75 | tar_path: Path,
76 | output_base: Path,
77 | keep_tar: bool = False
78 | ) -> bool:
79 | """
80 | Extract PDF files from a tar archive into a year-month directory.
81 |
82 | Args:
83 | tar_path: Path to the tar file
84 | output_base: Base directory for extracted files
85 | keep_tar: Whether to keep the tar file after extraction
86 |
87 | Returns:
88 | bool: True if extraction successful, False otherwise
89 | """
90 | logger = logging.getLogger("arxiv_extractor")
91 |
92 | try:
93 | # extract directory name (YYMM) from tar file name
94 | # example: "arXiv_pdf_23_10_1.tar" -> "2310"
95 | parts = tar_path.stem.split("_")
96 | if len(parts) != 5 or not parts[2].isdigit() or not parts[3].isdigit():
97 | logger.warning("Invalid tar file name format: %s", tar_path.name)
98 | return False
99 |
100 | dir_name = parts[2] + parts[3] # e.g., "2310" for year 23, month 10
101 | output_dir = output_base / dir_name
102 | output_dir.mkdir(parents=True, exist_ok=True)
103 |
104 | logger.info("Extracting %s to directory %s", tar_path.name, output_dir)
105 |
106 | # Extract PDF files
107 | with tarfile.open(tar_path, "r") as tar:
108 | pdf_members = [m for m in tar.getmembers()
109 | if m.isreg() and m.name.endswith(".pdf")]
110 |
111 | total_pdfs = len(pdf_members)
112 | for i, member in enumerate(pdf_members, 1):
113 | tar.extract(member, output_dir)
114 | if i % 100 == 0: # log progress every 100 files
115 | logger.info("Extracted %d/%d PDFs (%.1f%%)",
116 | i, total_pdfs, (i/total_pdfs)*100)
117 |
118 | logger.info("Successfully extracted %d PDF files to %s",
119 | total_pdfs, output_dir)
120 |
121 | # clean up tar file if requested
122 | if not keep_tar:
123 | tar_path.unlink()
124 | logger.info("Deleted tar file: %s", tar_path.name)
125 |
126 | return True
127 |
128 | except Exception as e:
129 | logger.error("Error processing %s: %s", tar_path.name, str(e))
130 | return False
131 |
132 |
133 | def main():
134 | args = parse_args()
135 | logger = setup_logging()
136 |
137 | data_dir = Path(args.data_dir)
138 | output_base = Path(args.output_dir)
139 |
140 | if not data_dir.is_dir():
141 | logger.error("Data directory does not exist: %s", data_dir)
142 | return
143 |
144 | # create output base directory
145 | output_base.mkdir(parents=True, exist_ok=True)
146 |
147 | # get list of tar files
148 | tar_files = get_tar_files(data_dir)
149 | if not tar_files:
150 | logger.error("No tar files found in %s", data_dir)
151 | return
152 |
153 | logger.info("Found %d tar files to process", len(tar_files))
154 |
155 | # process each tar file
156 | successful = 0
157 | failed = 0
158 |
159 | for tar_path in tar_files:
160 | if extract_pdfs_from_tar(tar_path, output_base, args.keep_tars):
161 | successful += 1
162 | else:
163 | failed += 1
164 |
165 | # log final statistics
166 | logger.info("\nExtraction complete:")
167 | logger.info("- Successful: %d", successful)
168 | logger.info("- Failed: %d", failed)
169 | logger.info("- Total processed: %d", len(tar_files))
170 |
171 |
172 | if __name__ == "__main__":
173 | main()
174 |
175 |
176 |
177 |
--------------------------------------------------------------------------------
/job_status_server.py:
--------------------------------------------------------------------------------
1 | """
2 | !/usr/bin/env python3
3 | Job Status Server
4 |
5 | A FastAPI server that monitors and reports Nougat inference progress.
6 | Scans directories of MMD files to track processing progress.
7 |
8 | Dependencies:
9 | - fastapi
10 | - uvicorn[standard]
11 |
12 | Usage:
13 |     python job_status_server.py --input_dir /path/to/pdf/files --output_dir /path/to/output [--port 8005]
14 | """
15 |
16 | import logging
17 | import argparse
18 | from datetime import datetime
19 | from pathlib import Path
20 | from typing import Tuple
21 |
22 | from fastapi import FastAPI, HTTPException
23 | from fastapi.responses import HTMLResponse
24 | import uvicorn
25 |
26 |
27 | def parse_args() -> argparse.Namespace:
28 | """Parse command line arguments."""
29 | parser = argparse.ArgumentParser(
30 | description="Monitor Nougat inference progress",
31 | formatter_class=argparse.ArgumentDefaultsHelpFormatter
32 | )
33 | parser.add_argument(
34 | "--input_dir",
35 | type=str,
36 | required=True,
37 | help="Directory containing source PDF files"
38 | )
39 | parser.add_argument(
40 | "--output_dir",
41 | type=str,
42 | required=True,
43 | help="Directory containing output MMD files"
44 | )
45 | parser.add_argument(
46 | "--port",
47 | type=int,
48 | default=8005,
49 | help="Port number for the server"
50 | )
51 | return parser.parse_args()
52 |
53 |
54 | # Configure logging
55 | logging.basicConfig(
56 | level=logging.INFO,
57 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
58 | handlers=[
59 | logging.FileHandler("job_server.log"),
60 | logging.StreamHandler()
61 | ]
62 | )
63 | logger = logging.getLogger("job_status_server")
64 |
65 | # initialize FastAPI app
66 | app = FastAPI(
67 | title="Nougat Job Status Server",
68 | description="Monitor Nougat inference progress",
69 | version="1.0.0"
70 | )
71 |
72 | input_dir: Path = None
73 | output_dir: Path = None
74 | start_time: datetime = None
75 |
76 |
77 | def calculate_time_difference(start: datetime, end: datetime) -> str:
78 | """Calculate and format the time difference between two timestamps."""
79 | time_difference = abs(end - start)
80 | days = time_difference.days
81 | hours, remainder = divmod(time_difference.seconds, 3600)
82 | minutes, _ = divmod(remainder, 60)
83 |
84 | return f"{days} days, {hours} hours, and {minutes} minutes"
85 |
86 |
87 | def count_pdf_files() -> int:
88 | """Count total number of PDF files in input directory."""
89 | total = 0
90 | for month_dir in input_dir.iterdir():
91 | if month_dir.is_dir():
92 | total += len(list(month_dir.glob("*.pdf")))
93 | return total
94 |
95 |
96 | def get_processed_files() -> dict:
97 | """Get count of processed files per month directory."""
98 | processed = {}
99 | if output_dir.exists():
100 | for month_dir in output_dir.iterdir():
101 | if month_dir.is_dir():
102 | processed[month_dir.name] = len(list(month_dir.glob("*.mmd")))
103 | return processed
104 |
105 |
106 | def get_job_stats() -> Tuple[int, int, int, float, dict]:
107 | """Calculate current job statistics."""
108 | try:
109 | total_pdfs = count_pdf_files()
110 | processed_files = get_processed_files()
111 | total_processed = sum(processed_files.values())
112 | remaining = total_pdfs - total_processed
113 | percentage = (total_processed / total_pdfs * 100) if total_pdfs > 0 else 0
114 |
115 | logger.info(
116 | "Stats - Total PDFs: %d, Processed: %d, Remaining: %d, Percentage: %.2f%%",
117 | total_pdfs, total_processed, remaining, percentage
118 | )
119 |
120 | return total_pdfs, total_processed, remaining, percentage, processed_files
121 |
122 | except Exception as e:
123 | logger.error("Error calculating job stats: %s", str(e))
124 | raise HTTPException(status_code=500, detail="Error calculating job statistics")
125 |
126 |
127 | @app.get("/", response_class=HTMLResponse)
128 | def status() -> HTMLResponse:
129 | """Generate HTML status page showing current job statistics."""
130 | try:
131 | total_pdfs, processed, remaining, percentage, processed_files = get_job_stats()
132 | elapsed_time = calculate_time_difference(start_time, datetime.now())
133 |
134 | # generate month-wise progress HTML
135 | month_progress = ""
136 | for month, count in sorted(processed_files.items()):
137 |             month_progress += f"<p>Month {month}: {count:,} files processed</p>\n"
138 |
139 |         return f"""
140 |         <html>
141 |             <head>
142 |                 <title>Nougat Job Status</title>
143 |             </head>
144 |             <body>
145 |                 <h1>Nougat Job Status</h1>
146 |                 {month_progress}
147 |                 <p>Total PDF files: {total_pdfs:,}</p>
148 |                 <p>Processed files: {processed:,}</p>
149 |                 <p>Remaining files: {remaining:,}</p>
150 |                 <p>Completion: {percentage:.2f}%</p>
151 |                 <p>Time elapsed: {elapsed_time}</p>
152 |             </body>
153 |         </html>
154 |         """