├── .gitignore ├── LICENSE ├── README.md ├── README_ANNOTATION.md ├── README_CC_URL_PARSE.md ├── README_DOWNLOAD.md ├── README_EXTENSIONS.md ├── app ├── Dockerfile ├── alembic.ini ├── alembic │ ├── README │ ├── env.py │ ├── script.py.mako │ └── versions │ │ ├── 17cdff8379cb_initial_sources_record.py │ │ ├── 2de829bd1ca3_remove_sensitive_info_add_bytehash.py │ │ ├── d1fbae82c5fa_initial_metadata_model.py │ │ └── eb22a058c4c9_less_mandatory_fields_for_doc_sources.py ├── annotate_run.py ├── cc_parse_merge_and_recover_urls.py ├── cc_parse_partition_listings.py ├── cc_parse_snapshot.py ├── configs │ ├── default_config.yaml │ └── extensions │ │ ├── obj_detection │ │ └── ws_yolo │ │ │ ├── 1header.json │ │ │ ├── 3headers.json │ │ │ ├── baseline.json │ │ │ ├── baseline_quality.json │ │ │ ├── local │ │ │ ├── 1header_balanced_quality.json │ │ │ ├── 1header_balanced_quality_multilang.json │ │ │ ├── 1header_balanced_quality_report.json │ │ │ ├── 3headers_balanced_quality.json │ │ │ ├── 3headers_balanced_quality_report.json │ │ │ ├── tableonly_balanced.json │ │ │ └── tableonly_balanced_cut.json │ │ │ ├── spaceml │ │ │ ├── 1header.json │ │ │ ├── 1header_quality.json │ │ │ ├── 3headers.json │ │ │ ├── 3headers_quality.json │ │ │ ├── baseline.json │ │ │ ├── baseline_quality.json │ │ │ ├── tableonly.json │ │ │ └── tableonly_quality.json │ │ │ ├── tableonly.json │ │ │ └── tableonly_quality.json │ │ └── pretrain │ │ └── layoutlm │ │ ├── 1header_balanced_quality.json │ │ └── 1header_balanced_quality_test.json ├── download_dump_data.py ├── download_prepare_urls.py ├── download_run.py ├── orm │ ├── __init__.py │ ├── dbutils │ │ └── db_connection.py │ └── models.py ├── pp_compute_perplexity.py ├── requirements.txt ├── resources │ ├── fasttext-models │ │ └── .gitkeep │ └── wikipedia-models │ │ └── .gitkeep ├── scripts │ ├── annotation-kickoff.sh │ ├── annotation-launch.sbatch │ ├── cc-parse-launch.sbatch │ ├── download-launch.sbatch │ ├── install_libreoffice_centos.sh │ ├── pp-compute-perplexity.sbatch │ ├── run-filter-tars.sbatch │ └── run_single_node.sh ├── settings │ ├── __init__.py │ ├── annotation.py │ ├── bbox.py │ ├── colors.py │ ├── content_awareness.py │ ├── download.py │ ├── entities.py │ ├── entity_names.json │ └── filesystem.py ├── src │ ├── __init__.py │ ├── annotation │ │ ├── __init__.py │ │ ├── annotation_objects.py │ │ ├── annotation_quality.py │ │ ├── annotator_process.py │ │ ├── builtin_styles.py │ │ ├── colorization │ │ │ ├── __init__.py │ │ │ ├── colorization_handler.py │ │ │ ├── colorize_doc.py │ │ │ ├── entities │ │ │ │ ├── __init__.py │ │ │ │ ├── figure.py │ │ │ │ ├── form.py │ │ │ │ ├── header_footer.py │ │ │ │ ├── paragraph.py │ │ │ │ ├── tables │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── colorize_table.py │ │ │ │ │ ├── element_parsers.py │ │ │ │ │ ├── styles.py │ │ │ │ │ ├── table_colorization_handler.py │ │ │ │ │ └── utils.py │ │ │ │ ├── text_box.py │ │ │ │ └── toc.py │ │ │ ├── heuristics │ │ │ │ ├── __init__.py │ │ │ │ ├── build_heuristics.py │ │ │ │ ├── content_awareness.py │ │ │ │ └── utils.py │ │ │ └── mappings.py │ │ ├── config.py │ │ ├── entity_detection │ │ │ ├── __init__.py │ │ │ ├── detection.py │ │ │ ├── entity_detector.py │ │ │ └── utils.py │ │ ├── language_detection │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ └── utils.py │ │ ├── oxml_metadata.py │ │ ├── postprocessing │ │ │ ├── __init__.py │ │ │ ├── filters.py │ │ │ ├── postprocess.py │ │ │ └── table.py │ │ ├── preprocessing │ │ │ ├── __init__.py │ │ │ └── highlighting.py │ │ ├── sanity_checks.py │ │ ├── soffice │ │ │ ├── 
__init__.py │ │ │ ├── conversion_manager.py │ │ │ └── utils.py │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── text_entity_matching.py │ │ │ └── text_extraction.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── bbox_utils.py │ │ │ ├── color_utils.py │ │ │ ├── docx_utils.py │ │ │ ├── identifiers.py │ │ │ ├── pdf_utils.py │ │ │ ├── updateable_zipfile.py │ │ │ └── zip_bomb.py │ ├── cc_processing │ │ ├── __init__.py │ │ ├── cc_url_process.py │ │ ├── deduplicate.py │ │ └── preprocess_cc_urls.py │ ├── data_sources │ │ ├── __init__.py │ │ ├── download_exceptions.py │ │ ├── download_process.py │ │ ├── http_handlers.py │ │ └── maldoc_check.py │ ├── exceptions.py │ ├── extensions │ │ ├── __init__.py │ │ ├── obj_detection │ │ │ ├── __init__.py │ │ │ ├── data_prep │ │ │ │ ├── __init__.py │ │ │ │ ├── doclaynet_yolo_formatter.py │ │ │ │ ├── publaynet_yolo_formatter.py │ │ │ │ ├── pubtabnet_yolo_formatter.py │ │ │ │ ├── wordscape_yolo_config_handler.py │ │ │ │ ├── wordscape_yolo_formatter.py │ │ │ │ ├── wordscape_yolo_formatter_unzipped.py │ │ │ │ └── yolo_dataset_report.py │ │ │ └── spaceml │ │ │ │ ├── move_train_data.py │ │ │ │ ├── move_train_data_singlefiles.py │ │ │ │ ├── ws_yolo_dataprep.py │ │ │ │ └── ws_yolo_experimentrun.py │ │ └── pretrain │ │ │ └── layoutlmv3 │ │ │ └── data_prep │ │ │ ├── __init__.py │ │ │ ├── wordscape_layoutlmv3_config_handler.py │ │ │ ├── wordscape_layoutlmv3_dataprep.py │ │ │ ├── wordscape_layoutlmv3_datasetbuilder.py │ │ │ └── wordscape_layoutlmv3_formatter.py │ └── quality │ │ ├── __init__.py │ │ ├── perplexity.py │ │ └── text_normalizer.py ├── utilities │ ├── checksums.parquet │ ├── compute_checksums.py │ ├── merge_annotations_metadata.py │ ├── merge_sources_metadata.py │ ├── run_filter_tars.py │ └── run_whitelist_pages.py └── visualize_annotations.py ├── data └── .gitkeep └── docs └── wordscape.png /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | .vscode/* 3 | *.pyc 4 | *.DS_Store 5 | Pipfile 6 | Pipfile.lock 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | 11 | # data folders 12 | data/* 13 | !data/.gitkeep 14 | !data/tmp/.gitkeep 15 | annotated/* 16 | resources/*/*.bin 17 | resources/*/*.model 18 | resources/*/*.ftz 19 | 20 | # venv 21 | .venv -------------------------------------------------------------------------------- /README_ANNOTATION.md: -------------------------------------------------------------------------------- 1 | # Annotation of Word files 2 | 3 | All instructions are assumed to be executed from the root directory of the project, with the python virtual environment 4 | activated and all necessary dependencies installed. In this example, we will be processing the word files downloaded in 5 | the previous step of the WordScape pipeline. 6 | 7 | This part of WordScape assumes that you have input files (i.e. Word .doc or .docx files) stored in 8 | gzip compressed tar archives: 9 | 10 | ``` 11 | DATA_DIR 12 | ├── archive_1.tar.gz 13 | ├── file_1.doc 14 | ├── file_2.docx 15 | ... 16 | ├── file_k.doc 17 | ├── archive_2.tar.gz 18 | ├── file_1.doc 19 | ... 20 | ├── archive_n.tar.gz 21 | ``` 22 | 23 | The Word Scape pipeline will process each archive in parallel. The output will have the following components: 24 | 25 | - A `failed` directory with jsonl files that contain the filenames of Word files that failed to process, including the 26 | reason for 27 | failure. 28 | - A `logs` directory with log files from each worker. 
29 | - A `meta` directory with jsonl files that contain metadata on the document level and on the page level. 30 | - A `multimodal` directory with tar.gz files that contain multimodal data for each document. The multimodal data 31 | includes images of each page, and json files that contain OCR text, word bounding boxes, and entity bounding boxes. 32 | - A `text` directory with jsonl files that contain OCR text for each document and each page. 33 | - A `version_info.txt` file that contains the timestamp, git branch, and commit hash of the code used to process the 34 | data. 35 | - An `args.json` file that contains the arguments used to run the pipeline. 36 | - A `config.yaml` file that contains the configuration used to run the pipeline. 37 | 38 | The output directory structure will look like this: 39 | 40 | ``` 41 | ./data/annotated/// 42 | ├── failed 43 | ├── failed_.jsonl 44 | ... 45 | ├── logs 46 | ├── .log 47 | ... 48 | ├── meta 49 | ├── doc_meta_.jsonl 50 | ├── page_meta_.jsonl 51 | ... 52 | ├── multimodal 53 | ├── docs_.tar.gz 54 | ├── doc__p.jpg 55 | ├── entities__p.json 56 | ├── text__p.json 57 | ├── words__p.json 58 | ... 59 | ... 60 | ├── text 61 | ├── doc_text_.jsonl 62 | ├── page_text_.jsonl 63 | ... 64 | ├── version_info.txt 65 | ├── args.json 66 | ├── config.yaml 67 | ``` 68 | 69 | ## Running annotation scripts 70 | 71 | Here we describe how to run the annotation scripts. The scripts are designed to be run on a Slurm cluster, but can also 72 | be run locally. 73 | 74 | ### Running on a Slurm cluster 75 | 76 | To run WordScape on a Slurm cluster, you can use the `annotation-kickoff.sh` script from the `scripts` directory. 77 | This script will divide all files ending in `.tar.gz` into partitions. Each partition will be processed by a separate 78 | Slurm job. To run using Slurm, use 79 | 80 | ```bash 81 | bash scripts/annotation-kickoff.sh $CRAWL_ID $DATA_DIR 82 | ``` 83 | 84 | where the environment variable `$CRAWL_ID` corresponds to the ID of the crawl (e.g., "CC-MAIN-2022-49") and `$DATA_DIR` 85 | is the directory of the Word source files. After creating the partitions, the script submits the jobs to 86 | the Slurm cluster by calling the script `scripts/annotation-launch.sbatch`. 87 | 88 | ### Running locally 89 | 90 | Alternatively, you can also run the annotation script locally. To do so, you can directly call the `annotate_run.py` 91 | script: 92 | 93 | ```bash 94 | python annotate_run.py \ 95 | --data_dir $DATA_DIR \ 96 | --crawl_id $CRAWL_ID \ 97 | --max_docs -1 \ 98 | --output_dir $OUTPUT_DIR 99 | ``` 100 | 101 | ## Computing perplexity scores 102 | 103 | Perplexity scores can be computed using the `pp_compute_perplexity.py` script. This script will download the 5-gram 104 | Kneser-Ney models and SentencePiece tokenizers used in the [CCNet pipeline](https://github.com/facebookresearch/cc_net). 105 | You can run the script with the following command: 106 | 107 | ```bash 108 | python pp_compute_perplexity.py \ 109 | --lang $LANG \ 110 | --data $ANNOTATIONS_ROOT 111 | ``` 112 | 113 | After downloading the language model for the specified language, the script will compute the perplexity scores for each 114 | document in the annotations directory, and write the results to the `meta_ppl` directory, which contains the same data 115 | as the `meta` directory, but with the perplexity scores added to the document-level metadata. 
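As a quick sanity check after this step, you can scan the document-level metadata for low-perplexity documents. The sketch below is a minimal example that assumes the `doc_meta_*.jsonl` naming shown above and newline-delimited JSON records; the `doc_id` and `perplexity` field names are illustrative assumptions, so check them against the actual keys in your output.

```python
import json
import sys
from pathlib import Path

# Usage: python inspect_ppl.py <path to annotation output directory>
# NOTE: the "doc_id" and "perplexity" keys are assumptions for illustration;
# check the actual field names in your doc_meta_*.jsonl files.
meta_ppl_dir = Path(sys.argv[1]) / "meta_ppl"

for meta_file in sorted(meta_ppl_dir.glob("doc_meta_*.jsonl")):
    with meta_file.open(encoding="utf-8") as fh:
        for line in fh:
            record = json.loads(line)
            ppl = record.get("perplexity")
            if ppl is not None and ppl < 500:
                # keep only documents below an (arbitrary) perplexity threshold
                print(record.get("doc_id"), ppl)
```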
116 | -------------------------------------------------------------------------------- /README_CC_URL_PARSE.md: -------------------------------------------------------------------------------- 1 | # Getting DOC / DOCX URLs from CommonCrawl 2 | 3 | All instructions are assumed to be executed from the root directory of the project, with the python virtual environment 4 | activated. In this example, we will be processing the commoncrawl dump `CC-MAIN-2023-06`. It is recommended to use 5 | absolute paths for arguments wherever possible, as outlined in the example. 6 | 7 | ## Preparing WAT segment partitions for input to nodes 8 | 9 | Extraction of relevant URLs is based on metadata supplied by the WAT files of a given commoncrawl dump. 10 | In order to prepare for downloading URLs, we must therefore first split the responsibility of 11 | downloading dumps across slurm nodes. Each slurm node will in turn assign individual 12 | WAT files to be downloaded and processed by workers. 13 | 14 | In order to prepare these files, run 15 | 16 | ```shell 17 | python cc_parse_partition_listings.py --crawl CC-MAIN-2023-06 --partition-size 13 --num_nodes 180 18 | ``` 19 | 20 | The `crawl` argument specifies which cc dump to process. 21 | 22 | The `partition_size` argument influences the internal task distribution of one slurm node; ideally, it should be set to 23 | the number of cores on each node minus 3; if running locally, it should be set to the number of cores your CPU has. 24 | 25 | The `num_nodes` argument must be the same as the number of slurm nodes you intend to run the download job on. If running 26 | via the `cc-parse-launch.sbatch` 27 | script, it should be the same as the sbatch `array` pragma: e.g. with `array=1-180`, this argument should be set to `180`. 28 | If running locally, this argument should be set to `1`. 29 | 30 | Running this script will output the listings directory to which the results have been written; take note of this, as it 31 | will be needed in the next step: 32 | 33 | ```shell 34 | [2023-05-25 20:04:38] Downloading Common Crawl paths listings 35 | * crawl: CC-MAIN-2023-06 36 | * data-type: wat 37 | * partition-size: 13 38 | * listings dir: ./data/crawl-data/CC-MAIN-2023-06/listings 39 | ``` 40 | 41 | ## Running URL download process 42 | 43 | Now that the distribution of WAT files to slurm nodes and their respective worker processes has been set, we can run 44 | the download process. 45 | To run using sbatch, you can use the included script: 46 | 47 | ```shell 48 | sbatch ./scripts/cc-parse-launch.sbatch "./data/crawl-data/CC-MAIN-2023-06/listings" "CC-MAIN-2023-06" 49 | ``` 50 | 51 | The first argument must be the listings directory from the output in the last step, and the second the name of the cc 52 | dump (same as above). 53 | 54 | In order to run locally, you can use: 55 | 56 | ```shell 57 | python cc_parse_snapshot.py \ 58 | --input "./data/crawl-data/CC-MAIN-2023-06/listings/1" \ 59 | --cc_dump "CC-MAIN-2023-06" 60 | ``` 61 | 62 | Note that a `/1` must be added to the end of the listings directory in the local case, as your local machine will be 63 | operating analogously to a single slurm node. 64 | 65 | These processes will then begin outputting raw URL data to the `cc_urls` data folder. 66 | 67 | ## Cleanup, merge and recovery 68 | 69 | After the raw URL download job completes, the produced URLs must be cleaned and merged into a single parquet file in 70 | the `clean_urls` folder for the next steps of the pipeline. 
71 | 72 | To do this, you can run: 73 | 74 | ```shell 75 | python cc_parse_merge_and_recover_urls.py \ 76 | --input ./data/cc_urls/CC-MAIN-2023-06 \ 77 | --listings_dir ./data/crawl-data/CC-MAIN-2023-06/listings \ 78 | --cc_dump CC-MAIN-2023-06 \ 79 | --dedupe 1 80 | ``` 81 | 82 | The `input` argument must be the `cc_urls` directory being cleaned. 83 | 84 | The `listings_dir` and `cc_dump` arguments are the same as above. 85 | 86 | If the `dedupe` flag is set, the resulting parquet file will be globally deduplicated against all already processed 87 | dumps inside the `clean_urls` folder; it is recommended to set this flag if you intend to process multiple dumps. 88 | 89 | After completing these steps, you should have one parquet file with a list of cleaned 90 | URLS: `./data/clean_urls/CC-MAIN-2023-06.parquet`. 91 | 92 | Note that, due to contention on commoncrawl resources, it is possible that some WATs were not able to be processed. 93 | These will be written to `./data/clean_urls/CC-MAIN-2023-06_recovery_segments.txt`, and a report will be output by the 94 | script on how many (if any) segments were missed. Optionally, you may 95 | re-run the download job at a later time, using only these segments as input. 96 | 97 | After completing the above steps, you should be ready to move on to the download phase of the pipeline. 98 | -------------------------------------------------------------------------------- /app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=amd64 ubuntu:22.04 2 | 3 | WORKDIR /usr/app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | python3.11 \ 7 | python3-pip \ 8 | wget \ 9 | curl \ 10 | default-jre \ 11 | libcairo2-dev 12 | 13 | # copy requirements.txt to the working directory 14 | COPY requirements.txt requirements.txt 15 | 16 | # install python dependencies 17 | RUN pip3 install --no-cache-dir --upgrade pip 18 | RUN pip3 install --no-cache-dir -r requirements.txt 19 | RUN pip3 install --no-cache-dir gdown 20 | 21 | # install libreoffice 22 | RUN wget https://downloadarchive.documentfoundation.org/libreoffice/old/7.4.6.2/deb/x86_64/LibreOffice_7.4.6.2_Linux_x86-64_deb.tar.gz 23 | RUN tar -xzvf LibreOffice_7.4.6.2_Linux_x86-64_deb.tar.gz && cd LibreOffice_7.4.6.2_Linux_x86-64_deb/DEBS && dpkg -i *.deb 24 | RUN rm -rf LibreOffice_7.4.6.2_Linux_x86-64_deb.tar.gz LibreOffice_7.4.6.2_Linux_x86-64_deb 25 | 26 | # add to path 27 | ENV PATH="${PATH}:/opt/libreoffice7.4/program/" 28 | 29 | RUN wget https://bootstrap.pypa.io/get-pip.py 30 | RUN /opt/libreoffice7.4/program/python get-pip.py 31 | RUN /opt/libreoffice7.4/program/python -m pip install --no-cache-dir unoserver==1.6 32 | 33 | # fix shebangs 34 | RUN sed -i '1s/python\.bin/python/' "/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoserver" 35 | RUN sed -i '1s/python\.bin/python/' "/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoconvert" 36 | ENV PATH="/opt/libreoffice7.4/program/python-core-3.8.16/bin:${PATH}" 37 | 38 | COPY resources resources 39 | RUN curl https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -o resources/fasttext-models/lid.176.ftz 40 | 41 | RUN apt-get install -y poppler-utils 42 | 43 | COPY . . 44 | 45 | ENTRYPOINT ["/bin/bash", "/usr/app/scripts/run_single_node.sh"] -------------------------------------------------------------------------------- /app/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 
2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # set to 'true' to search source files recursively 55 | # in each "version_locations" directory 56 | # new in Alembic version 1.10 57 | # recursive_version_locations = false 58 | 59 | # the output encoding used when revision files 60 | # are written from script.py.mako 61 | # output_encoding = utf-8 62 | 63 | # TODO: DO NOT DO THIS IN PRODUCTION; NEED A SECRET MANAGER! 64 | # this simply serves as an example for how you might set up your own test environment. 65 | sqlalchemy.url = postgresql://localtest:localtestpass@localhost:5432/docparser-dataset-testing 66 | 67 | 68 | [post_write_hooks] 69 | # post_write_hooks defines scripts or Python functions that are run 70 | # on newly generated revision scripts. 
See the documentation for further 71 | # detail and examples 72 | 73 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 74 | # hooks = black 75 | # black.type = console_scripts 76 | # black.entrypoint = black 77 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 78 | 79 | # Logging configuration 80 | [loggers] 81 | keys = root,sqlalchemy,alembic 82 | 83 | [handlers] 84 | keys = console 85 | 86 | [formatters] 87 | keys = generic 88 | 89 | [logger_root] 90 | level = WARN 91 | handlers = console 92 | qualname = 93 | 94 | [logger_sqlalchemy] 95 | level = WARN 96 | handlers = 97 | qualname = sqlalchemy.engine 98 | 99 | [logger_alembic] 100 | level = INFO 101 | handlers = 102 | qualname = alembic 103 | 104 | [handler_console] 105 | class = StreamHandler 106 | args = (sys.stderr,) 107 | level = NOTSET 108 | formatter = generic 109 | 110 | [formatter_generic] 111 | format = %(levelname)-5.5s [%(name)s] %(message)s 112 | datefmt = %H:%M:%S 113 | -------------------------------------------------------------------------------- /app/alembic/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /app/alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from sqlalchemy import engine_from_config 4 | from sqlalchemy import pool 5 | 6 | from alembic import context 7 | 8 | from orm import models 9 | 10 | # this is the Alembic Config object, which provides 11 | # access to the values within the .ini file in use. 12 | config = context.config 13 | 14 | # Interpret the config file for Python logging. 15 | # This line sets up loggers basically. 16 | if config.config_file_name is not None: 17 | fileConfig(config.config_file_name) 18 | 19 | # add your model's MetaData object here 20 | # for 'autogenerate' support 21 | target_metadata = models.Base.metadata 22 | 23 | # other values from the config, defined by the needs of env.py, 24 | # can be acquired: 25 | # my_important_option = config.get_main_option("my_important_option") 26 | # ... etc. 27 | 28 | 29 | def run_migrations_offline() -> None: 30 | """Run migrations in 'offline' mode. 31 | 32 | This configures the context with just a URL 33 | and not an Engine, though an Engine is acceptable 34 | here as well. By skipping the Engine creation 35 | we don't even need a DBAPI to be available. 36 | 37 | Calls to context.execute() here emit the given string to the 38 | script output. 39 | 40 | """ 41 | url = config.get_main_option("sqlalchemy.url") 42 | context.configure( 43 | url=url, 44 | target_metadata=target_metadata, 45 | literal_binds=True, 46 | dialect_opts={"paramstyle": "named"}, 47 | ) 48 | 49 | with context.begin_transaction(): 50 | context.run_migrations() 51 | 52 | 53 | def run_migrations_online() -> None: 54 | """Run migrations in 'online' mode. 55 | 56 | In this scenario we need to create an Engine 57 | and associate a connection with the context. 
58 | 59 | """ 60 | connectable = engine_from_config( 61 | config.get_section(config.config_ini_section, {}), 62 | prefix="sqlalchemy.", 63 | poolclass=pool.NullPool, 64 | ) 65 | 66 | with connectable.connect() as connection: 67 | context.configure( 68 | connection=connection, target_metadata=target_metadata 69 | ) 70 | 71 | with context.begin_transaction(): 72 | context.run_migrations() 73 | 74 | 75 | if context.is_offline_mode(): 76 | run_migrations_offline() 77 | else: 78 | run_migrations_online() 79 | -------------------------------------------------------------------------------- /app/alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /app/alembic/versions/17cdff8379cb_initial_sources_record.py: -------------------------------------------------------------------------------- 1 | """Initial sources_record 2 | 3 | Revision ID: 17cdff8379cb 4 | Revises: 5 | Create Date: 2023-03-15 01:52:21.177793 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '17cdff8379cb' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('sources_record', 22 | sa.Column('url', sa.String(length=10000), nullable=False), 23 | sa.Column('url_hash', sa.String(length=1000), nullable=False), 24 | sa.Column('crawl_id', sa.String(length=1000), nullable=False), 25 | sa.Column('shard_id', sa.String(length=1000), nullable=False), 26 | sa.Column('filename', sa.String(length=10000), nullable=False), 27 | sa.Column('status_code', sa.String(length=200), nullable=False), 28 | sa.Column('content_type', sa.String(length=1000), nullable=True), 29 | sa.Column('content_length', sa.String(length=1000), nullable=True), 30 | sa.Column('content_encoding', sa.String(length=1000), nullable=False), 31 | sa.Column('content_language', sa.ARRAY(sa.String()), nullable=True), 32 | sa.Column('last_modified', sa.DateTime(), nullable=True), 33 | sa.Column('source_filename', sa.String(length=10000), nullable=True), 34 | sa.Column('ip_address', sa.String(length=15), nullable=True), 35 | sa.Column('olet_ftype', sa.String(length=200), nullable=True), 36 | sa.Column('olet_container', sa.String(length=200), nullable=True), 37 | sa.Column('olet_appname', sa.String(length=200), nullable=True), 38 | sa.Column('olet_codepage', sa.String(length=200), nullable=True), 39 | sa.Column('olet_author', sa.String(length=400), nullable=True), 40 | sa.Column('olet_encrypted', sa.String(length=200), nullable=True), 41 | sa.Column('olet_vba', sa.String(length=400), nullable=True), 42 | sa.Column('olet_xlm', sa.String(length=400), nullable=True), 43 | sa.Column('olet_ext_rels', sa.String(length=200), nullable=True), 44 | sa.Column('olet_ObjectPool', sa.String(length=200), nullable=True), 45 | sa.Column('olet_flash', sa.String(length=200), nullable=True), 46 | sa.Column('olet_python_codec', sa.String(length=200), nullable=True), 47 | sa.Column('olet_pass', sa.String(length=200), nullable=True), 48 | sa.Column('timestamp', sa.DateTime(), nullable=False), 49 | sa.Column('exception', sa.String(length=1000), nullable=True), 50 | sa.PrimaryKeyConstraint('url_hash') 51 | ) 52 | # ### end Alembic commands ### 53 | 54 | 55 | def downgrade() -> None: 56 | # ### commands auto generated by Alembic - please adjust! ### 57 | op.drop_table('sources_record') 58 | # ### end Alembic commands ### 59 | -------------------------------------------------------------------------------- /app/alembic/versions/d1fbae82c5fa_initial_metadata_model.py: -------------------------------------------------------------------------------- 1 | """Initial metadata model 2 | 3 | Revision ID: d1fbae82c5fa 4 | Revises: 17cdff8379cb 5 | Create Date: 2023-04-11 06:11:03.232383 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'd1fbae82c5fa' 14 | down_revision = '17cdff8379cb' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('metadata_record', 22 | sa.Column('doc_id', sa.String(length=10000), nullable=False), 23 | sa.Column('url', sa.String(length=10000), nullable=False), 24 | sa.Column('url_hash', sa.String(length=1000), nullable=False), 25 | sa.Column('crawl_id', sa.String(length=1000), nullable=False), 26 | sa.Column('shard_id', sa.String(length=1000), nullable=False), 27 | sa.Column('filename', sa.String(length=10000), nullable=False), 28 | sa.Column('geo_location', sa.ARRAY(sa.String()), nullable=True), 29 | sa.Column('languages_fasttest', sa.ARRAY(sa.String()), nullable=True), 30 | sa.Column('languages_autocorrect', sa.ARRAY(sa.String()), nullable=True), 31 | sa.Column('doc_type', sa.String(length=1000), nullable=True), 32 | sa.Column('industry', sa.String(length=1000), nullable=True), 33 | sa.Column('word_count', sa.Integer(), nullable=False), 34 | sa.Column('num_figures', sa.Integer(), nullable=False), 35 | sa.Column('num_tables', sa.Integer(), nullable=False), 36 | sa.Column('num_table_cells', sa.Integer(), nullable=False), 37 | sa.Column('num_quotes', sa.Integer(), nullable=False), 38 | sa.Column('num_equations', sa.Integer(), nullable=False), 39 | sa.Column('num_sections_1', sa.Integer(), nullable=False), 40 | sa.Column('num_sections_2', sa.Integer(), nullable=False), 41 | sa.Column('num_sections_3', sa.Integer(), nullable=False), 42 | sa.Column('num_sections_4', sa.Integer(), nullable=False), 43 | sa.Column('num_sections_5', sa.Integer(), nullable=False), 44 | sa.Column('num_sections_6', sa.Integer(), nullable=False), 45 | sa.Column('num_sections_7', sa.Integer(), nullable=False), 46 | sa.Column('num_sections_8', sa.Integer(), nullable=False), 47 | sa.Column('num_sections_9', sa.Integer(), nullable=False), 48 | sa.Column('annotation_sources', sa.JSON(), nullable=False), 49 | sa.Column('form_hover_tags', sa.JSON(), nullable=True), 50 | sa.Column('template_name', sa.String(length=1000), nullable=True), 51 | sa.Column('creator_username', sa.String(length=1000), nullable=True), 52 | sa.PrimaryKeyConstraint('doc_id') 53 | ) 54 | # ### end Alembic commands ### 55 | 56 | 57 | def downgrade() -> None: 58 | # ### commands auto generated by Alembic - please adjust! ### 59 | op.drop_table('metadata_record') 60 | # ### end Alembic commands ### 61 | -------------------------------------------------------------------------------- /app/alembic/versions/eb22a058c4c9_less_mandatory_fields_for_doc_sources.py: -------------------------------------------------------------------------------- 1 | """Less mandatory fields for doc_sources 2 | 3 | Revision ID: eb22a058c4c9 4 | Revises: 2de829bd1ca3 5 | Create Date: 2023-05-29 21:25:05.888669 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'eb22a058c4c9' 14 | down_revision = '2de829bd1ca3' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.alter_column('sources_record', 'filename', 22 | existing_type=sa.VARCHAR(length=10000), 23 | nullable=True) 24 | op.alter_column('sources_record', 'bytehash', 25 | existing_type=sa.VARCHAR(length=10000), 26 | nullable=True) 27 | op.alter_column('sources_record', 'status_code', 28 | existing_type=sa.VARCHAR(length=200), 29 | nullable=True) 30 | op.alter_column('sources_record', 'content_encoding', 31 | existing_type=sa.VARCHAR(length=1000), 32 | nullable=True) 33 | # ### end Alembic commands ### 34 | 35 | 36 | def downgrade() -> None: 37 | # ### commands auto generated by Alembic - please adjust! ### 38 | op.alter_column('sources_record', 'content_encoding', 39 | existing_type=sa.VARCHAR(length=1000), 40 | nullable=False) 41 | op.alter_column('sources_record', 'status_code', 42 | existing_type=sa.VARCHAR(length=200), 43 | nullable=False) 44 | op.alter_column('sources_record', 'bytehash', 45 | existing_type=sa.VARCHAR(length=10000), 46 | nullable=False) 47 | op.alter_column('sources_record', 'filename', 48 | existing_type=sa.VARCHAR(length=10000), 49 | nullable=False) 50 | # ### end Alembic commands ### 51 | -------------------------------------------------------------------------------- /app/cc_parse_merge_and_recover_urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | Util used after downloading one URL-Batch from CC 3 | - Merges parquet files into one input file for DL script 4 | - Recovers any missed WAT segments into one TXT file 5 | - Deduplicates URLs within one CC Dump 6 | """ 7 | 8 | import argparse 9 | import pandas as pd 10 | from pathlib import Path 11 | import pyarrow as pa 12 | import pyarrow.parquet as pq 13 | import time 14 | 15 | import settings 16 | from src.cc_processing.preprocess_cc_urls import process_urls 17 | 18 | BASE_URL = "https://data.commoncrawl.org/" 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | "--input", "-i", default=None, type=str, 25 | help="directory containing URL parquets to merge and deduplicate" 26 | ) 27 | parser.add_argument( 28 | "--cc_dump", "-cc", default=None, type=str, 29 | help="cc dump being processed" 30 | ) 31 | parser.add_argument( 32 | "--listings_dir", "-ld", type=str, default=None, 33 | help="listings dir to compare against", 34 | ) 35 | parser.add_argument( 36 | "--dedupe", "-dd", type=bool, default=False, 37 | help="set to true in order to deduplicate current dump compared to " 38 | "already processed dumps" 39 | ) 40 | args = parser.parse_args() 41 | 42 | # end-results will be written to here 43 | write_dir = settings.filesystem.CLEAN_URLS_DIR 44 | 45 | if not Path(write_dir).exists(): 46 | Path(write_dir).mkdir(parents=True) 47 | 48 | pdir = Path(args.input) 49 | pqfiles = [i for i in pdir.glob('*.parquet')] 50 | with pq.ParquetWriter(str(pdir / (args.cc_dump + "_merged_raw.parquet")), 51 | schema=pa.schema([('url', pa.string())])) as writer: 52 | for item in pqfiles: 53 | pqtab = pq.read_table(item) 54 | # some parquets may be empty (no docx urls in segment) 55 | if pqtab.schema.equals(writer.schema): 56 | writer.write_table(pq.read_table(item)) 57 | 58 | time.sleep(5) 59 | 60 | # deduplicate parquet 61 | df = pd.read_parquet(str(pdir / (args.cc_dump + "_merged_raw.parquet"))) 62 | num_undupe_rows = len(df) 63 | df = df.drop_duplicates() 64 | num_rows = len(df) 65 | df.to_parquet(str(pdir / (args.cc_dump + "_merged.parquet"))) 66 | 67 | print("total unique URLs: " + str(num_rows) + " removed " + str( 68 | num_undupe_rows 
- num_rows) + " duplicates") 69 | 70 | # check if any segments need to be recovered 71 | lstdir = Path(args.listings_dir) 72 | lstfiles = [i for i in lstdir.glob('**/*.txt')] 73 | needed_segments = [] 74 | for item in lstfiles: 75 | with open(item) as file: 76 | for line in file: 77 | needed_segments.append(line.strip()) 78 | 79 | logfiles = [i for i in pdir.glob('worker_log_*')] 80 | gotten_segments = [] 81 | for item in logfiles: 82 | with open(item) as file: 83 | last_seen_seg = '' 84 | for line in file: 85 | if 'Fetching ' in line: 86 | last_seen_seg = line.split('Fetching ')[-1].strip() 87 | if 'Success! got URL list' in line: 88 | gotten_segments.append(last_seen_seg) 89 | 90 | missed_segments = [x for x in needed_segments if 91 | ((BASE_URL + x) not in gotten_segments)] 92 | 93 | # write the segments to recover to a txt file 94 | with open(str(write_dir / (args.cc_dump + "_recovery_segments.txt")), 95 | 'w') as file: 96 | for item in missed_segments: 97 | file.write(item + '\n') 98 | 99 | print("sucessfully parsed " + str( 100 | len(gotten_segments)) + " segments, missed " + str( 101 | len(missed_segments))) 102 | 103 | # do remaining processing and cleaning of urls 104 | process_urls(input=str(pdir / (args.cc_dump + "_merged.parquet")), 105 | output=str(write_dir / (args.cc_dump + ".parquet")), 106 | dedupe=args.dedupe) 107 | 108 | 109 | if __name__ == '__main__': 110 | main() 111 | -------------------------------------------------------------------------------- /app/cc_parse_partition_listings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import urllib.request 4 | import io 5 | import gzip 6 | from datetime import datetime 7 | from pathlib import Path 8 | import settings 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--crawl", default=None, type=str, 12 | help="Common Crawl crawl") 13 | parser.add_argument("--partition-size", default=13, type=int, 14 | help="Partition size") 15 | parser.add_argument("--num_nodes", default=180, type=int, 16 | help="number of nodes") 17 | args = parser.parse_args() 18 | 19 | DATA_TYPE = "wat" 20 | BASE_URL = "https://data.commoncrawl.org" 21 | 22 | LISTINGS_DIR = settings.filesystem.CC_SEGMENT_DIR / (args.crawl + "/listings") 23 | 24 | 25 | def get_timestamp(): 26 | return datetime.now().strftime("[%Y-%m-%d %H:%M:%S]") 27 | 28 | 29 | def get_idx(idx: int, n_digits: int = 9): 30 | return "0" * (n_digits - len(str(idx))) + str(idx) 31 | 32 | 33 | def main(): 34 | # commoncrawl params 35 | crawl = args.crawl 36 | partition_size = args.partition_size 37 | 38 | # directory structure 39 | listings_dir = LISTINGS_DIR 40 | 41 | print( 42 | "{} Downloading Common Crawl paths listings" 43 | "\n\t* crawl: {}" 44 | "\n\t* data-type: {}" 45 | "\n\t* partition-size: {}" 46 | "\n\t* listings dir: {}".format( 47 | get_timestamp(), crawl, DATA_TYPE, partition_size, listings_dir 48 | ) 49 | ) 50 | 51 | listings_url = os.path.join(BASE_URL, 52 | f"crawl-data/{crawl}/{DATA_TYPE}.paths.gz") 53 | 54 | # create dir to save partitioned listings 55 | if not os.path.exists(listings_dir): 56 | os.makedirs(listings_dir) 57 | 58 | # download listings 59 | response = urllib.request.urlopen(listings_url) 60 | compressed_file = io.BytesIO(response.read()) 61 | decompressed_file = gzip.GzipFile(fileobj=compressed_file) 62 | listings = decompressed_file.read().decode("utf-8").splitlines() 63 | 64 | # partition listings and save as txt files 65 | idx = 0 66 | for i in range(0, 
len(listings), int(partition_size)): 67 | save_as = os.path.join( 68 | listings_dir, f"wat.paths.part_{get_idx(idx, n_digits=4)}.txt" 69 | ) 70 | 71 | with open(save_as, "w") as f: 72 | f.write("\n".join(listings[i: i + int(partition_size)])) 73 | 74 | idx += 1 75 | 76 | # distribute accross node folders 77 | for i in range(1, args.num_nodes + 1): 78 | subdir = os.path.join(listings_dir, str(i)) 79 | os.mkdir(subdir) 80 | 81 | files = list(Path(listings_dir).glob('*.txt')) 82 | 83 | curr_subdir = 1 84 | for f in files: 85 | f.rename(Path(listings_dir) / str(curr_subdir) / f.name) 86 | curr_subdir = (curr_subdir % args.num_nodes) + 1 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /app/cc_parse_snapshot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import joblib 3 | import multiprocessing as mp 4 | import os 5 | from pathlib import Path 6 | 7 | from src.cc_processing.cc_url_process import CCURLProcess 8 | import settings 9 | 10 | BASE_URL = "https://data.commoncrawl.org/" 11 | 12 | 13 | class URIBatchProvider(mp.Process): 14 | def __init__(self, inputs_queue: mp.Queue, listings_dir: str, parts: int, 15 | num_workers: int): 16 | """ 17 | Provides warc URIs to cc_url processes input queue. 18 | 19 | @param inputs_queue: Queue to write uris to 20 | @param listings_dir: Directory containing wat file part listings to be 21 | distributed to worker processes 22 | @param parts: Number of parts to proccess 23 | @param num_workers: Number of workers to supply with uris 24 | """ 25 | 26 | super(URIBatchProvider, self).__init__() 27 | self.inputs_queue = inputs_queue 28 | self.listings_dir = listings_dir 29 | self.parts = parts 30 | self.num_workers = num_workers 31 | 32 | def run(self): 33 | """ 34 | Core process loop required by python multiprocessing 35 | """ 36 | 37 | parts_dir = Path(self.listings_dir) 38 | 39 | for parts_file in parts_dir.glob('*.txt'): 40 | with parts_file.open() as file: 41 | contents = file.read() 42 | processed_contents = list( 43 | map(lambda x: BASE_URL + x, contents.split('\n'))) 44 | # split individually over all cores --> better DL parallelism 45 | for s in processed_contents: 46 | self.inputs_queue.put([s]) 47 | 48 | for _ in range(self.num_workers): 49 | self.inputs_queue.put(None) 50 | 51 | 52 | def get_args() -> argparse.Namespace: 53 | arg_parser = argparse.ArgumentParser() 54 | arg_parser.add_argument("--input", "-i", 55 | help="path to folder containing URL listing parts", 56 | type=str, default=None) 57 | arg_parser.add_argument("--parts", "-p", 58 | help="number of parts files to process, -1 for all", 59 | type=str, default=-1) 60 | arg_parser.add_argument("--cc_dump", "-cc", help="cc dump being processed", 61 | type=str, default="CC-MAIN-2023-06") 62 | args = arg_parser.parse_args() 63 | return args 64 | 65 | 66 | def main(): 67 | args = get_args() 68 | 69 | num_cpus = int(os.environ.get("SLURM_CPUS_PER_TASK", joblib.cpu_count())) 70 | 71 | # make cc dir if not yet existant 72 | if not Path.exists(settings.filesystem.CC_DIR): 73 | Path.mkdir(settings.filesystem.CC_DIR) 74 | if not Path.exists(settings.filesystem.CC_DIR / args.cc_dump): 75 | Path.mkdir(settings.filesystem.CC_DIR / args.cc_dump) 76 | 77 | num_worker_processes = num_cpus - 2 78 | 79 | queue_buffer_size = 4 * num_worker_processes 80 | inputs_queue = mp.Queue(maxsize=queue_buffer_size) 81 | 82 | cc_processes = [] 83 | for i in 
range(num_worker_processes): 84 | cc_process = CCURLProcess(inputs_queue, BASE_URL, args.cc_dump) 85 | cc_process.start() 86 | print("started cc_url parser") 87 | cc_processes.append(cc_process) 88 | 89 | # provide URI batches 90 | provider_process = URIBatchProvider(inputs_queue, args.input, 10, 1) 91 | provider_process.start() 92 | provider_process.join() 93 | 94 | # wait for workers to finish 95 | for cc in cc_processes: 96 | cc.join() 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | -------------------------------------------------------------------------------- /app/configs/default_config.yaml: -------------------------------------------------------------------------------- 1 | IMAGES: 2 | IMAGE_FORMAT: jpg 3 | IMAGE_HEIGHT: null 4 | IMAGE_WIDTH: null 5 | IMAGE_DPI: 90 6 | 7 | DECOMPRESS_BOMB_CHECKS: 8 | MAX_DECOMPRESS_RATIO: 20 9 | MAX_IMAGE_PIXELS: 22369621 10 | 11 | DOCUMENTS: 12 | MAX_DOC_BYTES: 10485760 13 | MAX_DOC_PAGES: 150 14 | 15 | TIME_LIMITS: 16 | ANNOTATION_TIMEOUT_SECS: 180 17 | ANNOTATION_CLEANUP_SECS: 180 18 | 19 | DATA_ORG: 20 | MAX_BYTES_IN_SHARD: 5368709120 21 | 22 | LANGUAGES: 23 | TOP_K_LANGUAGES: 5 24 | 25 | LIBREOFFICE: 26 | UNOSERVER_START_TIMEOUT: 60 27 | UNOCONVERT_TIMEOUT: 60 28 | SOFFICE_LAUNCH_TIMEOUT: 120 29 | SOFFICE_LAUNCH_PING_INTERVAL: 0.1 30 | 31 | ENTITY_DEFINITION: 32 | MAX_HEADING_LEN: 150 33 | FORM_FIELD_MIN_LENGTH: 4 34 | 35 | ENTITY_RELATIONS: 36 | BBOX_RELATION_OVERLAP_THRESHOLD: 0.45 37 | BBOX_RELATION_SCALE_THRESHOLD: 1.2 38 | BBOX_RELATION_CLOSENESS_THRESHOLD: 10 39 | WORD_2_ENTITY_OVERLAP_THRESHOLD: 0.8 40 | 41 | ANNOTATION_FILTER: 42 | MIN_TEXT_CHARS: 200 43 | 44 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/1header.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global" 11 | }, 12 | "mapping": { 13 | "2": "1", 14 | "3": "1", 15 | "4": "1", 16 | "5": "1", 17 | "6": "1", 18 | "7": "1", 19 | "8": "1", 20 | "9": "1" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_global" 36 | }, 37 | "mapping": { 38 | "2": "1", 39 | "3": "1", 40 | "4": "1", 41 | "5": "1", 42 | "6": "1", 43 | "7": "1", 44 | "8": "1", 45 | "9": "1" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/3headers.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "5": "heading_789" 13 | }, 14 | "mapping": 
{ 15 | "2": "1", 16 | "3": "1", 17 | "5": "4", 18 | "6": "4", 19 | "8": "7", 20 | "9": "7" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_123", 36 | "4": "heading_456", 37 | "5": "heading_789" 38 | }, 39 | "mapping": { 40 | "2": "1", 41 | "3": "1", 42 | "5": "4", 43 | "6": "4", 44 | "8": "7", 45 | "9": "7" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": -1, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": -1, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/baseline_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": 0.75, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": 0.75, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/1header_balanced_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global", 11 | "10": "text_merged", 12 | "11": "list_merged" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "4": "1", 18 | "5": "1", 19 | "6": "1", 20 | "7": "1", 21 | "8": "1", 22 | "9": "1", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2000, 30 | "1": 5000, 31 | "10": 5000, 32 | "11": 5000, 33 | "12": 5000, 34 | "13": 5000, 35 | "14": 2000, 36 | "15": 2000, 37 | "16": 15000, 
38 | "17": 20000, 39 | "21": 2000, 40 | "22": 5000, 41 | "23": 5000, 42 | "26": 5000, 43 | "28": 10000, 44 | "29": 10000 45 | }, 46 | "scanify": false, 47 | "quality_threshold": 0.7, 48 | "language_codes": ["en"], 49 | "language_code_threshold": 0.75 50 | }, 51 | "val_settings": { 52 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 53 | "is_validation": true, 54 | "max_img": 40000, 55 | "elem_drops": [24, 25, 27, 30], 56 | "elem_mergings": { 57 | "masters": { 58 | "1": "heading_global", 59 | "10": "text_merged", 60 | "11": "list_merged" 61 | }, 62 | "mapping": { 63 | "2": "1", 64 | "3": "1", 65 | "4": "1", 66 | "5": "1", 67 | "6": "1", 68 | "7": "1", 69 | "8": "1", 70 | "9": "1", 71 | "18": "11", 72 | "19": "11", 73 | "20": "10" 74 | } 75 | }, 76 | "elem_mins": { 77 | "0": 300, 78 | "1": 1000, 79 | "10": 1000, 80 | "11": 1000, 81 | "12": 1000, 82 | "13": 1000, 83 | "14": 300, 84 | "15": 300, 85 | "16": 3000, 86 | "17": 4000, 87 | "21": 300, 88 | "22": 1000, 89 | "23": 1000, 90 | "26": 1000, 91 | "28": 2000, 92 | "29": 2000 93 | }, 94 | "scanify": false, 95 | "quality_threshold": 0.7, 96 | "language_codes": ["en"], 97 | "language_code_threshold": 0.75 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/1header_balanced_quality_multilang.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality_multilang", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [14, 28, 29, 24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_title_global", 11 | "10": "text_merged", 12 | "11": "list_merged", 13 | "17": "table_cell_merged" 14 | }, 15 | "mapping": { 16 | "0": "1", 17 | "2": "1", 18 | "3": "1", 19 | "4": "1", 20 | "5": "1", 21 | "6": "1", 22 | "7": "1", 23 | "8": "1", 24 | "9": "1", 25 | "15": "17", 26 | "18": "11", 27 | "19": "11", 28 | "20": "10", 29 | "23": "10" 30 | } 31 | }, 32 | "elem_mins": { 33 | "1": 20000, 34 | "10": 20000, 35 | "11": 20000, 36 | "12": 20000, 37 | "13": 20000, 38 | "16": 20000, 39 | "17": 20000, 40 | "21": 20000, 41 | "22": 20000, 42 | "26": 20000 43 | }, 44 | "scanify": false, 45 | "quality_threshold": 0.7, 46 | "language_codes": ["es", "fr", "it", "de", "pt", "en"], 47 | "language_code_threshold": 0.75 48 | }, 49 | "val_settings": { 50 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 51 | "is_validation": true, 52 | "max_img": 40000, 53 | "elem_drops": [14, 28, 29, 24, 25, 27, 30], 54 | "elem_mergings": { 55 | "masters": { 56 | "1": "heading_title_global", 57 | "10": "text_merged", 58 | "11": "list_merged", 59 | "17": "table_cell_merged" 60 | }, 61 | "mapping": { 62 | "0": "1", 63 | "2": "1", 64 | "3": "1", 65 | "4": "1", 66 | "5": "1", 67 | "6": "1", 68 | "7": "1", 69 | "8": "1", 70 | "9": "1", 71 | "15": "17", 72 | "18": "11", 73 | "19": "11", 74 | "20": "10", 75 | "23": "10" 76 | } 77 | }, 78 | "elem_mins": { 79 | "1": 4000, 80 | "10": 4000, 81 | "11": 4000, 82 | "12": 4000, 83 | "13": 4000, 84 | "16": 4000, 85 | "17": 4000, 86 | "21": 4000, 87 | "22": 4000, 88 | "26": 4000 89 | }, 90 | "scanify": false, 91 | "quality_threshold": 0.7, 92 | "language_codes": ["es", "fr", "it", "de", "pt", "en"], 93 | "language_code_threshold": 0.75 94 | } 95 | } 96 | -------------------------------------------------------------------------------- 
/app/configs/extensions/obj_detection/ws_yolo/local/1header_balanced_quality_report.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_counts": { 3 | "title": 14372, 4 | "heading_global": 275224, 5 | "text_merged": 698583, 6 | "list_merged": 907053, 7 | "header": 59964, 8 | "footer": 66648, 9 | "table_header": 363, 10 | "table_header_cell": 1529, 11 | "table": 7415, 12 | "table_cell": 36691, 13 | "equation": 1884, 14 | "figure": 83010, 15 | "table_caption": 1170, 16 | "form_field": 55934, 17 | "table_row": 16061, 18 | "table_column": 12296, 19 | "empty_labels": 7 20 | }, 21 | "val_counts": { 22 | "title": 2929, 23 | "heading_global": 55435, 24 | "text_merged": 145300, 25 | "list_merged": 174718, 26 | "header": 13161, 27 | "footer": 13625, 28 | "table_header": 139, 29 | "table_header_cell": 600, 30 | "table": 2554, 31 | "table_cell": 13031, 32 | "equation": 655, 33 | "figure": 15416, 34 | "table_caption": 338, 35 | "form_field": 10153, 36 | "table_row": 5521, 37 | "table_column": 4433, 38 | "empty_labels": 2 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/3headers_balanced_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers_balanced_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [24, 25, 27], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "7": "heading_789", 13 | "10": "text_merged", 14 | "11": "list_merged" 15 | }, 16 | "mapping": { 17 | "2": "1", 18 | "3": "1", 19 | "5": "4", 20 | "6": "4", 21 | "8": "7", 22 | "9": "7", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2000, 30 | "1": 5000, 31 | "4": 5000, 32 | "7": 5000, 33 | "10": 5000, 34 | "11": 5000, 35 | "12": 5000, 36 | "13": 5000, 37 | "14": 2000, 38 | "15": 2000, 39 | "16": 5000, 40 | "17": 5000, 41 | "21": 2000, 42 | "22": 5000, 43 | "23": 5000, 44 | "26": 5000, 45 | "28": 5000, 46 | "29": 5000, 47 | "30": 5000 48 | }, 49 | "scanify": false, 50 | "quality_threshold": 0.7, 51 | "language_codes": ["en"], 52 | "language_code_threshold": 0.75 53 | }, 54 | "val_settings": { 55 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 56 | "is_validation": true, 57 | "max_img": 40000, 58 | "elem_drops": [24, 25, 27], 59 | "elem_mergings": { 60 | "masters": { 61 | "1": "heading_123", 62 | "4": "heading_456", 63 | "7": "heading_789", 64 | "10": "text_merged", 65 | "11": "list_merged" 66 | }, 67 | "mapping": { 68 | "2": "1", 69 | "3": "1", 70 | "5": "4", 71 | "6": "4", 72 | "8": "7", 73 | "9": "7", 74 | "18": "11", 75 | "19": "11", 76 | "20": "10" 77 | } 78 | }, 79 | "elem_mins": { 80 | "0": 300, 81 | "1": 1000, 82 | "4": 1000, 83 | "7": 1000, 84 | "10": 1000, 85 | "11": 1000, 86 | "12": 1000, 87 | "13": 1000, 88 | "14": 300, 89 | "15": 300, 90 | "16": 1000, 91 | "17": 1000, 92 | "21": 300, 93 | "22": 1000, 94 | "23": 1000, 95 | "26": 1000, 96 | "28": 1000, 97 | "29": 1000, 98 | "30": 1000 99 | }, 100 | "scanify": false, 101 | "quality_threshold": 0.7, 102 | "language_codes": ["en"], 103 | "language_code_threshold": 0.75 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/3headers_balanced_quality_report.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_counts": { 3 | "title": 14158, 4 | "heading_123": 222622, 5 | "heading_456": 48883, 6 | "heading_789": 18720, 7 | "text_merged": 711231, 8 | "list_merged": 894044, 9 | "header": 59981, 10 | "footer": 68337, 11 | "table_header": 363, 12 | "table_header_cell": 1529, 13 | "table": 7154, 14 | "table_cell": 36691, 15 | "equation": 1884, 16 | "figure": 82880, 17 | "table_caption": 1170, 18 | "form_field": 55734, 19 | "table_row": 16061, 20 | "table_column": 12296, 21 | "table_header_row": 0 22 | }, 23 | "val_counts": { 24 | "title": 2918, 25 | "heading_123": 45048, 26 | "heading_456": 9700, 27 | "heading_789": 3808, 28 | "text_merged": 148337, 29 | "list_merged": 178763, 30 | "header": 12695, 31 | "footer": 13428, 32 | "table_header": 139, 33 | "table_header_cell": 600, 34 | "table": 1948, 35 | "table_cell": 10588, 36 | "equation": 655, 37 | "figure": 16139, 38 | "table_caption": 338, 39 | "form_field": 10131, 40 | "table_row": 4470, 41 | "table_column": 3558, 42 | "table_header_row": 0 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/tableonly_balanced.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_balanced", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 24, 25, 26, 27, 30 10 | ], 11 | "elem_mins": { 12 | "14": 1000, 13 | "15": 3000, 14 | "16": 20000, 15 | "17": 150000, 16 | "23": 2000, 17 | "28": 30000, 18 | "29": 30000 19 | }, 20 | "elem_mergings": {}, 21 | "scanify": false, 22 | "quality_threshold": -1, 23 | "language_codes": ["en", "de"], 24 | "language_code_threshold": 0.75 25 | }, 26 | "val_settings": { 27 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 28 | "is_validation": true, 29 | "max_img": 40000, 30 | "elem_drops": [ 31 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 32 | 24, 25, 26, 27, 30 33 | ], 34 | "elem_mins": { 35 | "14": 200, 36 | "15": 600, 37 | "16": 4000, 38 | "17": 30000, 39 | "23": 400, 40 | "28": 6000, 41 | "29": 6000 42 | }, 43 | "elem_mergings": {}, 44 | "scanify": false, 45 | "quality_threshold": -1, 46 | "language_codes": ["en", "de"], 47 | "language_code_threshold": 0.75 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/tableonly_balanced_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_balanced_cut", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 9 | 22, 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mins": { 12 | "16": 190000 13 | }, 14 | "elem_mergings": { 15 | "masters": { 16 | "17": "table_cell_merged" 17 | }, 18 | "mapping": { 19 | "15": "17" 20 | } 21 | }, 22 | "scanify": false, 23 | "quality_threshold": -1, 24 | "language_codes": ["en", "de"], 25 | "language_code_threshold": 0.75 26 | }, 27 | "val_settings": { 28 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 29 | "is_validation": true, 30 | "max_img": 40000, 31 | "elem_drops": 
[ 32 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 33 | 22, 23, 24, 25, 26, 27, 28, 29, 30 34 | ], 35 | "elem_mins": { 36 | "16": 38000 37 | }, 38 | "elem_mergings": { 39 | "masters": { 40 | "17": "table_cell_merged" 41 | }, 42 | "mapping": { 43 | "15": "17" 44 | } 45 | }, 46 | "scanify": false, 47 | "quality_threshold": -1, 48 | "language_codes": ["en", "de"], 49 | "language_code_threshold": 0.75 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/1header.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global" 11 | }, 12 | "mapping": { 13 | "2": "1", 14 | "3": "1", 15 | "4": "1", 16 | "5": "1", 17 | "6": "1", 18 | "7": "1", 19 | "8": "1", 20 | "9": "1" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_global" 36 | }, 37 | "mapping": { 38 | "2": "1", 39 | "3": "1", 40 | "4": "1", 41 | "5": "1", 42 | "6": "1", 43 | "7": "1", 44 | "8": "1", 45 | "9": "1" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/1header_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global" 11 | }, 12 | "mapping": { 13 | "2": "1", 14 | "3": "1", 15 | "4": "1", 16 | "5": "1", 17 | "6": "1", 18 | "7": "1", 19 | "8": "1", 20 | "9": "1" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": 0.75, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_global" 36 | }, 37 | "mapping": { 38 | "2": "1", 39 | "3": "1", 40 | "4": "1", 41 | "5": "1", 42 | "6": "1", 43 | "7": "1", 44 | "8": "1", 45 | "9": "1" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": 0.75, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/3headers.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | 
"elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "7": "heading_789" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "5": "4", 18 | "6": "4", 19 | "8": "7", 20 | "9": "7" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_123", 36 | "4": "heading_456", 37 | "7": "heading_789" 38 | }, 39 | "mapping": { 40 | "2": "1", 41 | "3": "1", 42 | "5": "4", 43 | "6": "4", 44 | "8": "7", 45 | "9": "7" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/3headers_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "7": "heading_789" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "5": "4", 18 | "6": "4", 19 | "8": "7", 20 | "9": "7" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": 0.75, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_123", 36 | "4": "heading_456", 37 | "7": "heading_789" 38 | }, 39 | "mapping": { 40 | "2": "1", 41 | "3": "1", 42 | "5": "4", 43 | "6": "4", 44 | "8": "7", 45 | "9": "7" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": 0.75, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": -1, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": -1, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/baseline_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline_quality", 3 | "train_settings": { 4 | "raw_path": 
"/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": 0.75, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": 0.75, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/tableonly.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | "quality_threshold": -1, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": -1, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/tableonly_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | "quality_threshold": 0.75, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": 0.75, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/tableonly.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | 
"quality_threshold": -1, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": -1, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/tableonly_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_quality", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | "quality_threshold": 0.75, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": 0.75, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/pretrain/layoutlm/1header_balanced_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality_layoutlm", 3 | "settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global", 11 | "10": "text_merged", 12 | "11": "list_merged" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "4": "1", 18 | "5": "1", 19 | "6": "1", 20 | "7": "1", 21 | "8": "1", 22 | "9": "1", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2300, 30 | "1": 6000, 31 | "10": 6000, 32 | "11": 6000, 33 | "12": 6000, 34 | "13": 6000, 35 | "14": 2300, 36 | "15": 2300, 37 | "16": 15000, 38 | "17": 20000, 39 | "21": 2300, 40 | "22": 6000, 41 | "23": 6000, 42 | "26": 6000, 43 | "28": 10000, 44 | "29": 10000 45 | }, 46 | "scanify": false, 47 | "quality_threshold": 0.7, 48 | "language_codes": ["en"], 49 | "language_code_threshold": 0.75 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /app/configs/extensions/pretrain/layoutlm/1header_balanced_quality_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality_layoutlm", 3 | "settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 50, 7 | "elem_drops": [24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global", 11 | "10": "text_merged", 12 | "11": "list_merged" 13 | }, 14 | 
"mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "4": "1", 18 | "5": "1", 19 | "6": "1", 20 | "7": "1", 21 | "8": "1", 22 | "9": "1", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2, 30 | "1": 2, 31 | "10": 2, 32 | "11": 2, 33 | "12": 2, 34 | "13": 2, 35 | "14": 2, 36 | "15": 2, 37 | "16": 2, 38 | "17": 2, 39 | "21": 2, 40 | "22": 2, 41 | "23": 2, 42 | "26": 2, 43 | "28": 2, 44 | "29": 2 45 | }, 46 | "scanify": false, 47 | "quality_threshold": 0.7, 48 | "language_codes": ["en"], 49 | "language_code_threshold": 0.75 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /app/download_prepare_urls.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import pathlib 5 | import settings 6 | import argparse 7 | 8 | arg_parser = argparse.ArgumentParser() 9 | arg_parser.add_argument("--cc_dump", "-cc", type=str, default=None, 10 | help="cc dump being processed") 11 | arg_parser.add_argument("--clean_urls_dir", type=str, default=None) 12 | arg_parser.add_argument("--num_nodes", type=int, default=25, 13 | help="number of nodes") 14 | args = arg_parser.parse_args() 15 | 16 | 17 | def main(): 18 | if args.clean_urls_dir is None: 19 | clean_urls_dir = settings.filesystem.CLEAN_URLS_DIR 20 | else: 21 | clean_urls_dir = pathlib.Path(args.clean_urls_dir) 22 | 23 | # make folder 24 | write_folder = clean_urls_dir / args.cc_dump 25 | if not (os.path.exists(write_folder)): 26 | os.mkdir(write_folder) 27 | 28 | # read parquet file 29 | clean_list = pd.read_parquet( 30 | clean_urls_dir / (args.cc_dump + ".parquet") 31 | ) 32 | 33 | # split accross num_nodes 34 | df_split = np.array_split(clean_list, args.num_nodes) 35 | for i in range(1, args.num_nodes + 1): 36 | df_split[i - 1].to_parquet(str(write_folder / (str(i) + ".parquet"))) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /app/orm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/orm/__init__.py -------------------------------------------------------------------------------- /app/orm/dbutils/db_connection.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from sqlalchemy import create_engine, Engine 4 | from sqlalchemy import pool 5 | import settings 6 | import configparser 7 | 8 | 9 | def connect_to_db() -> Engine: 10 | config = configparser.ConfigParser() 11 | config.read(settings.filesystem.ALEMBIC_INI_LOC) 12 | key = config.get('alembic', 'sqlalchemy.url') 13 | engine = create_engine(key) 14 | return engine 15 | 16 | -------------------------------------------------------------------------------- /app/pp_compute_perplexity.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import joblib 4 | import jsonlines 5 | import multiprocessing as mp 6 | import os 7 | from pathlib import Path 8 | from typing import Dict, Union 9 | import warnings 10 | import subprocess 11 | 12 | from src.quality.perplexity import LanguageModel 13 | 14 | WIKI_LM_URL = "http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin" 15 | WIKI_SP_URL = "http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model" 16 
| 17 | 18 | def parse_args() -> argparse.Namespace: 19 | args = argparse.ArgumentParser() 20 | args.add_argument("--lang", "-l", type=str, required=True) 21 | args.add_argument("--data", "-d", type=str, required=True, 22 | help="Path to data directory containing output of the " 23 | "annotation step.") 24 | return args.parse_args() 25 | 26 | 27 | def _compute_ppl( 28 | text_rec: dict, meta_rec: dict, lm: LanguageModel, lang: str 29 | ) -> Union[float, None]: 30 | content = text_rec["text"] 31 | 32 | # identify top lang 33 | langs: Dict[str, float] = meta_rec["languages_fasttext"] 34 | top_lang = max(langs, key=langs.get) 35 | top_lang = top_lang.replace("__label__", "") 36 | 37 | if top_lang == lang: 38 | # compute perplexity 39 | perplexity = lm.compute_perplexity(content=content) 40 | else: 41 | perplexity = meta_rec.get("perplexity", None) 42 | 43 | return perplexity 44 | 45 | 46 | def _compute_doclaynet_score() -> Union[float, None]: 47 | warnings.warn("doclaynet similarity score not implemented yet") 48 | return None 49 | 50 | 51 | def process_shard(shard_id: str, data_dir, args: argparse.Namespace): 52 | print(f"(worker_id={os.getpid()}) start processing shard {shard_id}...") 53 | 54 | # get file paths 55 | text_fp = data_dir / "text" / f"doc_text_{shard_id}.jsonl" 56 | meta_fp = data_dir / "meta" / f"doc_meta_{shard_id}.jsonl" 57 | 58 | # make temporary file to store results 59 | ppl_meta_fp = data_dir / "meta_ppl" / f"temp_doc_meta_{shard_id}.jsonl" 60 | 61 | if not (data_dir / "meta_ppl").exists(): 62 | (data_dir / "meta_ppl").mkdir() 63 | print(f"(worker_id={os.getpid()}) created directory " 64 | f"{str(data_dir / 'meta_ppl')}") 65 | 66 | # load models 67 | sp_fp = Path("resources", "wikipedia-models", f"{args.lang}.sp.model") 68 | lm_fp = Path("resources", "wikipedia-models", f"{args.lang}.arpa.bin") 69 | lm = LanguageModel(sp_model=sp_fp, lm_model=lm_fp) 70 | 71 | num_records = 0 72 | 73 | # load data 74 | with jsonlines.open(ppl_meta_fp, "w") as res_writer: 75 | with jsonlines.open(text_fp) as text_reader, \ 76 | jsonlines.open(meta_fp) as meta_reader: 77 | for text, meta in zip(text_reader, meta_reader): 78 | # compute perplexity 79 | perplexity = _compute_ppl(text, meta, lm, args.lang) 80 | meta["perplexity"] = perplexity 81 | 82 | # add to results 83 | res_writer.write(meta) 84 | 85 | num_records += 1 86 | 87 | print(f"[worker_id={os.getpid()}] done with {shard_id}; " 88 | f"num_recs: {num_records:<6}") 89 | 90 | 91 | def _prepare_models(args: argparse.Namespace): 92 | def _dl_model(url, out_dir: Path): 93 | subprocess.run(["wget", "-c", "-P", out_dir, url]) 94 | 95 | sp_fp = Path("resources", "wikipedia-models", f"{args.lang}.sp.model") 96 | if not sp_fp.is_file(): 97 | print(f"downloading {args.lang} sentencepiece model...") 98 | _dl_model(WIKI_SP_URL.format(lang=args.lang), sp_fp.parent) 99 | 100 | lm_fp = Path("resources", "wikipedia-models", f"{args.lang}.arpa.bin") 101 | if not lm_fp.is_file(): 102 | print(f"downloading {args.lang} Kneser-Ney model...") 103 | _dl_model(WIKI_LM_URL.format(lang=args.lang), lm_fp.parent) 104 | 105 | 106 | def main(): 107 | args = parse_args() 108 | 109 | # check if models exist -- if not, download them.
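# note: _prepare_models (defined above) fetches the missing {lang}.sp.model and
# {lang}.arpa.bin files via wget from the cc_net URLs (WIKI_SP_URL / WIKI_LM_URL)
# into resources/wikipedia-models/, and leaves files that already exist untouched.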
110 | _prepare_models(args) 111 | 112 | data_root = Path(args.data) 113 | 114 | if not data_root.exists(): 115 | raise FileNotFoundError(f"could not find data directory: {data_root}") 116 | 117 | text_dir = data_root / "text" 118 | 119 | shard_ids = list( 120 | s.stem.replace("doc_text_", "") for s in text_dir.glob("*.jsonl") 121 | if s.is_file() and s.stem.startswith("doc_text_") 122 | ) 123 | 124 | num_workers = joblib.cpu_count() // 2 125 | print(f"num_workers: {num_workers}") 126 | 127 | with mp.Pool(processes=num_workers) as pool: 128 | pool.starmap( 129 | process_shard, 130 | itertools.product(shard_ids, [data_root], [args]) 131 | ) 132 | 133 | 134 | if __name__ == '__main__': 135 | main() 136 | -------------------------------------------------------------------------------- /app/requirements.txt: -------------------------------------------------------------------------------- 1 | aiosignal==1.3.1 2 | anyio==3.6.2 3 | attrs==22.2.0 4 | certifi==2022.12.7 5 | cffi==1.15.1 6 | charset-normalizer==3.1.0 7 | click==8.1.3 8 | colorclass==2.2.2 9 | contourpy==1.0.7 10 | cramjam==2.6.2 11 | cryptography==40.0.1 12 | cycler==0.11.0 13 | deprecation==2.1.0 14 | dill==0.3.6 15 | distlib==0.3.6 16 | easygui==0.98.3 17 | fastparquet==2023.4.0 18 | fasttext==0.9.2 19 | filelock==3.11.0 20 | fonttools==4.39.3 21 | frozenlist==1.3.3 22 | fsspec==2023.4.0 23 | greenlet==2.0.2 24 | grpcio==1.53.0 25 | idna==3.4 26 | img2pdf==0.4.4 27 | importlib-resources==5.12.0 28 | iso639==0.1.4 29 | joblib==1.2.0 30 | jsonlines==3.1.0 31 | jsonschema==4.17.3 32 | kiwisolver==1.4.4 33 | lxml==4.9.2 34 | matplotlib==3.7.1 35 | msgpack==1.0.5 36 | msoffcrypto-tool==5.0.1 37 | numpy==1.21.5 38 | olefile==0.46 39 | oletools==0.60.1 40 | opencv-python==4.5.5.64 41 | packaging==23.0 42 | pandas==1.5.1 43 | pcodedmp==1.2.6 44 | pdf2image==1.16.0 45 | pdfminer.six==20221105 46 | pdfplumber==0.8.1 47 | pikepdf==7.1.2 48 | Pillow==9.4.0 49 | pkgutil_resolve_name==1.3.10 50 | platformdirs==3.2.0 51 | protobuf==4.22.1 52 | psutil==5.9.4 53 | pybind11==2.10.4 54 | pycparser==2.21 55 | pyparsing==2.4.7 56 | pyrsistent==0.19.3 57 | python-dateutil==2.8.2 58 | python-docx==0.8.11 59 | pytz==2023.3 60 | PyYAML==6.0 61 | regex==2021.11.10 62 | requests==2.28.2 63 | six==1.16.0 64 | sniffio==1.3.0 65 | SQLAlchemy==2.0.9 66 | tabulate==0.9.0 67 | tqdm==4.64.1 68 | typing_extensions==4.5.0 69 | urllib3==1.26.15 70 | zipp==3.15.0 71 | py4j==0.10.9.5 72 | warcio==1.7.4 73 | pyarrow==12.0.0 74 | gitpython==3.1.32 75 | -------------------------------------------------------------------------------- /app/resources/fasttext-models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/resources/fasttext-models/.gitkeep -------------------------------------------------------------------------------- /app/resources/wikipedia-models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/resources/wikipedia-models/.gitkeep -------------------------------------------------------------------------------- /app/scripts/annotation-kickoff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | WORKERS=25 6 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S") 7 | CRAWL_ID="$1" 8 | DATA_ROOT="$2" 9 | 
OUTPUT_DIR="data/annotated/${CRAWL_ID}/${TIMESTAMP}" 10 | PARTITIONS_DIR="${OUTPUT_DIR}/partitions" 11 | mkdir -p "$PARTITIONS_DIR" 12 | 13 | echo "CRAWL_ID: $CRAWL_ID" 14 | echo "DATA_ROOT: $DATA_ROOT" 15 | echo "PARTITIONS_DIR: $PARTITIONS_DIR" 16 | echo "OUTPUT_DIR: $OUTPUT_DIR" 17 | 18 | TMP_FILE="${PARTITIONS_DIR}/tmp.txt" 19 | 20 | echo $(find "$DATA_ROOT" -type f -name "*.tar.gz") | tr " " "\n" >"$TMP_FILE" 21 | 22 | # split into partitions 23 | N_FILES=$(wc -l <"$TMP_FILE") 24 | N_FILES_PER_PARTITION=$((N_FILES / WORKERS + 1)) 25 | split -d -l $N_FILES_PER_PARTITION "$TMP_FILE" "${PARTITIONS_DIR}/part_" 26 | 27 | # remove tmp file 28 | rm "$TMP_FILE" 29 | 30 | # rename partitions to have .txt extension 31 | for f in "${PARTITIONS_DIR}/part_"*; do 32 | mv "$f" "${f}.txt" 33 | echo "created partition ${f}.txt" 34 | done 35 | 36 | sbatch scripts/annotation-launch.sbatch "$CRAWL_ID" "$OUTPUT_DIR" "$PARTITIONS_DIR" 37 | -------------------------------------------------------------------------------- /app/scripts/annotation-launch.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=93:59:00 4 | #SBATCH --job-name=annotate 5 | #SBATCH --cpus-per-task=24 6 | #SBATCH --mem-per-cpu=4GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --array=1-25 11 | #SBATCH --output="logs/annotation/mp_annotation-docs-%j.out" 12 | #SBATCH --error="logs/annotation/mp_annotation-docs-%j.err" 13 | 14 | set -e 15 | 16 | mkdir -p logs/annotation 17 | 18 | # load modules 19 | # [... placeholder ...] 20 | 21 | # activate virtual environment 22 | source .venv/bin/activate 23 | 24 | # read args 25 | CRAWL_ID="$1" 26 | OUTPUT_DIR="$2" 27 | PARTITIONS_DIR="$3" 28 | 29 | # export env variables 30 | export SLURM_CPUS_PER_TASK 31 | 32 | echo "SLURM_CPUS_PER_TASK: ${SLURM_CPUS_PER_TASK}" 33 | echo "SLURM_MEM_PER_CPU: ${SLURM_MEM_PER_CPU}" 34 | 35 | if [ -z "$PARTITIONS_DIR" ]; then 36 | echo "PARTITIONS_DIR is not set" 37 | exit 1 38 | fi 39 | 40 | if [ -z "$CRAWL_ID" ]; then 41 | echo "CRAWL_ID is not set" 42 | exit 1 43 | fi 44 | 45 | if [ -z "$OUTPUT_DIR" ]; then 46 | echo "OUTPUT_DIR is not set" 47 | exit 1 48 | fi 49 | 50 | INPUT_FILE=$(ls ${PARTITIONS_DIR}/part_*.txt | sed -n "${SLURM_ARRAY_TASK_ID}p") 51 | echo "starting annotation on $(hostname) with inputs from ${INPUT_FILE}; using ${SLURM_CPUS_PER_TASK} cpu cores." 52 | 53 | python -u annotate_run.py \ 54 | --input_files "$INPUT_FILE" \ 55 | --crawl_id "$CRAWL_ID" \ 56 | --output_dir "$OUTPUT_DIR" \ 57 | --soffice_executable ".apps/libreoffice/opt/libreoffice7.4/program/soffice" \ 58 | --max_docs -1 59 | -------------------------------------------------------------------------------- /app/scripts/cc-parse-launch.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=cc_docs 5 | #SBATCH --cpus-per-task=16 6 | #SBATCH --mem-per-cpu=2GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --array=1-180 11 | #SBATCH --output="logs/cc_docs/mp_cc-docs-%j.out" 12 | #SBATCH --error="logs/cc_docs/mp_cc-docs-%j.err" 13 | 14 | set -e 15 | 16 | mkdir -p logs/cc_docs 17 | 18 | # load modules 19 | # [... placeholder ...] 
20 | 21 | # activate virtual environment 22 | source .venv/bin/activate 23 | 24 | export SLURM_CPUS_PER_TASK 25 | 26 | INPUT_DIR="$1"/"${SLURM_ARRAY_TASK_ID}" 27 | CC_DUMP="$2" 28 | 29 | echo "starting url parsing on ${HOSTNAME} with inputs from ${INPUT_DIR} for dump ${CC_DUMP}; using ${SLURM_CPUS_PER_TASK} cpu cores." 30 | 31 | python -u cc_parse_snapshot.py \ 32 | --input "$INPUT_DIR" \ 33 | --cc_dump "$CC_DUMP" 34 | -------------------------------------------------------------------------------- /app/scripts/download-launch.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=download 5 | #SBATCH --cpus-per-task=64 6 | #SBATCH --mem-per-cpu=2GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --array=1-25 11 | #SBATCH --output="logs/download/mp_download-docs-%j.out" 12 | #SBATCH --error="logs/download/mp_download-docs-%j.err" 13 | 14 | set -e 15 | 16 | mkdir -p logs/download 17 | 18 | # load modules 19 | # [... placeholder ...] 20 | 21 | # activate virtual environment 22 | source .venv/bin/activate 23 | 24 | INPUT_FILE="$1/${SLURM_ARRAY_TASK_ID}.parquet" 25 | OUTPUT="$3" 26 | 27 | echo "starting download on ${HOSTNAME} with inputs from ${INPUT_FILE}, outputting to ${OUTPUT}; using ${SLURM_CPUS_PER_TASK} cpu cores" 28 | python -u download_run.py -i "$INPUT_FILE" -ss $2 -wd "$OUTPUT" 29 | -------------------------------------------------------------------------------- /app/scripts/install_libreoffice_centos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # exit when any command fails 4 | set -e 5 | 6 | # echo an error message before exiting 7 | trap 'echo "\"${last_command}\" command exited with code $?."' EXIT 8 | 9 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then 10 | distro=$(cat /etc/os-release | grep "^ID=" | cut -d "=" -f 2 | tr -d '"') 11 | else 12 | echo "This script does not handle OS $OSTYPE ! Check the README.md for installation instructions." 13 | exit 1 14 | fi 15 | 16 | if [[ "$distro" == "centos" ]]; then 17 | echo "installing libreoffice on centos..." 18 | lo_pkg=LibreOffice_7.4.7_Linux_x86-64_rpm.tar.gz 19 | lo_path=/libreoffice/stable/7.4.7/rpm/x86_64/${lo_pkg} 20 | 21 | OPENDOC_ROOT=$(dirname "$(dirname "$(readlink -f "$0")")") 22 | APP_LOCATION=${OPENDOC_ROOT}/.apps/libreoffice 23 | 24 | # create directory for libreoffice app 25 | mkdir -p "${APP_LOCATION}" 26 | 27 | # download and unpack package 28 | wget https://download.documentfoundation.org/${lo_path} -P "${APP_LOCATION}" 29 | tar xvzf ${APP_LOCATION}/LibreOffice_7.4.7_Linux_x86-64_rpm.tar.gz --directory "${APP_LOCATION}" 30 | 31 | # unpack rpm files 32 | for i in ${APP_LOCATION}/LibreOffice_7.4.7.2_Linux_x86-64_rpm/RPMS/*.rpm; do 33 | rpm2cpio $i | ( 34 | cd $APP_LOCATION 35 | cpio -id 36 | ) 37 | done 38 | 39 | # cleanup 40 | echo "cleaning up..." 41 | rm -rv ${APP_LOCATION}/LibreOffice_7.4.7.2_Linux_x86-64_rpm/ 42 | rm -v ${APP_LOCATION}/LibreOffice_7.4.7_Linux_x86-64_rpm.tar.gz 43 | 44 | # install unoserver 45 | echo "pip installing unoserver..." 
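# unoserver must run under LibreOffice's bundled Python so that it can import the
# UNO bindings shipped with the suite; pip is therefore bootstrapped into that
# interpreter via get-pip.py below instead of using the system Python.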
46 | wget https://bootstrap.pypa.io/get-pip.py 47 | ${APP_LOCATION}/opt/libreoffice7.4/program/python get-pip.py 48 | ${APP_LOCATION}/opt/libreoffice7.4/program/python -m pip install unoserver 49 | 50 | # fix shebangs in unoserver and unoconvert (when install with pip the shebangs get messed up) 51 | sed -i '1s/python\.bin/python/' ${APP_LOCATION}/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoserver 52 | sed -i '1s/python\.bin/python/' ${APP_LOCATION}/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoconvert 53 | 54 | # add unoserver and unoconvert to path 55 | echo "export PATH=${APP_LOCATION}/opt/libreoffice7.4/program/python-core-3.8.16/bin:\$PATH" >>~/.bashrc 56 | echo "added unoserver and unoconvert to path. To test it, run 'unoserver -h' and 'unoconvert -h'." 57 | 58 | else 59 | echo "this script does not support distro $distro" 60 | exit 1 61 | fi 62 | -------------------------------------------------------------------------------- /app/scripts/pp-compute-perplexity.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=annotate 5 | #SBATCH --cpus-per-task=32 6 | #SBATCH --mem-per-cpu=4GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --output="logs/postprocess/quality-indicators-%j.out" 11 | #SBATCH --error="logs/postprocess/quality-indicators-%j.err" 12 | 13 | set -e 14 | 15 | # load modules 16 | module load eth_proxy jdk gcc/6.3.0 python/3.8.5 17 | 18 | # activate virtual environment 19 | source .venv/bin/activate 20 | 21 | DATA_ROOT="/cluster/project/zhang/opendoc/data/annotated/cc_main_2022_49/20230531_144800" 22 | LANGS=("ru" "en" "uk" "pl" "es" "fr" "it" "pt" "cs" "hu" "de" "bg" "tr" "nl" "el") 23 | for lang in "${LANGS[@]}"; do 24 | echo "computing perplexity values for $lang" 25 | 26 | python pp_compute_perplexity.py \ 27 | --data "$DATA_ROOT" \ 28 | --lang "$lang" 29 | 30 | # remove language models 31 | rm resources/wikipedia-models/${lang}.arpa.bin 32 | rm resources/wikipedia-models/${lang}.sp.model 33 | done 34 | -------------------------------------------------------------------------------- /app/scripts/run-filter-tars.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=annotate 5 | #SBATCH --cpus-per-task=64 6 | #SBATCH --mem-per-cpu=2GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --output="logs/postprocess/quality-indicators-%j.out" 11 | #SBATCH --error="logs/postprocess/quality-indicators-%j.err" 12 | 13 | set -e 14 | 15 | mkdir -p logs/postprocess 16 | 17 | # load modules 18 | # [... placeholder ...] 19 | 20 | # activate virtual environment 21 | source .venv/bin/activate 22 | 23 | python utilties/run_filter_tars.py --data_root "$1" 24 | -------------------------------------------------------------------------------- /app/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | trap cleanup_on_error ERR SIGINT SIGTERM 5 | 6 | cleanup_on_error() { 7 | echo "Error: $0:$LINENO: command \`$BASH_COMMAND\` failed with exit code $?" 
8 | exit 1 9 | } 10 | 11 | help() { 12 | echo "Usage: run_single_node.sh [ -d | --dump_id ] [-m | --max_docs]" 13 | exit 2 14 | } 15 | 16 | while [[ $# -gt 0 ]]; do 17 | key="$1" 18 | case $key in 19 | -d | --dump_id) 20 | DUMP_ID="$2" 21 | shift 2 22 | ;; 23 | -m | --max_docs) 24 | MAX_DOCS="$2" 25 | shift 2 26 | ;; 27 | -h | --help) 28 | help 29 | ;; 30 | --) 31 | shift 32 | break 33 | ;; 34 | *) 35 | echo "Invalid option: -$1" 36 | help 37 | ;; 38 | esac 39 | done 40 | 41 | # generate random run id 42 | RUN_ID=$(openssl rand -hex 12) 43 | 44 | CLEAN_URLS_DIR="/mnt/data/${RUN_ID}/clean_urls" 45 | SOURCES_DIR="/mnt/data/${RUN_ID}/download/${DUMP_ID}" 46 | OUTPUT_DIR="/mnt/data/${RUN_ID}/annotated/${DUMP_ID}" 47 | 48 | # create directories 49 | mkdir -p "$CLEAN_URLS_DIR" 50 | mkdir -p "$SOURCES_DIR" 51 | mkdir -p "$OUTPUT_DIR" 52 | 53 | printf "Created directories:\n" 54 | printf " * CLEAN_URLS_DIR: %s\n" "$CLEAN_URLS_DIR" 55 | printf " * SOURCES_DIR: %s\n" "$SOURCES_DIR" 56 | printf " * OUTPUT_DIR: %s\n" "$OUTPUT_DIR" 57 | 58 | if [ -z "${MAX_DOCS}" ]; then 59 | MAX_DOCS=-1 60 | fi 61 | 62 | # get file fid 63 | case $DUMP_ID in 64 | "CC-MAIN-2013-48") 65 | FID="1359HSlQighPkMV3iEf_z6pO5rdknZhJ_" 66 | ;; 67 | "CC-MAIN-2016-50") 68 | FID="14_YuQeu6S0u2lKYKOcpEy5AUjmvSeQdE" 69 | ;; 70 | "CC-MAIN-2020-40") 71 | FID="1hKFv4gkUqV_cJcR-02J7rbVm2vJ8HRHH" 72 | ;; 73 | "CC-MAIN-2021-43") 74 | FID="1wuXzQ6RKmV56RldqRImbbbHnnza7GSpF" 75 | ;; 76 | "CC-MAIN-2023-06") 77 | FID="1mKWK79_M_ENGJy781tPUCtsNJtuoxu5d" 78 | ;; 79 | "CC-MAIN-2023-14") 80 | FID="15Od3TdMrkondhfyCNCBSxijXbuyq5rz3" 81 | ;; 82 | *) 83 | echo "Invalid dump id: $DUMP_ID" 84 | exit 1 85 | ;; 86 | esac 87 | 88 | # download urls 89 | printf "\n================================\nFetching URL List...\n" 90 | gdown "https://drive.google.com/uc?id=$FID" -O "$CLEAN_URLS_DIR/$DUMP_ID.parquet" 91 | 92 | mkdir -p /usr/app/data/tmp 93 | 94 | # 1) Prepare urls for download 95 | printf "\n================================\nURL prep...\n" 96 | python3 download_prepare_urls.py \ 97 | --cc_dump "$DUMP_ID" \ 98 | --clean_urls_dir "$CLEAN_URLS_DIR" \ 99 | --num_nodes 1 100 | 101 | # 2) Download documents 102 | printf "\n================================\nDownloading documents...\n" 103 | python3 download_run.py \ 104 | --input "${CLEAN_URLS_DIR}/${DUMP_ID}/1.parquet" \ 105 | --subset_size $MAX_DOCS \ 106 | --write_dir "$SOURCES_DIR" 107 | 108 | # 3) Annotate documents 109 | printf "\n================================\nAnnotating documents...\n" 110 | python3 annotate_run.py \ 111 | --data_dir "$SOURCES_DIR" \ 112 | --crawl_id "$DUMP_ID" \ 113 | --max_docs $MAX_DOCS \ 114 | --output_dir "$OUTPUT_DIR" \ 115 | --soffice_executable "soffice" 116 | 117 | printf "\n---------------------------------\n" 118 | printf "WordScape pipeline complete.\n" 119 | printf "Dataset is in %s\n" "$OUTPUT_DIR" 120 | -------------------------------------------------------------------------------- /app/settings/__init__.py: -------------------------------------------------------------------------------- 1 | from . import annotation 2 | from . import bbox 3 | from . import colors 4 | from . import content_awareness 5 | from . import entities 6 | from . import filesystem 7 | from . 
import download 8 | -------------------------------------------------------------------------------- /app/settings/annotation.py: -------------------------------------------------------------------------------- 1 | # possible sources of colorization decision 2 | 3 | ANNOTATION_BUILTIN = "builtin" 4 | ANNOTATION_XML_PATTERN = "xml_pattern" 5 | ANNOTATION_CONTENT_AWARE_HEURISTIC = "content_aware_heuristic" 6 | ANNOTATION_BODY_HEADING_HEURISTIC_USINGBUILTIN = "body_heading_heuristic_usingbuiltin" 7 | ANNOTATION_BODY_HEADING_HEURISTIC_BASE = "body_heading_heuristic_base" 8 | 9 | DECISION_SOURCES = [ 10 | ANNOTATION_BUILTIN, 11 | ANNOTATION_XML_PATTERN, 12 | ANNOTATION_CONTENT_AWARE_HEURISTIC, 13 | ANNOTATION_BODY_HEADING_HEURISTIC_USINGBUILTIN, 14 | ANNOTATION_BODY_HEADING_HEURISTIC_BASE 15 | ] 16 | # builtins vs heuristics 17 | BUILTIN_SOURCES = [ 18 | ANNOTATION_BUILTIN, 19 | ANNOTATION_XML_PATTERN 20 | ] 21 | HEURISTIC_SOURCES = [ 22 | ANNOTATION_CONTENT_AWARE_HEURISTIC, 23 | ANNOTATION_BODY_HEADING_HEURISTIC_USINGBUILTIN, 24 | ANNOTATION_BODY_HEADING_HEURISTIC_BASE, 25 | ] 26 | -------------------------------------------------------------------------------- /app/settings/bbox.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains all basic settings related to bounding boxes. 3 | """ 4 | import settings.entities as entities 5 | 6 | # tolerance for bbox color detection 7 | BBOX_COLOR_TOL = 1 8 | 9 | # minimum size of bounding boxes (expressed as a fraction of page width and 10 | # page height) 11 | DEFAULT_FRACTION_NORMAL = 1e-2 12 | DEFAULT_FRACTION_SMALL = 1e-3 13 | DEFAULT_FRACTION_TINY = 5e-4 14 | 15 | BBOX_MIN_FRACTIONS = { 16 | entities.ENTITY_TITLE_ID: DEFAULT_FRACTION_NORMAL, 17 | entities.ENTITY_HEADING_1_ID: DEFAULT_FRACTION_NORMAL, 18 | entities.ENTITY_HEADING_2_ID: DEFAULT_FRACTION_NORMAL, 19 | entities.ENTITY_HEADING_3_ID: DEFAULT_FRACTION_NORMAL, 20 | entities.ENTITY_HEADING_4_ID: DEFAULT_FRACTION_NORMAL, 21 | entities.ENTITY_HEADING_5_ID: DEFAULT_FRACTION_NORMAL, 22 | entities.ENTITY_HEADING_6_ID: DEFAULT_FRACTION_NORMAL, 23 | entities.ENTITY_HEADING_7_ID: DEFAULT_FRACTION_NORMAL, 24 | entities.ENTITY_HEADING_8_ID: DEFAULT_FRACTION_NORMAL, 25 | entities.ENTITY_HEADING_9_ID: DEFAULT_FRACTION_NORMAL, 26 | entities.ENTITY_TEXT_ID: DEFAULT_FRACTION_NORMAL, 27 | entities.ENTITY_LIST_ID: DEFAULT_FRACTION_NORMAL, 28 | entities.ENTITY_HEADER_ID: DEFAULT_FRACTION_NORMAL, 29 | entities.ENTITY_FOOTER_ID: DEFAULT_FRACTION_NORMAL, 30 | entities.ENTITY_TABLE_HEADER_ID: DEFAULT_FRACTION_NORMAL, 31 | entities.ENTITY_TABLE_HEADER_CELL_ID: DEFAULT_FRACTION_NORMAL, 32 | entities.ENTITY_TABLE_ID: DEFAULT_FRACTION_NORMAL, 33 | entities.ENTITY_TABLE_CELL_ID: DEFAULT_FRACTION_NORMAL, 34 | entities.ENTITY_TABLE_CAPTION_ID: DEFAULT_FRACTION_NORMAL, 35 | entities.ENTITY_TOC_ID: DEFAULT_FRACTION_NORMAL, 36 | entities.ENTITY_BIBLIOGRAPHY_ID: DEFAULT_FRACTION_NORMAL, 37 | entities.ENTITY_QUOTE_ID: DEFAULT_FRACTION_NORMAL, 38 | entities.ENTITY_EQUATION_ID: DEFAULT_FRACTION_NORMAL, 39 | entities.ENTITY_FIGURE_ID: DEFAULT_FRACTION_NORMAL, 40 | entities.ENTITY_FOOTNOTE_ID: DEFAULT_FRACTION_NORMAL, 41 | entities.ENTITY_ANNOTATION_ID: DEFAULT_FRACTION_NORMAL, 42 | entities.ENTITY_FORM_FIELD_ID: DEFAULT_FRACTION_TINY, 43 | entities.ENTITY_FORM_TAG_ID: DEFAULT_FRACTION_TINY, 44 | } 45 | -------------------------------------------------------------------------------- /app/settings/content_awareness.py: 
-------------------------------------------------------------------------------- 1 | # settings for content-aware heuristics 2 | 3 | # symbols we consider to constitute a possible form field 4 | # note the special triple-period symbol, which word likes to auto-create 5 | FORM_FIELD_SYMBOLS = ['_', '.', '…'] 6 | 7 | # symbols we consider to indicate a quote; must be at start and end. 8 | QUOTE_SYMBOLS = ["\"", "\'"] 9 | 10 | # symbols we consider to constitute a possible numbering 11 | # ! warning: we also include any number followed by a '.', there are infinite 12 | # ! such possibilities. 13 | # here we only list single symbols that we consider to indicate a list entry 14 | # also, the check for builtin numbering indicators is handled separately 15 | NUMBERING_SYMBOLS = [ 16 | '-', '\u2022', '\u27A2', '\u25E6', '\u25AA', '\u25AB', '\u25CF', '\u25CB', 17 | '\u25A0', '\u25A1', '\u25B6', '\u2043', '\u25C6', '\u25C7', '\u25D0', 18 | '\u25D1' 19 | ] 20 | 21 | NUMBERING_FOLLOWERS = ['\.', ':', '\)'] 22 | -------------------------------------------------------------------------------- /app/settings/download.py: -------------------------------------------------------------------------------- 1 | import regex 2 | 3 | # constants 4 | MAX_FILESIZE = 90 * 1024 * 1024 # 90 MB 5 | 6 | # string patterns 7 | DOC_FN_PATTERN = "doc_{url_hash}{ext}" 8 | TAR_PATTERN = "docs_{part_id}-shard_{shard_num:05d}.tar.gz" 9 | META_DATA_FN_PATTERN = "meta_{part_id}.parquet" 10 | LOG_FN_PATTERN = "info_{part_id}.log" 11 | LOG_FORMAT = "[%(asctime)s]::%(name)s::%(levelname)s::%(message)s" 12 | 13 | VALID_CT_REGEX = pattern = regex.compile( 14 | r'(application|text)/.*(openxml|word|doc|msword|msdownload|rtf).*', 15 | flags=regex.IGNORECASE | regex.DOTALL 16 | ) 17 | 18 | # header fields 19 | HEADER_FIELDS = [ 20 | "content-type", 21 | "content-length", 22 | "content-encoding", 23 | "content-language", 24 | "last-modified" 25 | ] 26 | 27 | # mapping from olet library names to DB olet fields 28 | OLET_DB_MAPPING = { 29 | 'File format': 'olet_ftype', 30 | 'Container format': 'olet_container', 31 | 'Properties code page': 'olet_codepage', 32 | 'Python codec': 'olet_python_codec', 33 | 'Application name': 'olet_appname', 34 | 'Author': 'olet_author', 35 | 'Encrypted': 'olet_encrypted', 36 | 'VBA Macros': 'olet_vba', 37 | 'XLM Macros': 'olet_xlm', 38 | 'External Relationships': 'olet_ext_rels', 39 | 'ObjectPool': 'olet_ObjectPool', 40 | 'Flash objects': 'olet_flash' 41 | } 42 | -------------------------------------------------------------------------------- /app/settings/entities.py: -------------------------------------------------------------------------------- 1 | TOTAL_BASE_LABELS = 31 2 | LABEL_NUMS = [i for i in range(0, TOTAL_BASE_LABELS)] 3 | 4 | ENTITY_TITLE_NAME = "title" 5 | ENTITY_TITLE_ID = 0 6 | 7 | ENTITY_HEADING_1_NAME = "heading_1" 8 | ENTITY_HEADING_1_ID = 1 9 | 10 | ENTITY_HEADING_2_NAME = "heading_2" 11 | ENTITY_HEADING_2_ID = 2 12 | 13 | ENTITY_HEADING_3_NAME = "heading_3" 14 | ENTITY_HEADING_3_ID = 3 15 | 16 | ENTITY_HEADING_4_NAME = "heading_4" 17 | ENTITY_HEADING_4_ID = 4 18 | 19 | ENTITY_HEADING_5_NAME = "heading_5" 20 | ENTITY_HEADING_5_ID = 5 21 | 22 | ENTITY_HEADING_6_NAME = "heading_6" 23 | ENTITY_HEADING_6_ID = 6 24 | 25 | ENTITY_HEADING_7_NAME = "heading_7" 26 | ENTITY_HEADING_7_ID = 7 27 | 28 | ENTITY_HEADING_8_NAME = "heading_8" 29 | ENTITY_HEADING_8_ID = 8 30 | 31 | ENTITY_HEADING_9_NAME = "heading_9" 32 | ENTITY_HEADING_9_ID = 9 33 | 34 | ENTITY_TEXT_NAME = "text" 35 | ENTITY_TEXT_ID = 10 36 
| 37 | ENTITY_LIST_NAME = "list" 38 | ENTITY_LIST_ID = 11 39 | 40 | ENTITY_HEADER_NAME = "header" 41 | ENTITY_HEADER_ID = 12 42 | 43 | ENTITY_FOOTER_NAME = "footer" 44 | ENTITY_FOOTER_ID = 13 45 | 46 | ENTITY_TABLE_HEADER_NAME = "table_header" 47 | ENTITY_TABLE_HEADER_ID = 14 48 | 49 | ENTITY_TABLE_HEADER_CELL_NAME = "table_header_cell" 50 | ENTITY_TABLE_HEADER_CELL_ID = 15 51 | 52 | ENTITY_TABLE_NAME = "table" 53 | ENTITY_TABLE_ID = 16 54 | 55 | ENTITY_TABLE_CELL_NAME = "table_cell" 56 | ENTITY_TABLE_CELL_ID = 17 57 | 58 | ENTITY_TOC_NAME = "toc" 59 | ENTITY_TOC_ID = 18 60 | 61 | ENTITY_BIBLIOGRAPHY_NAME = "bibliography" 62 | ENTITY_BIBLIOGRAPHY_ID = 19 63 | 64 | ENTITY_QUOTE_NAME = "quote" 65 | ENTITY_QUOTE_ID = 20 66 | 67 | ENTITY_EQUATION_NAME = "equation" 68 | ENTITY_EQUATION_ID = 21 69 | 70 | ENTITY_FIGURE_NAME = "figure" 71 | ENTITY_FIGURE_ID = 22 72 | 73 | ENTITY_TABLE_CAPTION_NAME = "table_caption" 74 | ENTITY_TABLE_CAPTION_ID = 23 75 | 76 | ENTITY_FOOTNOTE_NAME = "footnote" 77 | ENTITY_FOOTNOTE_ID = 24 78 | 79 | ENTITY_ANNOTATION_NAME = "annotation" 80 | ENTITY_ANNOTATION_ID = 25 81 | 82 | ENTITY_FORM_FIELD_NAME = "form_field" 83 | ENTITY_FORM_FIELD_ID = 26 84 | 85 | ENTITY_FORM_TAG_NAME = "form_tag" 86 | ENTITY_FORM_TAG_ID = 27 87 | 88 | ENTITY_TABLE_ROW_NAME = "table_row" 89 | ENTITY_TABLE_ROW_ID = 28 90 | 91 | ENTITY_TABLE_COLUMN_NAME = "table_column" 92 | ENTITY_TABLE_COLUMN_ID = 29 93 | 94 | ENTITY_TABLE_HEADER_ROW_NAME = "table_header_row" 95 | ENTITY_TABLE_HEADER_ROW_ID = 30 96 | 97 | # put all entity names in a list 98 | ALL_ENTITY_NAMES = [ 99 | eval(entity_name_var) 100 | for entity_name_var in dir() 101 | if ( 102 | entity_name_var.startswith("ENTITY_") 103 | and entity_name_var.endswith("_NAME") 104 | and isinstance(eval(entity_name_var), str) 105 | ) 106 | ] 107 | 108 | # put all entity ids in a list 109 | ALL_ENTITY_IDS = [ 110 | eval(entity_id_var) 111 | for entity_id_var in dir() 112 | if ( 113 | entity_id_var.startswith("ENTITY_") 114 | and entity_id_var.endswith("_ID") 115 | and isinstance(eval(entity_id_var), int) 116 | ) 117 | ] 118 | 119 | ENTITY_ID_TO_NAME = { 120 | ENTITY_TITLE_ID: ENTITY_TITLE_NAME, 121 | ENTITY_HEADING_1_ID: ENTITY_HEADING_1_NAME, 122 | ENTITY_HEADING_2_ID: ENTITY_HEADING_2_NAME, 123 | ENTITY_HEADING_3_ID: ENTITY_HEADING_3_NAME, 124 | ENTITY_HEADING_4_ID: ENTITY_HEADING_4_NAME, 125 | ENTITY_HEADING_5_ID: ENTITY_HEADING_5_NAME, 126 | ENTITY_HEADING_6_ID: ENTITY_HEADING_6_NAME, 127 | ENTITY_HEADING_7_ID: ENTITY_HEADING_7_NAME, 128 | ENTITY_HEADING_8_ID: ENTITY_HEADING_8_NAME, 129 | ENTITY_HEADING_9_ID: ENTITY_HEADING_9_NAME, 130 | ENTITY_TEXT_ID: ENTITY_TEXT_NAME, 131 | ENTITY_LIST_ID: ENTITY_LIST_NAME, 132 | ENTITY_HEADER_ID: ENTITY_HEADER_NAME, 133 | ENTITY_FOOTER_ID: ENTITY_FOOTER_NAME, 134 | ENTITY_TABLE_HEADER_ID: ENTITY_TABLE_HEADER_NAME, 135 | ENTITY_TABLE_HEADER_CELL_ID: ENTITY_TABLE_HEADER_CELL_NAME, 136 | ENTITY_TABLE_ID: ENTITY_TABLE_NAME, 137 | ENTITY_TABLE_CELL_ID: ENTITY_TABLE_CELL_NAME, 138 | ENTITY_TOC_ID: ENTITY_TOC_NAME, 139 | ENTITY_BIBLIOGRAPHY_ID: ENTITY_BIBLIOGRAPHY_NAME, 140 | ENTITY_QUOTE_ID: ENTITY_QUOTE_NAME, 141 | ENTITY_EQUATION_ID: ENTITY_EQUATION_NAME, 142 | ENTITY_FIGURE_ID: ENTITY_FIGURE_NAME, 143 | ENTITY_TABLE_CAPTION_ID: ENTITY_TABLE_CAPTION_NAME, 144 | ENTITY_FOOTNOTE_ID: ENTITY_FOOTNOTE_NAME, 145 | ENTITY_ANNOTATION_ID: ENTITY_ANNOTATION_NAME, 146 | ENTITY_FORM_FIELD_ID: ENTITY_FORM_FIELD_NAME, 147 | ENTITY_FORM_TAG_ID: ENTITY_FORM_TAG_NAME, 148 | ENTITY_TABLE_ROW_ID: ENTITY_TABLE_ROW_NAME, 149 | 
ENTITY_TABLE_COLUMN_ID: ENTITY_TABLE_COLUMN_NAME, 150 | ENTITY_TABLE_HEADER_ROW_ID: ENTITY_TABLE_HEADER_ROW_NAME, 151 | } 152 | 153 | ENTITY_NAME_TO_ID = {} 154 | for k, v in ENTITY_ID_TO_NAME.items(): 155 | ENTITY_NAME_TO_ID[v] = k 156 | -------------------------------------------------------------------------------- /app/settings/entity_names.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": 0, 3 | "heading_1": 1, 4 | "heading_2": 2, 5 | "heading_3": 3, 6 | "heading_4": 4, 7 | "heading_5": 5, 8 | "heading_6": 6, 9 | "heading_7": 7, 10 | "heading_8": 8, 11 | "heading_9": 9, 12 | "text": 10, 13 | "list": 11, 14 | "header": 12, 15 | "footer": 13, 16 | "table_header": 14, 17 | "table_header_cell": 15, 18 | "table": 16, 19 | "table_cell": 17, 20 | "toc": 18, 21 | "bibliography": 19, 22 | "quote": 20, 23 | "equation": 21, 24 | "figure": 22, 25 | "table_caption": 23, 26 | "footnote": 24, 27 | "annotation": 25, 28 | "form_field": 26, 29 | "form_tag": 27, 30 | "table_row": 28, 31 | "table_column": 29, 32 | "table_header_row": 30 33 | } -------------------------------------------------------------------------------- /app/settings/filesystem.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains basic directory structures for the project. 3 | """ 4 | from pathlib import Path 5 | 6 | ROOT = Path(__file__).parent.parent 7 | 8 | # resources 9 | RESOURCES_DIR = ROOT / "resources" 10 | FASTTEXT_CLASSIFIERS_DIR = Path(RESOURCES_DIR, "fasttext-models") 11 | 12 | # data dirs 13 | DATA_ROOT = ROOT / "data" 14 | DOC_SOURCES_DIR = DATA_ROOT / "doc_sources" 15 | CC_SEGMENT_DIR = DATA_ROOT / "crawl-data" 16 | CC_DIR = DATA_ROOT / "cc_urls" 17 | CLEAN_URLS_DIR = DATA_ROOT / "clean_urls" 18 | DOWNLOAD_DIR = DATA_ROOT / "download" 19 | 20 | # tmp dirs 21 | TMP_DIR = DATA_ROOT / "tmp" 22 | 23 | # fixed-location files 24 | ALEMBIC_INI_LOC = ROOT / "alembic.ini" 25 | 26 | # for pipeline extensions 27 | RAW_DIR = DATA_ROOT / "raw" 28 | EXPERIMENT_DIR = DATA_ROOT / "experiments" 29 | 30 | # structure of wordscape annotated files 31 | WS_MULTIMODAL = "multimodal" 32 | WS_META = "meta" 33 | -------------------------------------------------------------------------------- /app/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/annotation_quality.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | import settings.entities as entity_settings 4 | import settings.annotation as annotation_settings 5 | from src.annotation.colorization import ColorizationDecision 6 | 7 | __all__ = [ 8 | "calc_annotation_quality_score" 9 | ] 10 | 11 | IGNORE_ENTITY_IDS = [ 12 | entity_settings.ENTITY_TABLE_ROW_ID, 13 | entity_settings.ENTITY_TABLE_CELL_ID, 14 | entity_settings.ENTITY_TABLE_COLUMN_ID 15 | ] 16 | 17 | 18 | def calc_annotation_quality_score( 19 | colorization_decisions: 
List[ColorizationDecision], 20 | entity_counts: Dict[int, int], 21 | ) -> Tuple[float, Dict[int, float]]: 22 | r""" Calculate the annotation quality score for a document 23 | 24 | @param colorization_decisions: the colorization decisions for a document; 25 | this is a list of ColorizationDecision objects with the attributes: 26 | - text (str): the text of the element 27 | - decision_source (str): the source of the decision 28 | - entity_decision (int): the id of the entity category 29 | @param entity_counts: dictionary with the number of entities for each 30 | entity category 31 | 32 | @return: the annotation quality score for the document, and the proportion 33 | of builtin characters for each entity 34 | """ 35 | # count the number of characters for each entity 36 | char_counter = { 37 | k: {'builtin': 0, 'heuristic': 0} 38 | for k in entity_settings.ALL_ENTITY_IDS 39 | } 40 | 41 | for col_decision in colorization_decisions: 42 | category_id = col_decision.entity_decision 43 | 44 | if col_decision.text is None: 45 | # we assign text length 1 to entity categories that do not have 46 | # text (this only concerns tables and figures which are always 47 | # builtins) 48 | text_len = 1.0 49 | else: 50 | text_len = len(col_decision.text) 51 | 52 | if col_decision.decision_source in annotation_settings.BUILTIN_SOURCES: 53 | char_counter[category_id]['builtin'] += text_len 54 | else: 55 | char_counter[category_id]['heuristic'] += text_len 56 | 57 | # compute proportion of builtin characters for each entity 58 | builtin_props = dict.fromkeys(entity_settings.ALL_ENTITY_IDS, 0.0) 59 | 60 | for cat_id, char_counts in char_counter.items(): 61 | total_chars = char_counts['builtin'] + char_counts['heuristic'] 62 | 63 | if total_chars == 0: 64 | prop = 0.0 65 | else: 66 | prop = char_counts['builtin'] / total_chars 67 | 68 | builtin_props[cat_id] = prop 69 | 70 | # compute final score 71 | num_entities = sum(entity_counts.values()) 72 | 73 | if num_entities == 0: 74 | return 0.0, builtin_props 75 | 76 | quality_score = 0.0 77 | for entity_id, count in entity_counts.items(): 78 | if entity_id not in IGNORE_ENTITY_IDS: 79 | quality_score += count * builtin_props[entity_id] 80 | 81 | quality_score /= num_entities 82 | 83 | return quality_score, builtin_props 84 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/__init__.py: -------------------------------------------------------------------------------- 1 | # handlers 2 | from .colorization_handler import ColorizationHandler, ColorizationDecision 3 | from .heuristics.build_heuristics import ParagraphHeuristic 4 | 5 | # entities modules 6 | from .entities import colorize_builtin_form_elements 7 | from .entities import colorize_builtin_toc_elements 8 | from .entities import colorize_figures 9 | from .entities import colorize_header_and_footer 10 | from .entities import colorize_paragraph 11 | from .entities import colorize_table 12 | from .entities import colorize_text_boxes 13 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/colorize_doc.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Union 3 | from docx.document import Document as DocxDocument 4 | 5 | from src.annotation.colorization import ( 6 | ColorizationHandler, 7 | ParagraphHeuristic, 8 | colorize_builtin_form_elements, 9 | colorize_builtin_toc_elements, 10 | colorize_figures, 11 | 
colorize_header_and_footer, 12 | colorize_paragraph, 13 | colorize_text_boxes, 14 | colorize_table 15 | ) 16 | from src.annotation.config import AnnotationConfig 17 | from src.annotation.utils.color_utils import sanitize_figure_settings 18 | 19 | import settings.colors as color_settings 20 | 21 | 22 | def colorize_word_doc( 23 | word_doc: DocxDocument, 24 | colorization_handler: ColorizationHandler, 25 | config: AnnotationConfig, 26 | temp_dir: Union[pathlib.Path, None] = None, 27 | ) -> DocxDocument: 28 | r""" Colorize a word document, and return the colorized document 29 | 30 | @param word_doc: the word document to colorize 31 | @param colorization_handler: the colorization handler to use for 32 | colorization 33 | @param config: the annotation config to use for colorization 34 | @param temp_dir: directory to use for storing temporary files 35 | 36 | @return: the colorized word document 37 | """ 38 | # some elements do not have builtin styles, or styles we do not recognize. 39 | # For these cases, we build heuristics as a fallback option 40 | paragraph_heuristics = ParagraphHeuristic(word_doc, config) 41 | 42 | # sanitization step: change figure settings so that no preset styles are 43 | # applied which could change the color of figures 44 | sanitize_figure_settings(document=word_doc) 45 | 46 | # 1) colorize headers and footers 47 | colorize_header_and_footer( 48 | word_doc, colorization_handler=colorization_handler 49 | ) 50 | 51 | # 2) colorize text boxes 52 | colorize_text_boxes( 53 | word_doc, hsv_color=color_settings.COLOR_TEXT, 54 | colorization_handler=colorization_handler 55 | ) 56 | 57 | # 3) colorize tables 58 | for table in word_doc.tables: 59 | colorize_table(table, colorization_handler=colorization_handler) 60 | 61 | # 4) colorize paragraph elements 62 | for paragraph in word_doc.paragraphs: 63 | colorize_paragraph( 64 | paragraph, 65 | colorization_handler=colorization_handler, 66 | paragraph_heuristics=paragraph_heuristics 67 | ) 68 | 69 | # 5) colorize table of contents elements 70 | # ! 
this has to be done before forms, due to XML overlaps 71 | colorize_builtin_toc_elements( 72 | word_doc, colorization_handler=colorization_handler 73 | ) 74 | 75 | # 6) colorize built-in form elements 76 | # !this has to be done after regular colorization, because form fields may 77 | # !overlap with other entity types, therefore being overcolored if this is 78 | # !done first 79 | colorize_builtin_form_elements( 80 | word_doc, colorization_handler=colorization_handler 81 | ) 82 | 83 | # 6) colorize figures 84 | word_doc = colorize_figures( 85 | word_doc, temp_dir=temp_dir, colorization_handler=colorization_handler 86 | ) 87 | 88 | return word_doc 89 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/__init__.py: -------------------------------------------------------------------------------- 1 | from .figure import colorize_figures 2 | from .form import colorize_builtin_form_elements 3 | from .tables import colorize_table 4 | from .paragraph import colorize_paragraph 5 | from .text_box import colorize_text_boxes 6 | from .toc import colorize_builtin_toc_elements 7 | from .header_footer import colorize_header_and_footer 8 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/figure.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as _Document 2 | from docx import Document 3 | import io 4 | import numpy as np 5 | import os 6 | import pathlib 7 | from PIL import Image 8 | import tempfile 9 | import zipfile 10 | 11 | from src.annotation.colorization import ColorizationHandler 12 | from src.annotation.utils.color_utils import hsv_to_rgb 13 | from src.annotation.utils.updateable_zipfile import UpdateableZipFile 14 | 15 | import settings 16 | 17 | IMG_EXT = ( 18 | '.bmp', 19 | '.gif', 20 | '.jpeg', 21 | '.jpg', 22 | '.png', 23 | '.tiff', 24 | '.ico', 25 | '.pcx', 26 | '.ppm', 27 | '.pgm', 28 | '.pbm', 29 | '.pnm', 30 | '.webp', 31 | '.hdr', 32 | '.dds', 33 | '.im', 34 | '.eps', 35 | '.svg' 36 | ) 37 | 38 | 39 | def colorize_figures( 40 | word_doc: _Document, 41 | temp_dir: pathlib.Path, 42 | colorization_handler: ColorizationHandler, 43 | ) -> _Document: 44 | r""" Colorizes figures in word document. It does so by first creating a 45 | temporary word document, then extracting all images from the document, 46 | colorizing them, and finally overwriting the images in the temporary 47 | document. The temporary document is then loaded into memory, destroyed 48 | on disk and returned. 49 | 50 | @param word_doc: word document to colorize 51 | @param temp_dir: directory to use for storing temporary files 52 | @param colorization_handler: colorization handler; here, this is only used 53 | to keep track of annotation sources. 
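# --- Editor's usage sketch (not part of the original file): one plausible way to
# drive colorize_word_doc() defined above. The config path, the input document and
# the no-argument ColorizationHandler() constructor are assumptions for illustration.
import pathlib
from docx import Document
from src.annotation.colorization import ColorizationHandler
from src.annotation.colorization.colorize_doc import colorize_word_doc
from src.annotation.config import load_config

config = load_config(pathlib.Path("configs/default_config.yaml"))
handler = ColorizationHandler()          # assumed default constructor
word_doc = Document("example.docx")      # hypothetical input .docx
colorized = colorize_word_doc(
    word_doc=word_doc,
    colorization_handler=handler,
    config=config,
    temp_dir=pathlib.Path("/tmp"),
)
colorized.save("example_colorized.docx")
# The colorization decisions accumulated in `handler` can afterwards feed the
# annotation quality score computed in annotation_quality.py.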
54 | 55 | 56 | @return: colorized word document instance 57 | """ 58 | 59 | # create temporary file 60 | temp_doc_fp = tempfile.NamedTemporaryFile( 61 | mode="w+b", suffix=".docx", dir=temp_dir 62 | ) 63 | 64 | # save doc to temp file 65 | word_doc.save(path_or_stream=temp_doc_fp) 66 | 67 | # we raise an error if something has gone wrong and the document is not a 68 | # valid zip file 69 | if not zipfile.is_zipfile(temp_doc_fp): 70 | raise ValueError(f"document is not a valid zip file") 71 | 72 | # convert hsv to rgb 73 | rgb_color = tuple(hsv_to_rgb(hsv_color=settings.colors.COLOR_FIGURES)) 74 | 75 | # extract image files, overwrite them with images color 76 | with UpdateableZipFile(temp_doc_fp, "a") as archive: 77 | for fp in archive.namelist(): 78 | if ( 79 | not fp.startswith("word/media") or 80 | not fp.lower().endswith(IMG_EXT) 81 | ): 82 | continue 83 | 84 | # extract image to temp dir 85 | img_bytes = archive.read(fp) 86 | 87 | # read and overwrite image 88 | try: 89 | img = Image.open(io.BytesIO(img_bytes)) 90 | except Exception as e: 91 | print(f"[WARNING] reading image {fp} " 92 | f"failed with {e.__class__.__name__}: {e}") 93 | continue 94 | img = Image.new("RGB", img.size) 95 | img.putdata([rgb_color] * np.prod(img.size)) 96 | 97 | _, ext = os.path.splitext(fp) 98 | ext = ext.lower().strip(".") 99 | ext = 'jpeg' if ext == 'jpg' else ext 100 | with io.BytesIO() as temp_img: 101 | try: 102 | img.save(temp_img, format=ext) 103 | except IOError: 104 | continue 105 | except Exception as e: 106 | # could not write file, skip 107 | print(f"unknown exception while writing image {fp};\n{e}") 108 | continue 109 | 110 | temp_img.seek(0) 111 | archive.write(temp_img, fp) 112 | 113 | # add annotation source 114 | colorization_handler.update_colorization_decisions( 115 | text=None, 116 | decision_source=settings.annotation.ANNOTATION_BUILTIN, 117 | entity_decision=settings.entities.ENTITY_FIGURE_ID 118 | ) 119 | 120 | word_doc = Document(temp_doc_fp.name) 121 | temp_doc_fp.close() 122 | 123 | return word_doc 124 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/header_footer.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as _Document 2 | 3 | from src.annotation.colorization import ColorizationHandler 4 | from src.annotation.colorization.entities.tables import colorize_table 5 | 6 | import settings 7 | 8 | 9 | def colorize_header_and_footer( 10 | document: _Document, colorization_handler: ColorizationHandler 11 | ): 12 | r""" Colorize header and footer of the document. 13 | 14 | @param document: the document to colorize 15 | @param colorization_handler: the colorization handler to use for 16 | colorization of the header and footer 17 | """ 18 | # colorize document header 19 | header_name = settings.colors.get_entity_name(settings.colors.COLOR_HEADER) 20 | _colorize( 21 | document, entity_name=header_name, 22 | colorization_handler=colorization_handler 23 | ) 24 | 25 | # colorize document footer 26 | footer_name = settings.colors.get_entity_name(settings.colors.COLOR_FOOTER) 27 | _colorize( 28 | document, entity_name=footer_name, 29 | colorization_handler=colorization_handler 30 | ) 31 | 32 | 33 | def _colorize( 34 | document: _Document, 35 | entity_name: str, 36 | colorization_handler: ColorizationHandler 37 | ): 38 | r""" Colorize header or footer of the document. 
39 | 40 | @param document: the document to colorize 41 | @param entity_name: either "header" or "footer" 42 | @param colorization_handler: the colorization handler to use for 43 | colorization of the header and footer 44 | """ 45 | assert entity_name in [ 46 | settings.entities.ENTITY_HEADER_NAME, 47 | settings.entities.ENTITY_FOOTER_NAME 48 | ] 49 | 50 | color = settings.colors.ENTITY_NAME_TO_COLOR[entity_name] 51 | 52 | for section in document.sections: 53 | header_or_footer_obj = getattr(section, entity_name) 54 | 55 | # skip if obj is linked to previous section 56 | if header_or_footer_obj.is_linked_to_previous: 57 | continue 58 | 59 | # colorize paragraphs 60 | for par in header_or_footer_obj.paragraphs: 61 | if len(par.text) == 0: 62 | continue 63 | 64 | colorization_handler.assign_par_color( 65 | par=par, 66 | base_color=color, 67 | decision_source=settings.annotation.ANNOTATION_BUILTIN 68 | ) 69 | 70 | # colorize tables as footer / header 71 | for table in header_or_footer_obj.tables: 72 | colorize_table( 73 | table=table, 74 | base_color_table=color, 75 | base_color_table_header=color, 76 | colorization_handler=colorization_handler, 77 | sat_val_step=0 78 | ) 79 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/paragraph.py: -------------------------------------------------------------------------------- 1 | from docx.oxml.xmlchemy import serialize_for_reading 2 | from docx.text.paragraph import Paragraph 3 | 4 | import settings 5 | from src.annotation.builtin_styles import BUILTIN_STYLES 6 | from src.annotation.colorization import ColorizationHandler 7 | from src.annotation.colorization import ParagraphHeuristic 8 | from src.annotation.colorization.mappings import MAP_BUILTIN_TO_ENTITY_COLOR 9 | from src.annotation.utils.color_utils import check_if_par_is_numbered 10 | 11 | 12 | def colorize_paragraph( 13 | paragraph: Paragraph, 14 | colorization_handler: ColorizationHandler, 15 | paragraph_heuristics: ParagraphHeuristic 16 | ): 17 | r""" Colorize a paragraph. This function relies primarily on builtin styles 18 | to identify which category a paragraph belongs to. If no builtin style is 19 | found, we fall back to heuristics. 20 | 21 | @param paragraph: the paragraph to colorize 22 | @param colorization_handler: the colorization handler 23 | @param paragraph_heuristics: the paragraph heuristics 24 | """ 25 | # skip paragraph if it has no style associated 26 | if paragraph.style is None: 27 | return 28 | 29 | # skip paragraph if it is empty 30 | par_style = paragraph.style.name.lower() 31 | par_text = "".join(s for s in paragraph.text if s not in ["\n", "\t"]) 32 | if len(par_text) == 0 and "toc" not in par_style: 33 | return 34 | 35 | # if no built-in style, we can try to fall back to heuristics 36 | if par_style not in BUILTIN_STYLES: 37 | colorization_handler.assign_par_color_considering_runs( 38 | paragraph, paragraph_heuristics, 39 | original_was_builtin=False, 40 | original_builtin_entity_id=settings.entities.ENTITY_TEXT_ID 41 | ) 42 | return 43 | 44 | # check the builtin --> entity mapping 45 | entity_color_found_for_builtin = None 46 | for possible_start in MAP_BUILTIN_TO_ENTITY_COLOR: 47 | if par_style.startswith(possible_start): 48 | entity_color_found_for_builtin = \ 49 | MAP_BUILTIN_TO_ENTITY_COLOR[possible_start] 50 | 51 | # ! some entity types we want to deal with specially 52 | # ! 
this may include run-checking or detecting other entity signals 53 | if entity_color_found_for_builtin == settings.colors.COLOR_TEXT: 54 | attributes = set( 55 | paragraph._p.xml._attr_seq(serialize_for_reading(paragraph._p)) 56 | ) 57 | 58 | if "" in attributes or "" in attributes: 59 | colorization_handler.assign_par_color( 60 | par=paragraph, 61 | base_color=settings.colors.COLOR_EQUATION, 62 | decision_source=settings.annotation.ANNOTATION_XML_PATTERN 63 | ) 64 | elif check_if_par_is_numbered(paragraph): 65 | colorization_handler.assign_par_color( 66 | par=paragraph, 67 | base_color=settings.colors.COLOR_LIST, 68 | decision_source=settings.annotation.ANNOTATION_XML_PATTERN 69 | ) 70 | else: 71 | colorization_handler.assign_par_color_considering_runs( 72 | par=paragraph, 73 | para_heuristics=paragraph_heuristics, 74 | original_was_builtin=True, 75 | original_builtin_entity_id=settings.entities.ENTITY_TEXT_ID 76 | ) 77 | 78 | elif entity_color_found_for_builtin is not None: 79 | colorization_handler.assign_par_color( 80 | par=paragraph, 81 | base_color=entity_color_found_for_builtin, 82 | decision_source=settings.annotation.ANNOTATION_BUILTIN 83 | ) 84 | 85 | else: 86 | print(f"unrecognized style {par_style}") 87 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/tables/__init__.py: -------------------------------------------------------------------------------- 1 | from . import styles 2 | from .table_colorization_handler import TableColorizationHandler 3 | from .colorize_table import colorize_table 4 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/tables/colorize_table.py: -------------------------------------------------------------------------------- 1 | from docx.table import Table 2 | 3 | import settings 4 | from src.annotation.colorization import ColorizationHandler 5 | from src.annotation.colorization.entities.tables import \ 6 | TableColorizationHandler 7 | 8 | 9 | def colorize_table( 10 | table: Table, 11 | colorization_handler: ColorizationHandler = None, 12 | base_color_table=settings.colors.COLOR_TABLE, 13 | base_color_table_header=settings.colors.COLOR_TABLE_HEADER, 14 | sat_val_step=settings.colors.SAT_VAL_STEP, 15 | ): 16 | ct_tbl_ref_style = getattr(table.style, "_element", None) 17 | 18 | # record table in the colorization decisions 19 | colorization_handler.update_colorization_decisions( 20 | text=None, 21 | decision_source=settings.annotation.ANNOTATION_BUILTIN, 22 | entity_decision=settings.entities.ENTITY_TABLE_ID 23 | ) 24 | 25 | # initialize table colorization handler 26 | tbl_col_handler = TableColorizationHandler( 27 | ct_tbl=table._tbl, ct_tbl_ref_style=ct_tbl_ref_style, 28 | colorization_handler=colorization_handler, 29 | base_color_table=base_color_table, 30 | base_color_header=base_color_table_header, 31 | sat_val_step=sat_val_step 32 | ) 33 | 34 | # colorize table 35 | tbl_col_handler.colorize_table() 36 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/text_box.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as _Document 2 | from docx.text.paragraph import Paragraph 3 | from typing import Tuple 4 | 5 | from src.annotation.colorization import ColorizationHandler 6 | 7 | import settings 8 | 9 | 10 | def colorize_text_boxes( 11 | document: _Document, 12 | 
hsv_color: Tuple[int, int, int], 13 | colorization_handler: ColorizationHandler 14 | ): 15 | r""" 16 | Colorize all text boxes in the document. 17 | Currently, the basic assumption is that any text box near to a table 18 | or figure should be viewed as a caption; the only default behavior 19 | of word which creates text-boxes is when inserting captions. 20 | 21 | @param document: the document to colorize 22 | @param hsv_color: the color to use for text boxes in hsv color space 23 | @param colorization_handler: global tracker for colorization information 24 | """ 25 | text_box_elements = document.element.body.xpath(".//w:txbxContent//w:p") 26 | for par_xml in text_box_elements: 27 | colorization_handler.assign_par_color( 28 | par=Paragraph(par_xml, document), 29 | base_color=hsv_color, 30 | decision_source=settings.annotation.ANNOTATION_XML_PATTERN 31 | ) 32 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/heuristics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/colorization/heuristics/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/colorization/mappings.py: -------------------------------------------------------------------------------- 1 | r""" 2 | In order to nicely support headers, the heuristic mapping uses ints as values: 3 | 1 through 9 for known headers 4 | -1 for unknown 5 | special values for various builtins / other special properties 6 | 7 | Here is also provided a mapping from builtins to constants that 8 | work with the run-level mapping strategy. 9 | """ 10 | 11 | import settings 12 | 13 | HEURISTIC_LEVEL_BODY = -10 14 | HEURISTIC_LEVEL_TITLE = -20 15 | HEURISTIC_LEVEL_LIST = -30 16 | 17 | HEURISTIC_FONT_UNKNOWN = -1.0 18 | 19 | CONSIDER_RUN_COLORING_FOR = [settings.colors.COLOR_TEXT] 20 | 21 | # if style starts with one of the following names, it 22 | # should map to that color 23 | MAP_BUILTIN_TO_ENTITY_COLOR = { 24 | # BODY 25 | "body": settings.colors.COLOR_TEXT, 26 | "normal": settings.colors.COLOR_TEXT, 27 | "plain text": settings.colors.COLOR_TEXT, 28 | "no spacing": settings.colors.COLOR_TEXT, 29 | "default": settings.colors.COLOR_TEXT, 30 | 31 | # TITLE 32 | "title": settings.colors.COLOR_DOCUMENT_TITLE, 33 | 34 | # HEADINGS 35 | "heading 1": settings.colors.COLOR_SECTION_HEADING_1, 36 | "heading 2": settings.colors.COLOR_SECTION_HEADING_2, 37 | "heading 3": settings.colors.COLOR_SECTION_HEADING_3, 38 | "heading 4": settings.colors.COLOR_SECTION_HEADING_4, 39 | "heading 5": settings.colors.COLOR_SECTION_HEADING_5, 40 | "heading 6": settings.colors.COLOR_SECTION_HEADING_6, 41 | "heading 7": settings.colors.COLOR_SECTION_HEADING_7, 42 | "heading 8": settings.colors.COLOR_SECTION_HEADING_8, 43 | "heading 9": settings.colors.COLOR_SECTION_HEADING_9, 44 | 45 | # HEADERS AND FOOTERS 46 | "header": settings.colors.COLOR_HEADER, 47 | "footer": settings.colors.COLOR_FOOTER, 48 | 49 | # LIST 50 | "list": settings.colors.COLOR_LIST, 51 | 52 | # TOC 53 | "toc": settings.colors.COLOR_TOC, 54 | 55 | # BIBLIOGRAPHY 56 | "bibliography": settings.colors.COLOR_BIBLIOGRAPHY, 57 | 58 | # QUOTE 59 | "quote": settings.colors.COLOR_QUOTE, 60 | "intense quote": settings.colors.COLOR_QUOTE, 61 | 62 | # CAPTIONS 63 | "caption": settings.colors.COLOR_TABLE_CAPTIONS, 64 | 65 | # FOOTNOTES 66 | "footnote": 
settings.colors.COLOR_FOOTNOTE, 67 | 68 | # ANNOTATION 69 | "annotation": settings.colors.COLOR_ANNOTATION, 70 | } 71 | -------------------------------------------------------------------------------- /app/src/annotation/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import pathlib 3 | import yaml 4 | 5 | 6 | @dataclass 7 | class AnnotationConfig: 8 | # image config 9 | image_format: str 10 | image_height: int 11 | image_width: int 12 | image_dpi: int 13 | 14 | # decompression bomb checks 15 | max_decompress_ratio: float 16 | max_image_pixels: int 17 | 18 | # documents 19 | max_doc_bytes: int 20 | max_doc_pages: int 21 | 22 | # time limits 23 | annotation_timeout_secs: int 24 | annotation_cleanup_secs: int 25 | 26 | # data org 27 | max_bytes_in_shard: int 28 | 29 | # language 30 | top_k_languages: int 31 | 32 | # libreoffice 33 | unoserver_start_timeout: int 34 | unoconvert_timeout: int 35 | soffice_launch_timeout: int 36 | soffice_launch_ping_interval: float 37 | 38 | # entity detection 39 | max_heading_len: int 40 | form_field_min_length: int 41 | 42 | # entity relations 43 | bbox_relation_overlap_threshold: float 44 | bbox_relation_scale_threshold: float 45 | bbox_relation_closeness_threshold: float 46 | word_2_entity_overlap_threshold: float 47 | 48 | # annotation config 49 | min_text_chars: int 50 | 51 | 52 | def load_config(fp: pathlib.Path) -> AnnotationConfig: 53 | with fp.open(mode='r') as f: 54 | data = yaml.safe_load(f) 55 | 56 | kwargs = {} 57 | for d in data.values(): 58 | kwargs.update({k.lower(): v for k, v in d.items()}) 59 | 60 | return AnnotationConfig(**kwargs) 61 | -------------------------------------------------------------------------------- /app/src/annotation/entity_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .entity_detector import EntityDetector 2 | from .detection import detect_entities_in_document 3 | -------------------------------------------------------------------------------- /app/src/annotation/entity_detection/detection.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os 4 | from typing import List, Dict, Tuple, Union 5 | import pathlib 6 | 7 | from src.annotation.annotation_objects import Entity 8 | from src.annotation.colorization import ColorizationHandler 9 | from src.annotation.entity_detection import EntityDetector 10 | from src.annotation.utils.identifiers import get_page_id 11 | from src.annotation.utils.pdf_utils import pdf_to_page_images_iterator 12 | 13 | 14 | def detect_entities_in_document( 15 | doc_id: str, 16 | temp_pdf_fp: Union[str, pathlib.Path], 17 | colorization_handler: ColorizationHandler, 18 | debug_dir: Union[str, pathlib.Path] = None, 19 | word_doc_fp: Union[str, pathlib.Path] = None, 20 | dpi: int = 100, 21 | size: Tuple[Union[int, None], Union[int, None]] = (None, None) 22 | ) -> Dict[str, Dict[int, List[Entity]]]: 23 | r"""Detect entities in a document. 
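# --- Editor's sketch (not part of the original file): load_config() above merges
# every top-level section of the YAML into a single kwargs dict with lowercased
# keys before constructing AnnotationConfig. The two-section YAML below is a
# hypothetical fragment that only illustrates the key handling; a real config
# must supply every AnnotationConfig field.
import yaml

_data = yaml.safe_load("""
image:
  IMAGE_FORMAT: png
  IMAGE_DPI: 100
documents:
  MAX_DOC_PAGES: 20
""")
_kwargs = {}
for _section in _data.values():
    _kwargs.update({k.lower(): v for k, v in _section.items()})
assert _kwargs == {"image_format": "png", "image_dpi": 100, "max_doc_pages": 20}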
24 | 25 | @param doc_id: id of the document 26 | @param temp_pdf_fp: path to pdf file 27 | @param colorization_handler: colorization handler containts the colors used 28 | for colorization 29 | @param debug_dir: path to save colorized image pages for debugging 30 | @param word_doc_fp: path to word document; this is only used for debugging 31 | @param dpi: resolution of the output image(s) 32 | @param size: size of the output image(s), uses the Pillow (width, height) 33 | standard. If one of width or height is set to None, the image 34 | aspect ratio is preserved. 35 | 36 | @return: Dict with page number as key and as value a dictionary with 37 | entity_category_id as key and list of entity objects for detected 38 | entities as value 39 | """ 40 | pages_entities = {} 41 | page_number = 1 # page number starts at 1 42 | 43 | # extract pages from pdf as images 44 | # ! important: output format needs to use lossless compression when 45 | # ! converting the colorized pdf to images. Otherwise, the entity 46 | # ! detection will be inaccurate. ALWAYS USE fmt="png"! 47 | for pages_block in pdf_to_page_images_iterator( 48 | pdf_fp=temp_pdf_fp, 49 | fmt="png", 50 | size=size, 51 | dpi=dpi, 52 | output_folder=None 53 | ): 54 | for page in pages_block: 55 | # convert to cv2 format with HSV color space 56 | page = np.array(page).astype(np.uint8) 57 | page_cv2 = cv2.cvtColor(page, cv2.COLOR_RGB2HSV) 58 | 59 | if debug_dir is not None: 60 | fn_root = os.path.splitext(os.path.split(word_doc_fp)[-1])[0] 61 | debug_save_as = os.path.join( 62 | debug_dir, f"colorized_{fn_root}_p{page_number}.png" 63 | ) 64 | cv2.imwrite( 65 | debug_save_as, cv2.cvtColor(page, cv2.COLOR_RGB2BGR) 66 | ) 67 | 68 | # detect entities in page: this function returns a dictionary with 69 | # entity_category as key and list of bounding boxes for detected 70 | # entities as value 71 | page_id = get_page_id(doc_id, page_number) 72 | entities = _detect_entities_on_page( 73 | doc_id=doc_id, 74 | page_id=page_id, 75 | page_num=page_number, 76 | page_image=page_cv2, 77 | colorization_handler=colorization_handler 78 | ) 79 | pages_entities[page_id] = entities 80 | page_number += 1 81 | 82 | return pages_entities 83 | 84 | 85 | def _detect_entities_on_page( 86 | doc_id: str, 87 | page_id: str, 88 | page_num: int, 89 | page_image: np.ndarray, 90 | colorization_handler: ColorizationHandler 91 | ) -> Dict[int, List[Entity]]: 92 | r"""Detect entities in a page. 93 | 94 | @param page_image: page to detect entities in; ! 
this needs to be a cv2 95 | image in HSV color space 96 | @param colorization_handler: colorization handler containts the colors used 97 | for colorization 98 | 99 | @return: Dictionary with entity_category_id as key and list of entity 100 | objects for detected entities as value 101 | """ 102 | entity_detector = EntityDetector( 103 | doc_id=doc_id, page_id=page_id, page_num=page_num, 104 | image_numpy=page_image, colorization_handler=colorization_handler 105 | ) 106 | return entity_detector.detect_entities() 107 | -------------------------------------------------------------------------------- /app/src/annotation/entity_detection/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /app/src/annotation/language_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import predict_lang_per_page 2 | from .inference import predict_lang 3 | -------------------------------------------------------------------------------- /app/src/annotation/language_detection/inference.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | import fasttext 3 | 4 | from src.annotation.annotation_objects import Word 5 | import src.annotation.language_detection.utils as lang_utils 6 | 7 | 8 | def predict_lang_per_page( 9 | pages_words: Dict[str, List[Word]], 10 | k: int, 11 | lm: fasttext.FastText._FastText = None 12 | ) -> Dict[str, Dict[str, float]]: 13 | r""" Detects top-k languages occuring in text using the fasttext model 14 | trained on trained on data from Wikipedia, Tatoeba and SETimes. 15 | 16 | Reference: https://fasttext.cc/docs/en/language-identification.html 17 | 18 | @param pages_words: dictionary mapping page ids to list of words 19 | @param k: number of predictions to return, defaults to 5 20 | @param lm: language model, defaults to None, in which case it is loaded in 21 | the function 22 | 23 | @return: dictionary mapping page ids to list of predicted languages and 24 | list of corresponding confidence scores 25 | """ 26 | if lm is None: 27 | lm = lang_utils.load_lang_model(version="ftz") 28 | 29 | pages_langs = {} 30 | for page_id, page_words in pages_words.items(): 31 | page_text = " ".join([word.text for word in page_words]) 32 | pages_langs[page_id] = predict_lang(page_text, k=k, lm=lm) 33 | 34 | return pages_langs 35 | 36 | 37 | def _clean_text(text: str) -> str: 38 | return " ".join(text.strip().lower().split()) 39 | 40 | 41 | def predict_lang(text: str, k: int, lm=None) -> Dict[str, float]: 42 | # clean text 43 | text = _clean_text(text) 44 | 45 | if len(text) == 0: 46 | return {"__label__unknown": 1.0} 47 | 48 | if lm is None: 49 | lm = lang_utils.load_lang_model(version="ftz") 50 | 51 | # predict language 52 | tags, confs = lm.predict(text, k=k) 53 | 54 | # convert predictions to dictionary 55 | langs: Dict[str, float] = { 56 | lang: float(conf) for lang, conf in zip(tags, confs) 57 | } 58 | 59 | return langs 60 | -------------------------------------------------------------------------------- /app/src/annotation/language_detection/utils.py: -------------------------------------------------------------------------------- 1 | import iso639 2 | import fasttext 3 | from pathlib import Path 4 | from settings.filesystem import FASTTEXT_CLASSIFIERS_DIR 5 | 6 | # suppress fasttext warning 7 | fasttext.FastText.eprint = lambda x: 
None 8 | 9 | 10 | def lang_code_to_name(lang_code: str) -> str: 11 | r""" Convert language iso639 code to human readable language name. """ 12 | try: 13 | return iso639.to_name(lang_code) 14 | except iso639.NonExistentLanguageError: 15 | return "unknown" 16 | 17 | 18 | def load_lang_model(version: str = "bin") -> fasttext.FastText._FastText: 19 | r""" Load language model. """ 20 | if version.lower() == "bin": 21 | return fasttext.load_model( 22 | path=str(Path(FASTTEXT_CLASSIFIERS_DIR, "lid.176.bin")) 23 | ) 24 | elif version.lower() == "ftz": 25 | return fasttext.load_model( 26 | path=str(Path(FASTTEXT_CLASSIFIERS_DIR, "lid.176.ftz")) 27 | ) 28 | else: 29 | raise ValueError(f"Invalid fasttext model version {version}") 30 | -------------------------------------------------------------------------------- /app/src/annotation/oxml_metadata.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Module to get metadata that can be acquired when viewing the XML of a word 3 | document 4 | """ 5 | 6 | from typing import List 7 | from docx.document import Document 8 | 9 | 10 | class OXMLMetadata: 11 | r""" 12 | Class for metadata not directly originating from annotation, but intrinsic 13 | to one document. 14 | """ 15 | languages_autocorrect: List[str] 16 | template_name: str 17 | 18 | # !!! IMPORTANT INFO (also has category, subject, title, status) 19 | # https://python-docx.readthedocs.io/en/latest/api/document.html#coreproperties-objects 20 | core_category: str 21 | core_comments: str 22 | core_content_status: str 23 | core_created: str 24 | core_identifier: str 25 | core_keywords: str 26 | core_last_printed: str 27 | core_modified: str 28 | core_subject: str 29 | core_title: str 30 | core_version: str 31 | 32 | 33 | def get_langs(doc: Document) -> List[str]: 34 | # get w:lang tags 35 | lang_tags = doc.element.body.xpath("//w:lang") 36 | lang_list = [] 37 | for tag in lang_tags: 38 | for k, v in tag.items(): 39 | lang_list.append(v) 40 | return list(set(lang_list)) 41 | 42 | 43 | def get_oxml_metadata(doc: Document) -> OXMLMetadata: 44 | data = OXMLMetadata() 45 | data.languages_autocorrect = get_langs(doc) 46 | 47 | core = doc.core_properties 48 | data.core_category = core.category 49 | data.core_comments = core.comments 50 | data.core_content_status = core.content_status 51 | data.core_created = core.created 52 | data.core_identifier = core.identifier 53 | data.core_keywords = core.keywords 54 | data.core_last_printed = core.last_printed 55 | data.core_modified = core.modified 56 | data.core_subject = core.subject 57 | data.core_title = core.title 58 | data.core_version = core.version 59 | 60 | return data 61 | -------------------------------------------------------------------------------- /app/src/annotation/postprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .postprocess import postprocess_words 2 | from .postprocess import postprocess_entities 3 | from .postprocess import postprocess_entities_content_based 4 | from .postprocess import postprocess_tables 5 | -------------------------------------------------------------------------------- /app/src/annotation/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/preprocessing/__init__.py -------------------------------------------------------------------------------- 
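# --- Editor's usage sketch (not part of the original file): running the fasttext
# language detector from language_detection/ shown above. Assumes lid.176.ftz has
# been downloaded into FASTTEXT_CLASSIFIERS_DIR (see resources/fasttext-models);
# the sample sentence is arbitrary.
from src.annotation.language_detection import predict_lang
from src.annotation.language_detection.utils import lang_code_to_name

langs = predict_lang("Dies ist ein kurzer Beispieltext.", k=2)
for tag, conf in langs.items():
    # fasttext returns tags of the form "__label__de"
    print(lang_code_to_name(tag.replace("__label__", "")), round(conf, 3))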
/app/src/annotation/preprocessing/highlighting.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as DocxDocument 2 | from docx.table import _Cell 3 | 4 | 5 | def sanitize_highlighting(word_doc: DocxDocument) -> DocxDocument: 6 | r"""Remove highlighting from a word document, as this interferes with our 7 | colorization based annotation process. 8 | 9 | @param word_doc: word document instance 10 | 11 | @return: sanitized word document instance 12 | """ 13 | for para in word_doc.paragraphs: 14 | # Iterate over all runs in the paragraph 15 | for run in para.runs: 16 | # Check if the run has highlighting 17 | if run.font.highlight_color is not None: 18 | # Remove the highlighting 19 | run.font.highlight_color = None 20 | 21 | # Iterate over all tables in the document 22 | for table in word_doc.tables: 23 | for row in table.rows: 24 | try: 25 | row_cells = row.cells 26 | except IndexError: 27 | row_cells = [_Cell(tc, table) for tc in row._tr.tc_lst] 28 | for cell in row_cells: 29 | for para in cell.paragraphs: 30 | for run in para.runs: 31 | # remove highlighting 32 | if run.font.highlight_color is not None: 33 | run.font.highlight_color = None 34 | 35 | return word_doc 36 | -------------------------------------------------------------------------------- /app/src/annotation/sanity_checks.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Dict, Tuple, Set 3 | 4 | from src.exceptions import * 5 | 6 | ASPECT_RATIO_TOL = 1e-2 7 | 8 | 9 | def pages_aspect_ratios( 10 | page_dims_pdf_parser: Dict[str, Tuple[int, int]], 11 | page_dims_renderings: Dict[str, Tuple[int, int]] 12 | ): 13 | r""" checks that the aspect ratios of the pages in the PDF file are 14 | consistent with the aspect ratios of the pages in the rendered page images. 15 | 16 | @param page_dims_pdf_parser: dictionary mapping page_ids to tuples 17 | containing the width and height of the page 18 | @param page_dims_renderings: dictionary mapping page_ids to tuples 19 | containing the width and height of the page 20 | 21 | @raises InconsistentAspectRatiosError: if the aspect ratios are not 22 | consistent 23 | """ 24 | for pg_key in page_dims_pdf_parser.keys(): 25 | # compute aspect ratios 26 | aspect_ratio_pdf = \ 27 | page_dims_pdf_parser[pg_key][0] / page_dims_pdf_parser[pg_key][1] 28 | aspect_ratio_renderings = \ 29 | page_dims_renderings[pg_key][0] / page_dims_renderings[pg_key][1] 30 | 31 | if not math.isclose( 32 | aspect_ratio_renderings, aspect_ratio_pdf, 33 | rel_tol=ASPECT_RATIO_TOL 34 | ): 35 | raise InconsistentAspectRatiosError( 36 | aspect_ratio_pdf, aspect_ratio_renderings 37 | ) 38 | 39 | 40 | def page_counts_consistency( 41 | pages_from_entity_detection: Set, 42 | pages_from_pdf_parser: Set 43 | ): 44 | r""" checks that the page numberings are consistent between the entity 45 | detection and the pdf parser. 
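# --- Editor's worked example (not part of the original file) for the check in
# pages_aspect_ratios() above: a US-Letter page reported as 612x792 pt by the PDF
# parser and rendered at 850x1100 px has the same aspect ratio, so the isclose()
# test with rel_tol=1e-2 passes. The page sizes are illustrative.
import math

assert math.isclose(612 / 792, 850 / 1100, rel_tol=1e-2)
# a mismatch (e.g. a rendering cropped to 850x1000 px) would trigger
# InconsistentAspectRatiosError
assert not math.isclose(612 / 792, 850 / 1000, rel_tol=1e-2)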
46 | 47 | @param pages_from_entity_detection: dictionary mapping page numbers to 48 | dictionaries mapping entity ids to lists of bounding boxes 49 | @param pages_from_pdf_parser: dictionary mapping page numbers to lists of 50 | words 51 | 52 | @raises InconsistentPageCountError: if the page numberings are inconsistent 53 | """ 54 | if pages_from_entity_detection != pages_from_pdf_parser: 55 | raise InconsistentPageCountError( 56 | expected=pages_from_entity_detection, 57 | actual=pages_from_pdf_parser 58 | ) 59 | -------------------------------------------------------------------------------- /app/src/annotation/soffice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/soffice/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/soffice/utils.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | import psutil 4 | from typing import Union 5 | 6 | __all__ = [ 7 | "get_soffice_process_on_port", 8 | "get_free_port" 9 | ] 10 | 11 | 12 | def get_soffice_process_on_port(port) -> Union[psutil.Process, None]: 13 | """ function returns the soffice process object on the given port or 14 | None if no process is running on the given port. 15 | """ 16 | for proc in psutil.process_iter(): 17 | try: 18 | name = proc.name() 19 | except ( 20 | psutil.NoSuchProcess, 21 | psutil.AccessDenied, 22 | psutil.ZombieProcess 23 | ): 24 | continue 25 | 26 | if not name.startswith("soffice"): 27 | continue 28 | 29 | try: 30 | connections = proc.connections() 31 | except ( 32 | psutil.NoSuchProcess, 33 | psutil.AccessDenied, 34 | psutil.ZombieProcess 35 | ): 36 | continue 37 | 38 | for conn in connections: 39 | if ( 40 | conn.status == psutil.CONN_LISTEN and 41 | conn.laddr.port == port 42 | ): 43 | return proc 44 | 45 | return None 46 | 47 | 48 | def get_free_port(): 49 | r""" function returns a free port on the current machine """ 50 | with socket.socket() as s: 51 | s.bind(("", 0)) 52 | return s.getsockname()[1] 53 | -------------------------------------------------------------------------------- /app/src/annotation/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/text/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/text/text_entity_matching.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Dict, List 3 | 4 | from src.annotation.utils.bbox_utils import area_of_overlap 5 | from src.annotation.annotation_objects import Entity, Word, BoundingBox 6 | 7 | 8 | def assign_entities_to_words( 9 | pages_entities: Dict[str, Dict[int, List[Entity]]], 10 | pages_words: Dict[str, List[Word]], 11 | threshold: float 12 | ) -> Dict[str, List[Word]]: 13 | r""" Assigns entities to words based on on whether the word bounding 14 | box is overlapping with the entity bounding box by at least the threshold. 15 | 16 | @param pages_entities: dictionary with page_ids as keys and dictionary of 17 | entities as value. The dictionary of entities is indexed by entity 18 | category and contains a list of entity objects. 
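# --- Editor's usage sketch (not part of the original file) for the helpers in
# soffice/utils.py above: pick a free port and check whether a LibreOffice process
# is already listening on it. Purely illustrative; no server is started here.
from src.annotation.soffice.utils import get_free_port, get_soffice_process_on_port

port = get_free_port()
proc = get_soffice_process_on_port(port)
if proc is None:
    print(f"port {port} is free; a soffice/unoserver instance could be launched here")
else:
    print(f"soffice already listening on port {port} (pid {proc.pid})")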
19 | @param pages_words: dictionary with page_ids as keys and list of words as 20 | value. 21 | @param threshold: threshold for overlap between word and entity bounding 22 | boxes. 23 | 24 | @return: dictionary with page_ids as keys and list of words as value. 25 | """ 26 | for page_id in pages_words.keys(): 27 | words = pages_words[page_id] 28 | entities = pages_entities[page_id] 29 | for word in words: 30 | # find candidate entities for word 31 | candidate_entities = list(itertools.chain(*[ 32 | _find_candidate_entities( 33 | word=word, entities=entity_lst, threshold=threshold 34 | ) for entity_lst in entities.values() 35 | ])) 36 | 37 | # assign candidate entities to word 38 | for entity in candidate_entities: 39 | word.entity_ids.append(entity.id) 40 | word.entity_categories.append(entity.entity_category) 41 | 42 | return pages_words 43 | 44 | 45 | def _find_candidate_entities( 46 | word: Word, entities: List[Entity], threshold: float 47 | ) -> List[Entity]: 48 | r"""Finds candidate entities for a word based on whether the word bounding 49 | box is overlapping with the entity bounding box by at least threshold. 50 | 51 | @param word: word object to find candidate entities for 52 | @param entities: list of entity objects 53 | @param threshold: threshold for overlap 54 | 55 | @return: list of candidate entities 56 | """ 57 | return list(filter( 58 | lambda e: is_contained_in(word.bbox, e.bbox, threshold=threshold), 59 | entities 60 | )) 61 | 62 | 63 | def is_contained_in( 64 | bbox1: BoundingBox, bbox2: BoundingBox, threshold: float 65 | ) -> bool: 66 | r""" Checks whether bbox1 is contained in bbox2 by at least the threshold. 67 | 68 | @param bbox1: first bounding box 69 | @param bbox2: second bounding box 70 | @param threshold: threshold for overlap 71 | 72 | @return: True if the two bounding boxes are overlapping by at least the 73 | threshold, False otherwise 74 | """ 75 | if bbox1.area <= 0: 76 | return False 77 | 78 | overlap = area_of_overlap(bbox1, bbox2) 79 | ratio = overlap / bbox1.area 80 | 81 | return ratio >= threshold 82 | -------------------------------------------------------------------------------- /app/src/annotation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .color_utils import hsv_to_rgb, hsv_to_bgr 2 | -------------------------------------------------------------------------------- /app/src/annotation/utils/bbox_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import math 3 | import numpy as np 4 | from typing import Tuple, List 5 | 6 | from src.annotation.annotation_objects import BoundingBox 7 | 8 | 9 | def area_of_overlap( 10 | bbox1: BoundingBox, 11 | bbox2: BoundingBox 12 | ) -> float: 13 | r"""calculates the area of overlap between two bounding boxes 14 | 15 | @param bbox1: tuple of floats (x, y, w, h) indicating top-left corner 16 | (x, y), height h, and width w of the first bounding box 17 | @param bbox2: tuple of floats (x, y, w, h) indicating top-left corner 18 | (x, y), height h, and width w of the second bounding box 19 | 20 | returns: a float indicating the area of intersection between the two 21 | bounding boxes 22 | """ 23 | x1, y1, w1, h1 = bbox1.box 24 | x2, y2, w2, h2 = bbox2.box 25 | 26 | # determine coordinates of intersection triangle 27 | x_left = max(x1, x2) 28 | x_right = min(x1 + w1, x2 + w2) 29 | y_top = max(y1, y2) 30 | y_bottom = min(y1 + h1, y2 + h2) 31 | 32 | if x_right < x_left or y_bottom < y_top: 33 | return 
0.0 34 | 35 | # The intersection of two axis-aligned bounding boxes is always an 36 | # axis-aligned bounding box 37 | intersection_area = (x_right - x_left) * (y_bottom - y_top) 38 | 39 | return intersection_area 40 | 41 | 42 | def euclidean_distance( 43 | bbox1: Tuple[float, float, float, float], 44 | bbox2: Tuple[float, float, float, float] 45 | ) -> float: 46 | r"""calculates the euclidean distance between two bounding boxes 47 | 48 | @param bbox1: tuple of floats (x, y, w, h) indicating top-left corner 49 | (x, y), height h, and width w of the first bounding box 50 | @param bbox2: tuple of floats (x, y, w, h) indicating top-left corner 51 | (x, y), height h, and width w of the second bounding box 52 | 53 | returns: a float indicating the euclidean distance between the two 54 | bounding boxes 55 | """ 56 | 57 | x1, y1, w1, h1 = bbox1 58 | x2, y2, w2, h2 = bbox2 59 | 60 | left = x1 + w1 < x2 61 | right = x2 + w2 < x1 62 | bottom = y2 + h2 < y1 63 | top = y1 + h1 < y2 64 | 65 | if top and left: 66 | return math.dist([x1 + w1, y1 + h1], [x2, y2]) 67 | elif left and bottom: 68 | return math.dist([x1 + w1, y1], [x2, y2 + h2]) 69 | elif bottom and right: 70 | return math.dist([x1, y1], [x2 + w2, y2 + h2]) 71 | elif right and top: 72 | return math.dist([x1, y1 + h1], [x2 + w2, y2]) 73 | elif left: 74 | return x2 - (x1 + w1) 75 | elif right: 76 | return x1 - (x2 + w2) 77 | elif bottom: 78 | return y1 - (y2 + h2) 79 | elif top: 80 | return y2 - (y1 + h1) 81 | else: # rectangles intersect 82 | return 0. 83 | 84 | 85 | def is_contained_in( 86 | bbox1: BoundingBox, 87 | bbox2: BoundingBox, 88 | ) -> bool: 89 | r"""determines whether bbox1 is contained in bbox2 90 | 91 | @param bbox1: tuple of floats (x, y, w, h) indicating top-left corner 92 | (x, y), height h, and width w of the first bounding box 93 | @param bbox2: tuple of floats (x, y, w, h) indicating top-left corner 94 | (x, y), height h, and width w of the second bounding box 95 | 96 | @return: True if bbox1 is contained in bbox2, False otherwise 97 | """ 98 | # determine the area of the first bounding box 99 | _, _, w1, h1 = bbox1.box 100 | area_bbox1 = w1 * h1 101 | 102 | intersection_area = area_of_overlap(bbox1, bbox2) 103 | 104 | # bbox1 is contained in bbox2 if the area of intersection is equal to the 105 | # area of bbox 1 106 | return math.isclose(intersection_area, area_bbox1) 107 | 108 | 109 | def detect_contours( 110 | image: np.array, 111 | lowerb: Tuple[int, int, int], 112 | upperb: Tuple[int, int, int] 113 | ) -> Tuple[List, List]: 114 | r""" utility function: detects contours in the image for values that fall 115 | in the range lowerb, upperb. 
116 | 117 | @param image: image where contours are detected 118 | @param lowerb: lower bound of the range 119 | @param upperb: upper bound of the range 120 | 121 | @return: a tuple of two lists: the first list contains the contours, the 122 | second list contains the hierarchy 123 | """ 124 | # create mask where values are in range 125 | mask = cv2.inRange(image, lowerb=lowerb, upperb=upperb) 126 | 127 | # get contours in mask 128 | contours, hierarchy = cv2.findContours( 129 | mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE 130 | ) 131 | 132 | return contours, hierarchy 133 | -------------------------------------------------------------------------------- /app/src/annotation/utils/color_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from docx.document import Document as DocxDocument 4 | from docx.oxml import OxmlElement 5 | from docx.oxml.ns import qn 6 | from docx.text.paragraph import Paragraph 7 | 8 | from typing import Tuple 9 | 10 | 11 | def rgb_to_hex(rgb_color: Tuple[int, int, int]) -> str: 12 | r"""convert rgb colors to hex 13 | 14 | @param rgb_color: a tuple of 3 values (r, g, b) 15 | 16 | @return: a hex string 17 | """ 18 | rgb_color = tuple(int(c) % 256 for c in np.squeeze(rgb_color)) 19 | 20 | if len(rgb_color) != 3: 21 | raise ValueError( 22 | "rgb color must consist of 3 positive numbers! " 23 | "got {}".format(len(rgb_color)) 24 | ) 25 | 26 | return "#%02x%02x%02x" % rgb_color 27 | 28 | 29 | def hsv_to_rgb(hsv_color: Tuple[int, int, int]) -> Tuple[int, int, int]: 30 | r"""convert hsv colors to rgb 31 | 32 | @param hsv_color: a tuple of 3 values (h, s, v) 33 | 34 | @return: a tuple of 3 values (r, g, b) 35 | """ 36 | hsv_color_uint8 = np.uint8(hsv_color) 37 | 38 | # TODO: remove this ambiguity: in the future only a tuple of 3 values is 39 | # accepted 40 | if len(hsv_color_uint8.shape) == 1: 41 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=[0, 1]) 42 | elif len(hsv_color_uint8.shape) == 2: 43 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=0) 44 | else: 45 | raise ValueError( 46 | "! Warning: hsv color has shape {}; this function " 47 | "excpects hsv_color to be a tuple of 3 values".format( 48 | hsv_color_uint8.shape) 49 | ) 50 | 51 | return tuple( 52 | cv2.cvtColor(hsv_color_uint8, cv2.COLOR_HSV2RGB) 53 | .squeeze() 54 | .astype(int) 55 | .tolist() 56 | ) 57 | 58 | 59 | def hsv_to_bgr(hsv_color: Tuple[int, int, int]) -> Tuple[int, int, int]: 60 | r"""convert hsv colors to bgr 61 | 62 | @param hsv_color: a tuple of 3 values (h, s, v) 63 | 64 | @return: a tuple of 3 values (r, g, b) 65 | """ 66 | hsv_color_uint8 = np.uint8(hsv_color) 67 | 68 | # TODO: remove this ambiguity: in the future only a tuple of 3 values is 69 | # accepted 70 | if len(hsv_color_uint8.shape) == 1: 71 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=[0, 1]) 72 | elif len(hsv_color_uint8.shape) == 2: 73 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=0) 74 | else: 75 | raise ValueError( 76 | "! 
Warning: hsv color has shape {}; this function " 77 | "excpects hsv_color to be a tuple of 3 values".format( 78 | hsv_color_uint8.shape) 79 | ) 80 | 81 | return tuple( 82 | cv2.cvtColor(hsv_color_uint8, cv2.COLOR_HSV2BGR) 83 | .squeeze() 84 | .astype(int) 85 | .tolist() 86 | ) 87 | 88 | 89 | def sanitize_figure_settings(document: DocxDocument): 90 | r""" 91 | Removing all child entries of the `a:blip xml` element 92 | ensures that all figures are loaded as-is with no rendering mods, 93 | enabling our figure-detection method to work 94 | 95 | @param document: the document to sanitize 96 | """ 97 | fig_blip_elements = document.element.body.xpath(".//pic:blipFill//a:blip") 98 | # delete the child elements of this 99 | for blip_wrapper in fig_blip_elements: 100 | for img_mod_child in blip_wrapper.getchildren(): 101 | blip_wrapper.remove(img_mod_child) 102 | 103 | 104 | def shade_element(prop, color_hex): 105 | r""" Apply shading to an element """ 106 | color_hex = color_hex.replace('#', '').upper() 107 | shd = OxmlElement("w:shd") 108 | shd.set(qn("w:fill"), color_hex) 109 | prop.append(shd) 110 | 111 | 112 | def check_if_par_is_numbered(par: Paragraph) -> bool: 113 | r""" 114 | Check if a par is numbered, which we assume to indicate a list. 115 | 116 | @param par: the paragraph to check 117 | 118 | @return: True if the paragraph is numbered, False otherwise 119 | """ 120 | 121 | # a list style (even within a normal paragraph!) means numbering has 122 | # occured. 123 | par_xml_numbering = par._p.xpath(".//w:pPr//w:numPr") 124 | 125 | if len(par_xml_numbering) > 0: 126 | return True 127 | 128 | return False 129 | -------------------------------------------------------------------------------- /app/src/annotation/utils/docx_utils.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as DocxDocument 2 | from lxml import etree 3 | from typing import Union 4 | 5 | from src.exceptions import UnknownPageCountException 6 | 7 | 8 | def get_page_count(doc: DocxDocument) -> Union[int, None]: 9 | r""" Get page count from docx file. 10 | 11 | @param doc: docx document 12 | 13 | @return: page count or None if not found 14 | """ 15 | for part in doc._part.package.iter_parts(): 16 | if part.partname.endswith("app.xml"): 17 | app_etree = etree.fromstring(part._blob) 18 | break 19 | else: 20 | raise UnknownPageCountException("app.xml not found") 21 | 22 | # get pages from app.xml 23 | for child in app_etree: 24 | if child.tag.endswith("Pages"): 25 | if child.text is None: 26 | break 27 | pages = int(child.text) 28 | return pages 29 | 30 | raise UnknownPageCountException("`Pages` tag not found") 31 | -------------------------------------------------------------------------------- /app/src/annotation/utils/identifiers.py: -------------------------------------------------------------------------------- 1 | def get_page_id(doc_id: str, page_number: int) -> str: 2 | """ Generate a page id. """ 3 | return f"{doc_id}_p{page_number:05d}" 4 | 5 | 6 | def get_page_num_from_page_id(page_id: str) -> int: 7 | """ Extract the page number from a page id. """ 8 | return int(page_id.split("_p")[-1]) 9 | 10 | 11 | def get_doc_id(cc_dump_id: str, doc_number: int) -> str: 12 | """ Generate a document id. 
""" 13 | return f"doc_{cc_dump_id}_{doc_number:08d}" 14 | -------------------------------------------------------------------------------- /app/src/annotation/utils/pdf_utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import cv2 4 | import numpy as np 5 | import os 6 | from pdf2image import pdfinfo_from_path, convert_from_path 7 | from typing import Union, Tuple, Dict 8 | 9 | from src.annotation.utils.identifiers import get_page_id 10 | 11 | PDF2IMG_BLOCKSIZE = 4 12 | 13 | __all__ = [ 14 | "get_page_count_from_pdf", 15 | "pdf_to_page_images_iterator", 16 | "extract_page_images_and_dimensions_from_pdf" 17 | ] 18 | 19 | 20 | def get_page_count_from_pdf(pdf_fp: pathlib.Path) -> int: 21 | r"""Get number of pages from pdf file. 22 | 23 | @param pdf_fp: path to pdf file 24 | 25 | @return: number of pages 26 | """ 27 | pdf_info = pdfinfo_from_path(pdf_fp, userpw=None, poppler_path=None) 28 | return int(pdf_info["Pages"]) 29 | 30 | 31 | def pdf_to_page_images_iterator( 32 | pdf_fp: str, 33 | fmt: str, 34 | dpi: int, 35 | size: Tuple[Union[int, None], Union[int, None]], 36 | output_folder: Union[str, None] 37 | ): 38 | r"""Iterate over pages of a pdf file. The function creates pages in batches 39 | of `block_size` pages. This is to avoid memory issues when converting 40 | large pdf files. 41 | 42 | @param pdf_fp: path to pdf file 43 | @param fmt: output format; this should be a lossless format when the 44 | function is used for entity detection. 45 | @param dpi: resolution of the output image(s) 46 | @param size: size of the output image(s), uses the Pillow (width, height) 47 | standard. If one of width or height is set to None, the image 48 | aspect ratio is preserved. 49 | @param output_folder: path to output folder 50 | 51 | @return: iterator over pages 52 | """ 53 | pdf_info = pdfinfo_from_path(pdf_fp, userpw=None, poppler_path=None) 54 | num_pages = pdf_info["Pages"] 55 | for page in range(1, num_pages + 1, PDF2IMG_BLOCKSIZE): 56 | # ! important: output format needs to use lossless compression when 57 | # ! converting the colorized pdf to images. Otherwise, the entity 58 | # ! detection will be inaccurate. 59 | yield convert_from_path( 60 | pdf_path=pdf_fp, 61 | size=size, 62 | dpi=dpi, 63 | first_page=page, 64 | thread_count=4, 65 | last_page=min(page + PDF2IMG_BLOCKSIZE - 1, num_pages), fmt=fmt, 66 | output_folder=output_folder 67 | ) 68 | 69 | 70 | def extract_page_images_and_dimensions_from_pdf( 71 | doc_id: str, 72 | pdf_fp: Union[str, pathlib.Path], 73 | target_dir: Union[str, pathlib.Path], 74 | fmt: str, 75 | dpi: int, 76 | size: Tuple[Union[int, None], Union[int, None]] 77 | ) -> Tuple[Dict[str, str], Dict[str, Tuple[int, int]]]: 78 | r"""Extract page images and dimensions from a pdf file. 79 | 80 | Note: Currently, this function saves individual page images to the 81 | target_dir directory. This will be removed in the future as want to 82 | write the images directly from memory into tar archives. 83 | 84 | @param doc_id: document id 85 | @param pdf_fp: path to pdf file 86 | @param target_dir: path to target directory 87 | @param fmt: output format; this should be a lossless format when the 88 | function is used for entity detection. 89 | @param dpi: resolution of the output image(s) 90 | @param size: size of the output image(s), uses the Pillow (width, height) 91 | standard. If one of width or height is set to None, the image 92 | aspect ratio is preserved. 
93 | 94 | @return: dict with page_id as keys and paths to extracted images as 95 | value, dict with page_id as keys and dimensions for each 96 | image/page as value 97 | """ 98 | image_paths = {} 99 | image_dimensions = {} 100 | page_number = 1 # page number starts at 1 101 | 102 | # extract pages from pdf as images 103 | for pages_block in pdf_to_page_images_iterator( 104 | pdf_fp=pdf_fp, 105 | fmt=fmt, 106 | dpi=dpi, 107 | size=size, 108 | output_folder=None 109 | ): 110 | for page_img in pages_block: 111 | # get page id 112 | page_id = get_page_id(doc_id, page_number) 113 | 114 | # convert to cv2 format with HSV color space 115 | page_img = np.array(page_img).astype(np.uint8) 116 | page_img = cv2.cvtColor(page_img, code=cv2.COLOR_RGB2BGR) 117 | 118 | # extract dimensions 119 | height, width, _ = page_img.shape 120 | image_dimensions[page_id] = (width, height) 121 | 122 | fp = os.path.join( 123 | target_dir, f"{page_id}.{fmt}" 124 | ) 125 | image_paths[page_id] = fp 126 | cv2.imwrite(fp, page_img) 127 | 128 | page_number += 1 129 | 130 | return image_paths, image_dimensions 131 | -------------------------------------------------------------------------------- /app/src/annotation/utils/zip_bomb.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import zipfile 3 | from pathlib import Path 4 | from PIL import Image 5 | from PIL import UnidentifiedImageError 6 | 7 | from src.exceptions import * 8 | 9 | __all__ = [ 10 | "get_uncompressed_file_size", 11 | "detect_image_decompression_bombs", 12 | "zip_bomb_check" 13 | ] 14 | 15 | # Limit images to around 32MB for a 24-bit (3 bpp) image 16 | MAX_IMAGE_PIXELS = int(1024 * 1024 * 1024 // 32 // 3) 17 | 18 | IMG_EXT = ( 19 | '.bmp', 20 | '.gif', 21 | '.jpeg', 22 | '.jpg', 23 | '.png', 24 | '.tiff', 25 | '.ico', 26 | '.pcx', 27 | '.ppm', 28 | '.pgm', 29 | '.pbm', 30 | '.pnm', 31 | '.webp', 32 | '.hdr', 33 | '.dds', 34 | '.im', 35 | '.eps', 36 | '.svg' 37 | ) 38 | 39 | 40 | def _compression_ratio(zip_file: zipfile.ZipFile): 41 | uncompressed_size = sum(zp.file_size for zp in zip_file.infolist()) 42 | compressed_size = sum(zp.compress_size for zp in zip_file.infolist()) 43 | 44 | if compressed_size == 0: 45 | return 0 46 | 47 | return uncompressed_size / compressed_size 48 | 49 | 50 | def get_uncompressed_file_size(doc_bytes: bytes, doc_fn: Path): 51 | # check if file is a valid zip file 52 | with BytesIO(doc_bytes) as f: 53 | if not zipfile.is_zipfile(f): 54 | raise NoZipFileException(f"{doc_fn} is not a valid zip file") 55 | 56 | # calculate uncompressed size 57 | with zipfile.ZipFile(f) as zf: 58 | uncompressed_size = sum(zp.file_size for zp in zf.infolist()) 59 | 60 | return uncompressed_size 61 | 62 | 63 | def detect_image_decompression_bombs(doc_bytes: bytes, doc_fn: Path): 64 | with BytesIO(doc_bytes) as f: 65 | if not zipfile.is_zipfile(f): 66 | raise NoZipFileException(f"{doc_fn} is not a valid zip file") 67 | 68 | # check if one of the images is a decompression bomb 69 | with zipfile.ZipFile(f) as zf: 70 | # check images in zip file 71 | for fp in zf.namelist(): 72 | if not fp.lower().endswith(IMG_EXT): 73 | continue 74 | 75 | img_bytes_compressed = zf.read(fp) 76 | 77 | try: 78 | Image.open(BytesIO(img_bytes_compressed)) 79 | except Image.DecompressionBombError as e: 80 | raise ImageDecompressionBombError( 81 | f"{doc_fn} -- Image decompression bomb detected: " 82 | "image pixels exceed max image pixels; " 83 | f"error:\n\t{e}" 84 | ) 85 | except Exception as e: 86 | 
print(f"[WARNING] reading image {fp} " 87 | f"failed with {e.__class__.__name__}: {e}") 88 | continue 89 | 90 | 91 | def zip_bomb_check( 92 | doc_bytes: bytes, threshold: float = 100, 93 | max_image_pixels=MAX_IMAGE_PIXELS 94 | ): 95 | Image.MAX_IMAGE_PIXELS = max_image_pixels 96 | 97 | with BytesIO(doc_bytes) as f: 98 | if not zipfile.is_zipfile(f): 99 | raise NoZipFileException(f"document is not a valid zip file") 100 | 101 | with zipfile.ZipFile(f, "r") as zip_file: 102 | cr = _compression_ratio(zip_file) 103 | 104 | if cr > threshold: 105 | raise ZipBombException(f"zip bomb detected: compression ratio" 106 | f" {cr} exceeds threshold {threshold}") 107 | 108 | # check images in zip file 109 | for fp in zip_file.namelist(): 110 | if ( 111 | not fp.startswith("word/media") or 112 | not fp.lower().endswith(IMG_EXT) 113 | ): 114 | continue 115 | 116 | img_bytes_compressed = zip_file.read(fp) 117 | 118 | try: 119 | Image.open(BytesIO(img_bytes_compressed)) 120 | except Image.DecompressionBombError as e: 121 | raise ImageDecompressionBombError( 122 | "Image decompression bomb detected: " 123 | "image pixels exceed max image pixels; " 124 | f"error:\n\t{e}" 125 | ) 126 | except UnidentifiedImageError as e: 127 | raise UnidentifiedImageError(e) 128 | except Exception as e: 129 | print(f"[WARNING] reading image {fp} " 130 | f"failed with {e.__class__.__name__}: {e}") 131 | continue 132 | -------------------------------------------------------------------------------- /app/src/cc_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/cc_processing/__init__.py -------------------------------------------------------------------------------- /app/src/cc_processing/deduplicate.py: -------------------------------------------------------------------------------- 1 | r"""Utility module, which globally deduplicates URLs obtained from CC dumps 2 | (i.e after running this script, a URL will show up only once globally accross all dump parquets) 3 | before passing them to download_docs_ray. Input are parquet files in CLEAN_URLS_DIR, 4 | output is globally deduplicated df.""" 5 | 6 | import pandas as pd 7 | import os 8 | from pathlib import Path 9 | 10 | def dedupe_urls(src_dir: str, input_df: pd.DataFrame) -> pd.DataFrame: 11 | r""" 12 | Deduplicate URLs globally: While processing a new URL dump, deduplicate against already processed dumps. 13 | @param src_dir: dir with parquets to deduplicate against. 14 | @param input_df: df of URLs currently being processed. 15 | @raises ValueError: No files in src_dir, or unexpected parquet format. 16 | 17 | return: Globally deduplicated df 18 | """ 19 | 20 | # list parquet files in src_dir 21 | unprocessed = list(filter(lambda x: x.endswith('.parquet'), os.listdir(src_dir))) 22 | initial_len = len(input_df) 23 | 24 | if (len(unprocessed) <= 0): 25 | raise ValueError("No parquet files found in " + src_dir) 26 | 27 | # build initial set (note: set lookup for contains is O(1)) 28 | # ! 
note individual parquets are already deduped on per-dump basis 29 | try: 30 | pqname = unprocessed.pop() 31 | curr_df = pd.read_parquet(Path(src_dir, pqname)) 32 | except Exception: 33 | raise ValueError("Cannot read initial parquet file from " + src_dir) 34 | try: 35 | url_hash_tracker = set(curr_df['url_hash']) 36 | except Exception: 37 | raise ValueError("Unexpected parquet format, url_hash required (in file) " + pqname) 38 | 39 | # go through each parquet file, and get the hashes 40 | while (len(unprocessed) > 0): 41 | pqname = unprocessed.pop() 42 | curr_df = pd.read_parquet(Path(src_dir, pqname)) 43 | try: 44 | url_hashes = set(curr_df['url_hash']) 45 | except Exception: 46 | raise ValueError("Unexpected parquet format, url_hash required (in file) " + pqname) 47 | # add to hashes we compare against 48 | url_hash_tracker = url_hash_tracker.union(url_hashes) 49 | 50 | # remove duplicates 51 | hash_series = pd.Series(list(url_hash_tracker)) 52 | out_df = input_df[~input_df['url_hash'].isin(hash_series)] 53 | end_len = len(out_df) 54 | 55 | print('Removed ' + str(initial_len - end_len) + ' duplicates through comparison with already processed dumps') 56 | return out_df -------------------------------------------------------------------------------- /app/src/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /app/src/data_sources/download_exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for custom exceptions that may be thrown whilst downloading documents. 3 | """ 4 | import settings 5 | from typing import Tuple, Union 6 | 7 | 8 | class InvalidContentType(Exception): 9 | def __init__(self, content_type): 10 | """ 11 | The HTTP content type header is not acceptable. 12 | """ 13 | self.content_type = content_type 14 | 15 | def __repr__(self): 16 | return "InvalidContentType({})".format(self.content_type) 17 | 18 | 19 | class FileSizeExceeded(Exception): 20 | def __init__(self, filesize): 21 | """ 22 | Attempted to download a document whose file size is too large. 23 | """ 24 | self.filesize = filesize 25 | 26 | def __repr__(self): 27 | return "FileSizeExceeded({})".format(self.filesize) 28 | 29 | 30 | class OleCheckFailed(Exception): 31 | def __init__(self, error): 32 | """ 33 | A safety check on a .docx or .doc file's OLE properties failed, so the document is not safe to download. 34 | (See Microsoft OLE documentation for safety of OLE properties, and maldoc_check.py for implementation) 35 | """ 36 | self.error = error 37 | 38 | def __repr__(self): 39 | return "OleCheckFailed({})".format(self.error) 40 | 41 | class HTTPError(Exception): 42 | def __init__(self, status_code=None): 43 | """ 44 | A benign HTTP error, signified by a status code. 45 | """ 46 | self.status_code = status_code 47 | 48 | def __repr__(self): 49 | return "HTTPError={}".format(self.status_code) 50 | 51 | """ 52 | Functions to check validity of downloads and requests 53 | """ 54 | 55 | def valid_content_type( 56 | content_type: str 57 | ) -> Tuple[ 58 | Union[str, None], Union[InvalidContentType, None] 59 | ]: 60 | """check if content type is valid; the check passes if either 61 | the content type is unknown or if the content type is known and is found to 62 | be valid.
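Illustrative usage sketch (which types are accepted is governed by settings.download.VALID_CT_REGEX; "application/msword" below is just an example input):

>>> valid_content_type(None)  # unknown content type is let through
(None, None)
>>> ct, err = valid_content_type("application/msword")
>>> err is None or isinstance(err, InvalidContentType)
True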
63 | 64 | @content_type: str content type 65 | return: bool, InvalidContentType exception or None 66 | """ 67 | if content_type is None: 68 | # unknown content type 69 | return content_type, None 70 | 71 | # sanitize content type string 72 | content_type = content_type.lower().replace('-', '') 73 | 74 | if settings.download.VALID_CT_REGEX.match(content_type) is None: 75 | return content_type, InvalidContentType(content_type=content_type) 76 | 77 | return content_type, None 78 | 79 | 80 | def valid_content_length( 81 | content_length: Union[str, None] 82 | ) -> Tuple[ 83 | Union[int, None], Union[FileSizeExceeded, None] 84 | ]: 85 | """check if content length is valid; this functions returns True if either 86 | the file size is known and below the maximally allowed file size, or if the 87 | file size is unknown. 88 | 89 | @content_length: str content length 90 | 91 | return: bool, FileSizeExceeded exception or None 92 | """ 93 | try: 94 | content_length = int(content_length) 95 | except (TypeError, ValueError): 96 | return content_length, None 97 | 98 | # check size of content 99 | if content_length > settings.download.MAX_FILESIZE: 100 | return content_length, FileSizeExceeded(filesize=content_length) 101 | 102 | return content_length, None -------------------------------------------------------------------------------- /app/src/data_sources/http_handlers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import settings 3 | from typing import Tuple, Union 4 | import requests 5 | from src.data_sources.download_exceptions import ( 6 | FileSizeExceeded, 7 | HTTPError, 8 | InvalidContentType, 9 | valid_content_length, 10 | valid_content_type 11 | ) 12 | 13 | 14 | def run_sess( 15 | sess_method: Union[requests.get, requests.head], 16 | timeout: int, 17 | allow_redirects: bool, url: str 18 | ) -> Tuple[requests.Response, Exception, int]: 19 | """run session 20 | @param sess_method: requests.get or requests.head 21 | @param timeout: int timeout 22 | @param allow_redirects: bool allow redirects 23 | @param url: str url 24 | 25 | return: requests.Response, Exception str, Timestamp int 26 | """ 27 | timestamp = int(time.time()) 28 | exception = None 29 | 30 | try: 31 | response = sess_method( 32 | url, timeout=timeout, allow_redirects=allow_redirects, stream=True 33 | ) 34 | except Exception as e: 35 | response = None 36 | exception = e 37 | 38 | return response, exception, timestamp 39 | 40 | 41 | def header_handler( 42 | response: requests.Response, 43 | exception: Exception 44 | ) -> Tuple[ 45 | Union[requests.Response, None], 46 | dict, 47 | Union[Exception, FileSizeExceeded, InvalidContentType, HTTPError, None] 48 | ]: 49 | """ handle header: check for valid content type and content length, and 50 | return header metadata 51 | 52 | @param response: requests.Response 53 | @param exception: Exception raised during call to requests.head 54 | 55 | return: requests.Response, dict, Exception 56 | """ 57 | header_metadata = {} 58 | 59 | # in this case, the download failed during run_sess, so we return the 60 | # original exception raise by the call to sess.head 61 | if response is None: 62 | return response, header_metadata, exception 63 | 64 | # in this case, the server sent a response, but the response code is not 65 | # 200, so we return the HTTPError exception 66 | if response.status_code != 200: 67 | return ( 68 | response, 69 | header_metadata, 70 | HTTPError(status_code=response.status_code) 71 | ) 72 | 73 | header_metadata = {k: 
response.headers.get(k, None) for k in 74 | settings.download.HEADER_FIELDS} 75 | 76 | # check for valid content length 77 | content_length, exception = valid_content_length( 78 | header_metadata['content-length'] 79 | ) 80 | header_metadata['content-length'] = content_length 81 | 82 | if exception is not None: 83 | return response, header_metadata, exception 84 | 85 | # check for valid content type 86 | content_type, exception = valid_content_type( 87 | header_metadata['content-type'] 88 | ) 89 | header_metadata['content-type'] = content_type 90 | 91 | return response, header_metadata, exception 92 | 93 | 94 | def body_handler( 95 | response: requests.Response, 96 | exception: Exception 97 | ) -> Tuple[ 98 | Union[requests.Response, None], 99 | dict, 100 | Union[Exception, HTTPError, FileSizeExceeded, None] 101 | ]: 102 | """ handle body: check if response is valid, fetch ip-address and content 103 | length, and return body metadata 104 | 105 | @param response: requests.Response 106 | @param exception: Exception raised during call to sess.get 107 | 108 | return: requests.Response, dict, Exception 109 | """ 110 | body_metadata = {} 111 | 112 | # in this case, the download failed during run_sess, so we return the 113 | # original exception raise by the call to sess.get 114 | if response is None: 115 | return response, body_metadata, exception 116 | 117 | if response.status_code != 200: 118 | return ( 119 | response, 120 | body_metadata, 121 | HTTPError(status_code=response.status_code) 122 | ) 123 | 124 | # get content length 125 | try: 126 | content_length = len(response.content) 127 | except TypeError: 128 | content_length = None 129 | 130 | content_length, exception = valid_content_length(content_length) 131 | body_metadata = { 132 | # dummy value for ip --> not collected 133 | 'ip_address': 0, 'content_length': content_length 134 | } 135 | 136 | return response, body_metadata, exception 137 | -------------------------------------------------------------------------------- /app/src/exceptions.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | __all__ = [ 4 | "InconsistentPageCountError", 5 | "InconsistentAspectRatiosError", 6 | "UnsupportedDocumentLayoutError", 7 | "ConversionFailedException", 8 | "SofficeStartFailed", 9 | "UnknownPageCountException", 10 | "PageCountExceededException", 11 | "ZipBombException", 12 | "NoZipFileException", 13 | "CompressedFileSizeExceededException", 14 | "UncompressedFileSizeExceededException", 15 | "ImageDecompressionBombError", 16 | "TextTooShortException" 17 | ] 18 | 19 | 20 | class InconsistentPageCountError(Exception): 21 | r"""Raised when the number of pages in the PDF file is not consistent with 22 | the number of pages in the annotated pdf. 23 | 24 | Note: If this error is raised, then it might indicate that the colorization 25 | step has interfered with the layout of the document! 26 | """ 27 | 28 | def __init__(self, expected: Set, actual: Set): 29 | self.expected = expected 30 | self.actual = actual 31 | super().__init__(f"Expected {expected} pages, but got {actual} pages.") 32 | 33 | def __repr__(self): 34 | return f"InconsistentPageCountError(" \ 35 | f"expected={self.expected}, actual={self.actual}" \ 36 | f")" 37 | 38 | 39 | class InconsistentAspectRatiosError(Exception): 40 | r"""Raised when the aspect ratios of the pages in the PDF file are not 41 | consistent with the aspect ratios of the pages in the rendered page images. 
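As an illustrative example: an A4 page rendered at 150 dpi is roughly 1240 x 1754 px, i.e. an aspect ratio of about 1.41; this error signals that the ratio taken from the PDF page and the ratio of the rendered image disagree (the numbers here are purely illustrative).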
42 | """ 43 | 44 | def __init__(self, ar_pdf, ar_img): 45 | self.ar_pdf = ar_pdf 46 | self.ar_img = ar_img 47 | super().__init__(f"Expected inconsistent aspect ratios:" 48 | f"got {ar_pdf} from pdf" 49 | f"and {ar_img} from renderings.") 50 | 51 | def __repr__(self): 52 | return f"InconsistentAspectRatiosError(" \ 53 | f"ar_pdf={self.ar_pdf}, ar_img={self.ar_img}" \ 54 | f")" 55 | 56 | 57 | class SofficeStartFailed(Exception): 58 | r"""Raised when the soffice process fails to start.""" 59 | pass 60 | 61 | 62 | class UnsupportedDocumentLayoutError(Exception): 63 | r"""Raised when the layout of the document is not supported, such as 64 | too many document columns, i.e. more than 3""" 65 | 66 | def __init__(self, msg: str): 67 | self.msg = msg 68 | 69 | def __repr__(self): 70 | return f"UnsupportedDocumentLayoutError(msg={self.msg})" 71 | 72 | 73 | class ConversionFailedException(Exception): 74 | r"""Raised when the conversion of a doc/docx file to a pdf file fails.""" 75 | pass 76 | 77 | 78 | class UnknownPageCountException(Exception): 79 | r"""Raised when the page count of a document cannot be determined.""" 80 | pass 81 | 82 | 83 | class PageCountExceededException(Exception): 84 | r"""Raised when the page count of a document exceeds the maximum allowed 85 | number of pages.""" 86 | pass 87 | 88 | 89 | class ZipBombException(Exception): 90 | r"""Raised when a zip bomb is detected.""" 91 | pass 92 | 93 | 94 | class NoZipFileException(Exception): 95 | r"""Raised when a file is not a zip file.""" 96 | pass 97 | 98 | 99 | class CompressedFileSizeExceededException(Exception): 100 | r"""Raised when a file size exceeds the maximum allowed file size.""" 101 | pass 102 | 103 | 104 | class UncompressedFileSizeExceededException(Exception): 105 | r"""Raised when an uncompressed file size exceeds the maximum allowed 106 | file size.""" 107 | pass 108 | 109 | 110 | class ImageDecompressionBombError(Exception): 111 | r"""Raised when an image decompression bomb is detected.""" 112 | pass 113 | 114 | 115 | class TextTooShortException(Exception): 116 | r"""Raised when the text of a document is too short.""" 117 | pass 118 | -------------------------------------------------------------------------------- /app/src/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/__init__.py -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/obj_detection/__init__.py -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/data_prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/obj_detection/data_prep/__init__.py -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/data_prep/wordscape_yolo_config_handler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import List, Tuple 3 | from src.extensions.obj_detection.data_prep.wordscape_yolo_formatter import 
YOLOSettings 4 | from pathlib import Path 5 | import settings 6 | 7 | 8 | def json_to_config(json_obj, first_key: str) -> YOLOSettings: 9 | # make elem_mergings 10 | elem_mergings_formatted = {} 11 | elem_mergings_json = json_obj[first_key]["elem_mergings"] 12 | if len(elem_mergings_json) == 0: 13 | elem_mergings_formatted = {"masters": {}, "mapping": {}} 14 | else: 15 | masters = {} 16 | for key in elem_mergings_json["masters"]: 17 | masters[int(key)] = elem_mergings_json["masters"][key] 18 | mapping = {} 19 | for key in elem_mergings_json["mapping"]: 20 | mapping[int(key)] = int(elem_mergings_json["mapping"][key]) 21 | 22 | elem_mergings_formatted = {"masters": masters, "mapping": mapping} 23 | 24 | # check if elem_accepts are defined by the provided JSON 25 | elem_accepts_base = settings.entities.LABEL_NUMS 26 | if ( 27 | ("elem_accepts" in json_obj[first_key].keys()) 28 | and (json_obj[first_key]["elem_accepts"] != None) 29 | and (len(json_obj[first_key]["elem_accepts"]) > 0) 30 | ): 31 | elem_accepts_base = json_obj[first_key]["elem_accepts"] 32 | 33 | base_settings = YOLOSettings( 34 | raw_path=Path(json_obj[first_key]["raw_path"]), 35 | is_validation=json_obj[first_key]["is_validation"], 36 | max_img=json_obj[first_key]["max_img"], 37 | elem_drops=json_obj[first_key]["elem_drops"], 38 | elem_mergings=elem_mergings_formatted, 39 | elem_accepts=elem_accepts_base, 40 | scanify=json_obj[first_key]["scanify"], 41 | quality_threshold=json_obj[first_key]["quality_threshold"], 42 | language_codes=json_obj[first_key]["language_codes"], 43 | language_code_threshold=json_obj[first_key]["language_code_threshold"], 44 | ) 45 | 46 | # check if there are element minimums defined. If not, return the base settings 47 | if ( 48 | ("elem_mins" in json_obj[first_key].keys()) 49 | and (json_obj[first_key]["elem_mins"] != None) 50 | and (len(json_obj[first_key]["elem_mins"]) > 0) 51 | ): 52 | settings_list = [] 53 | for elem_type in json_obj[first_key]["elem_mins"].keys(): 54 | setting_modified = copy.deepcopy(base_settings) 55 | # distinction: elem_drops (dont include in dataset) vs. elems that we require to be in a doc (for filter purposes) 56 | setting_modified.elem_accepts = [int(elem_type)] 57 | setting_modified.max_img = json_obj[first_key]["elem_mins"][elem_type] 58 | settings_list.append(setting_modified) 59 | return settings_list 60 | else: 61 | return [base_settings] 62 | 63 | 64 | def parse_config(json_obj) -> Tuple[List[YOLOSettings], List[YOLOSettings]]: 65 | r""" 66 | Parse a json config to two YOLOSettings objects for train and validation dataset. 
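Illustrative config shape (a sketch; all values below are placeholders, and json_to_config above defines which keys are required; "elem_accepts" and "elem_mins" are optional):

{
    "train_settings": {
        "raw_path": "/path/to/wordscape_yolo_raw",
        "is_validation": false,
        "max_img": 10000,
        "elem_drops": [],
        "elem_mergings": {},
        "scanify": false,
        "quality_threshold": 0.0,
        "language_codes": ["en"],
        "language_code_threshold": 0.0
    },
    "val_settings": { ... }
}

"val_settings" takes the same keys as "train_settings".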
67 | """ 68 | 69 | train_settings = json_to_config(json_obj, "train_settings") 70 | val_settings = json_to_config(json_obj, "val_settings") 71 | 72 | return train_settings, val_settings 73 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/data_prep/yolo_dataset_report.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | from pathlib import Path 4 | import settings 5 | import json 6 | 7 | 8 | def parse_label_folder(folder_path: Path) -> dict: 9 | # track total results 10 | entity_counts = {key: 0 for key in settings.entities.LABEL_NUMS} 11 | # also track empty label examples 12 | empty_labels = 0 13 | 14 | for txt_file in folder_path.glob("*"): 15 | # read the txt file 16 | txt_lines = [] 17 | with open(txt_file, "r") as txt_file_open: 18 | for line in txt_file_open.readlines(): 19 | line_list = line.split() 20 | txt_lines.append(line_list) 21 | 22 | # only count unique bboxes per img 23 | txt_lines = [list(x) for x in set(tuple(x) for x in txt_lines)] 24 | # count entity appearances 25 | for entry in txt_lines: 26 | entity_counts[int(entry[0])] = entity_counts[int(entry[0])] + 1 27 | 28 | if len(txt_lines) == 0: 29 | empty_labels += 1 30 | 31 | entity_counts[-1] = empty_labels 32 | return entity_counts 33 | 34 | 35 | def main(): 36 | arg_parser = argparse.ArgumentParser() 37 | arg_parser.add_argument( 38 | "--dataset_path", 39 | "-dp", 40 | type=str, 41 | default="/mnt/DATA/msc-data/yolo_wordscape_experiments/3headers_balanced_quality", 42 | help="path to dataset to analyze", 43 | ) 44 | args = arg_parser.parse_args() 45 | 46 | # read labels for yolo classes from dataset.yaml 47 | ds_path = Path(args.dataset_path) 48 | with open(ds_path / "dataset.yaml", "r") as stream: 49 | yaml_ds = yaml.safe_load(stream) 50 | labels = yaml_ds["names"] 51 | 52 | # check train and val data 53 | train_counts = parse_label_folder(ds_path / "train" / "labels") 54 | val_counts = parse_label_folder(ds_path / "val" / "labels") 55 | 56 | # apply labels for report 57 | train_counts_formatted = {} 58 | val_counts_formatted = {} 59 | for i in range(len(labels)): 60 | train_counts_formatted[labels[i]] = train_counts[i] 61 | val_counts_formatted[labels[i]] = val_counts[i] 62 | train_counts_formatted["empty_labels"] = train_counts[-1] 63 | val_counts_formatted["empty_labels"] = val_counts[-1] 64 | 65 | report_dict = { 66 | "train_counts": train_counts_formatted, 67 | "val_counts": val_counts_formatted, 68 | } 69 | with open(ds_path / "report.json", "w") as report_w: 70 | json.dump(report_dict, report_w) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/spaceml/move_train_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import shutil 4 | 5 | 6 | def main(): 7 | r""" 8 | Utility script to move some train data into a validation folder. 
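Illustrative invocation (paths are placeholders; both directories are expected to contain 'multimodal' and 'meta' subfolders, as the code below shows):

python move_train_data.py -td /data/ws_yolo/train -vd /data/ws_yolo/val -n 5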
9 | """ 10 | 11 | arg_parser = argparse.ArgumentParser() 12 | arg_parser.add_argument( 13 | "--train_dir", 14 | "-td", 15 | type=str, 16 | default=None, 17 | help="source train dir to move files from", 18 | ) 19 | arg_parser.add_argument( 20 | "--val_dir", 21 | "-vd", 22 | type=str, 23 | default=None, 24 | help="destination val dir to move files to", 25 | ) 26 | arg_parser.add_argument( 27 | "--num", "-n", type=int, default=None, help="number of files to move" 28 | ) 29 | args = arg_parser.parse_args() 30 | 31 | meta_paths = [] 32 | 33 | tar_paths = sorted( 34 | filter(lambda x: x.endswith(".tar"), os.listdir(args.train_dir + "/multimodal")) 35 | )[0 : args.num] 36 | 37 | for tar_name in tar_paths: 38 | tar_path = os.path.join(args.train_dir + "/multimodal", tar_name) 39 | meta_path = ( 40 | "doc_meta_" + tar_name.replace("docs_", "").replace(".tar", "") + ".jsonl" 41 | ) 42 | meta_paths.append(meta_path) 43 | 44 | shutil.move(tar_path, args.val_dir + "/multimodal") 45 | 46 | for meta_name in meta_paths: 47 | meta_path_inner = os.path.join(args.train_dir + "/meta", meta_name) 48 | shutil.move(meta_path_inner, args.val_dir + "/meta") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/spaceml/move_train_data_singlefiles.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from pathlib import Path 4 | import shutil 5 | import random 6 | 7 | def main(): 8 | r""" 9 | Utility script to move some train data (images and labels) into a different folder. 10 | """ 11 | 12 | arg_parser = argparse.ArgumentParser() 13 | arg_parser.add_argument( 14 | "--source_dir", 15 | "-sd", 16 | type=str, 17 | default=None, 18 | help="source dir to move files from", 19 | ) 20 | arg_parser.add_argument( 21 | "--dest_dir", 22 | "-dd", 23 | type=str, 24 | default=None, 25 | help="destination dir to move files to", 26 | ) 27 | arg_parser.add_argument( 28 | "--num", "-n", type=int, default=None, help="number of files to move" 29 | ) 30 | args = arg_parser.parse_args() 31 | 32 | img_paths = sorted( 33 | filter(lambda x: x.endswith(".png") or x.endswith(".jpg"), os.listdir(args.source_dir + "/images")) 34 | )[0 : args.num] 35 | 36 | print(img_paths) 37 | 38 | # randomly sample 39 | img_paths_shuffled = random.sample(img_paths, len(img_paths)) 40 | 41 | label_paths = [] 42 | 43 | for img_name in img_paths_shuffled: 44 | img_path = Path(args.source_dir + "/images") / img_name 45 | dest_path = Path(args.dest_dir + "/images") / img_name 46 | shutil.move(img_path, dest_path) 47 | # print(img_path) 48 | # print(dest_path) 49 | 50 | label_path = img_path.parents[1] / "labels" / img_name.replace('.png', '.txt').replace('.jpg', '.txt') 51 | label_paths.append(label_path) 52 | 53 | for label_path in label_paths: 54 | dest_path = Path(args.dest_dir + "/labels") / label_path.name 55 | shutil.move(label_path, dest_path) 56 | # print(label_path) 57 | # print(dest_path) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/spaceml/ws_yolo_experimentrun.py: -------------------------------------------------------------------------------- 1 | from ultralytics import YOLO 2 | import argparse 3 | import os 4 | 5 | 6 | def main(): 7 | r""" 8 | A script to run a YOLO-Wordscript experiment. 
Arguments are the path to the YOLO dataset yaml, and the GPUs to use. 9 | """ 10 | arg_parser = argparse.ArgumentParser() 11 | arg_parser.add_argument( 12 | "--config_path", 13 | "-cp", 14 | type=str, 15 | default="/home/valde/GitHub/msc-thesis/data/experiments/baseline/dataset.yaml", 16 | help="path to config", 17 | ) 18 | arg_parser.add_argument( 19 | "--gpu_usage", 20 | "-gu", 21 | type=str, 22 | default="0,1,2,3", 23 | help="Comma separated list of CUDA GPU IDs", 24 | ) 25 | arg_parser.add_argument( 26 | "--epochs", "-ep", type=int, default=10, help="number of epochs" 27 | ) 28 | arg_parser.add_argument( 29 | "--gpu_batch", "-gb", type=int, default=24, help="batch size per gpu" 30 | ) 31 | arg_parser.add_argument( 32 | "--resume_path", 33 | "-rp", 34 | type=str, 35 | default=None, 36 | help="Path to weights for resume", 37 | ) 38 | arg_parser.add_argument( 39 | "--use_pretrained", 40 | "-up", 41 | type=bool, 42 | default=False, 43 | help="Flag to use resume_path not to resume from, but as pretrained weights for a new experiment", 44 | ) 45 | arg_parser.add_argument( 46 | "--override_name", 47 | "-on", 48 | type=str, 49 | default=None, 50 | help="Optionally override experiment name", 51 | ) 52 | arg_parser.add_argument( 53 | "--random_weights", 54 | "-rw", 55 | type=bool, 56 | default=False, 57 | help="If set, the model will be initialized with random weights (i.e train fully from scratch)" 58 | ) 59 | arg_parser.add_argument( 60 | "--learning_rate", 61 | "-lr", 62 | type=bool, 63 | default=False, 64 | help="If set, the model will be trained using learning rate decay" 65 | ) 66 | args = arg_parser.parse_args() 67 | 68 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 69 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_usage 70 | 71 | experiment_name = args.config_path.split("/")[-2] 72 | if (args.override_name != None): 73 | experiment_name = args.override_name 74 | 75 | # check wether to resume with these weights, or to use as pretrained 76 | res_decision = False 77 | if (args.resume_path != None) and (args.use_pretrained == False): 78 | res_decision = True 79 | 80 | model = YOLO("yolov5lu.pt") 81 | if args.resume_path != None: 82 | model = YOLO(args.resume_path) 83 | if args.random_weights == True: 84 | # ! 
important: .yaml means this is just a config, not preloaded weights 85 | model = YOLO("yolov5l.yaml") 86 | if args.learning_rate == True: 87 | model.train( 88 | data=args.config_path, 89 | lr0 = 1e-3, 90 | lrf = 1e-4, 91 | epochs=args.epochs, 92 | name=experiment_name, 93 | device=[int(x) for x in args.gpu_usage.split(",")], 94 | batch=len(args.gpu_usage.split(",")) * args.gpu_batch, 95 | resume=res_decision, 96 | ) 97 | else: 98 | model.train( 99 | data=args.config_path, 100 | epochs=args.epochs, 101 | name=experiment_name, 102 | device=[int(x) for x in args.gpu_usage.split(",")], 103 | batch=len(args.gpu_usage.split(",")) * args.gpu_batch, 104 | resume=res_decision, 105 | ) 106 | 107 | 108 | if __name__ == "__main__": 109 | main() 110 | -------------------------------------------------------------------------------- /app/src/extensions/pretrain/layoutlmv3/data_prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/pretrain/layoutlmv3/data_prep/__init__.py -------------------------------------------------------------------------------- /app/src/quality/__init__.py: -------------------------------------------------------------------------------- 1 | from . import text_normalizer 2 | -------------------------------------------------------------------------------- /app/src/quality/perplexity.py: -------------------------------------------------------------------------------- 1 | """ 2 | code based on 3 | https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py 4 | """ 5 | 6 | from pathlib import Path 7 | from sentencepiece import SentencePieceProcessor 8 | import kenlm 9 | 10 | from src.quality import text_normalizer 11 | 12 | 13 | def perplexity(log_score, length): 14 | return 10.0 ** (-log_score / length) 15 | 16 | 17 | class SentencePiece: 18 | 19 | def __init__(self, model: Path, normalize=True): 20 | self._normalize = normalize 21 | 22 | self._sp = SentencePieceProcessor() 23 | self._sp.load(str(model)) 24 | 25 | def tokenize(self, text: str): 26 | if self._normalize: 27 | text = text_normalizer.normalize(text) 28 | 29 | tokenized = self._sp.encode_as_pieces(text) 30 | return " ".join(tokenized) 31 | 32 | 33 | class LanguageModel: 34 | def __init__(self, sp_model: Path, lm_model: Path): 35 | # init models 36 | self._sp = SentencePiece(sp_model, normalize=True) 37 | lm_config = kenlm.Config() 38 | self._lm = kenlm.Model(str(lm_model), lm_config) 39 | 40 | def compute_perplexity(self, content: str) -> float: 41 | # tokenize 42 | content = self._sp.tokenize(content) 43 | 44 | # get lines 45 | lines = content.split("\n") 46 | 47 | doc_log_score, doc_length = 0, 0 48 | 49 | for line in lines: 50 | log_score = self._lm.score(line) 51 | length = len(line.split()) + 1 52 | doc_log_score += log_score 53 | doc_length += length 54 | 55 | return perplexity(doc_log_score, doc_length) 56 | -------------------------------------------------------------------------------- /app/src/quality/text_normalizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | code adapted from 3 | https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py 4 | """ 5 | import re 6 | import unicodedata 7 | 8 | DIGIT_RE = re.compile(r"\d") 9 | UNICODE_PUNCT = { 10 | ",": ",", 11 | "。": ".", 12 | "、": ",", 13 | "„": '"', 14 | "”": '"', 15 | "“": '"', 16 | "«": '"', 17 | "»": '"', 18 | "1": 
'"', 19 | "」": '"', 20 | "「": '"', 21 | "《": '"', 22 | "》": '"', 23 | "´": "'", 24 | "∶": ":", 25 | ":": ":", 26 | "?": "?", 27 | "!": "!", 28 | "(": "(", 29 | ")": ")", 30 | ";": ";", 31 | "–": "-", 32 | "—": " - ", 33 | ".": ". ", 34 | "~": "~", 35 | "’": "'", 36 | "…": "...", 37 | "━": "-", 38 | "〈": "<", 39 | "〉": ">", 40 | "【": "[", 41 | "】": "]", 42 | "%": "%", 43 | "►": "-", 44 | } 45 | UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]") 46 | 47 | NON_PRINTING_CHARS_RE = re.compile( 48 | f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]" 49 | ) 50 | 51 | 52 | def strip_accents(line: str) -> str: 53 | """Strips accents from a piece of text.""" 54 | nfd = unicodedata.normalize("NFD", line) 55 | output = [c for c in nfd if unicodedata.category(c) != "Mn"] 56 | if len(output) == line: 57 | return line 58 | return "".join(output) 59 | 60 | 61 | def replace_unicode_punct(text: str) -> str: 62 | return "".join((UNICODE_PUNCT.get(c, c) for c in text)) 63 | 64 | 65 | def remove_non_printing_char(text: str) -> str: 66 | return NON_PRINTING_CHARS_RE.sub("", text) 67 | 68 | 69 | def normalize(line: str) -> str: 70 | line = line.strip() 71 | 72 | if not line: 73 | return line 74 | 75 | line = line.lower() 76 | line = strip_accents(line) 77 | line = DIGIT_RE.sub("0", line) 78 | line = replace_unicode_punct(line) 79 | line = remove_non_printing_char(line) 80 | 81 | return line 82 | -------------------------------------------------------------------------------- /app/utilities/checksums.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/utilities/checksums.parquet -------------------------------------------------------------------------------- /app/utilities/compute_checksums.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from concurrent.futures import ProcessPoolExecutor, as_completed 3 | import functools 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | import tarfile 7 | from typing import Set 8 | import hashlib 9 | import polars as pl 10 | from typing import Dict, List 11 | from tqdm import tqdm 12 | 13 | # ------ debug 14 | _sources = "/Users/maurice/phd/code/openDoc/WordScape-Data/annotated/cc_main_2022_49/20230601_163415/doc_sources" 15 | _doc_meta = "/Users/maurice/phd/code/openDoc/WordScape-Data/annotated/cc_main_2022_49/20230601_163415/meta_copy/doc.meta.parquet" 16 | # ------ debug 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--sources", type=str, default=_sources) 20 | parser.add_argument("--doc_meta", type=str, default=_doc_meta) 21 | parser.add_argument("--out_dir", type=str, default=".") 22 | args = parser.parse_args() 23 | 24 | 25 | def load_document_ids(meta_fp) -> Set[str]: 26 | return set(( 27 | pl.scan_parquet(meta_fp) 28 | .select(pl.col("url_hash")) 29 | ).collect().to_dict()["url_hash"]) 30 | 31 | 32 | def name_to_id(name: str) -> str: 33 | return name.replace("doc_", "").split(".")[0] 34 | 35 | 36 | def process_single_file(tar_fp: Path, meta_fp: Path) -> Dict[str, List[str]]: 37 | document_ids = load_document_ids(meta_fp) 38 | tar = tarfile.open(tar_fp, 'r:gz') 39 | 40 | data = { 41 | "url_hash": [], "bytehash": [] 42 | } 43 | 44 | for mem in tar.getmembers(): 45 | url_hash = name_to_id(mem.name) 46 | if url_hash in document_ids: 47 | with tar.extractfile(mem) as fobj: 48 | checksum = 
hashlib.sha256(fobj.read()).hexdigest() 49 | data["url_hash"].append(url_hash) 50 | data["bytehash"].append(checksum) 51 | 52 | return data 53 | 54 | 55 | def process_all(): 56 | meta_fp = Path(args.doc_meta) 57 | source_tars = list(Path(args.sources).glob("*.tar.gz")) 58 | 59 | process_fn = functools.partial(process_single_file, meta_fp=meta_fp) 60 | 61 | with ProcessPoolExecutor(max_workers=mp.cpu_count() - 4) as executor: 62 | futures = list( 63 | executor.submit(process_fn, tar_fp) for tar_fp in source_tars 64 | ) 65 | 66 | count = 0 67 | for future in tqdm(as_completed(futures), total=len(futures)): 68 | single_data = future.result() 69 | pl.DataFrame(single_data).write_parquet( 70 | f"{args.out_dir}/checksums-{count}.parquet" 71 | ) 72 | count += 1 73 | futures.remove(future) 74 | 75 | 76 | if __name__ == '__main__': 77 | process_all() 78 | -------------------------------------------------------------------------------- /app/utilities/merge_annotations_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | 4 | import jsonlines 5 | import joblib 6 | import pandas as pd 7 | import pathlib 8 | from typing import Dict 9 | from tqdm import tqdm 10 | 11 | FP_PATTERNS = { 12 | "page": "*page_*.jsonl", 13 | "doc": "*doc_*.jsonl" 14 | } 15 | 16 | FLATTEN_KWS = [ 17 | "annotation_sources", 18 | "builtin_proportion_per_entity" 19 | ] 20 | 21 | MAX_ROWS_IN_MEM = 100_000 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--meta_dir", type=str, default=None) 25 | args = parser.parse_args() 26 | 27 | 28 | def _flatten_obj(obj: Dict[str, int], key: str): 29 | if not isinstance(obj, dict): 30 | raise ValueError 31 | 32 | return {f"{key}_{k}": v for k, v in obj.items()} 33 | 34 | 35 | def _serialize(obj): 36 | if isinstance(obj, list): 37 | return str(obj) 38 | return obj 39 | 40 | 41 | def _to_dataframe(jsonl_fp: pathlib.Path) -> pd.DataFrame: 42 | df = pd.DataFrame() 43 | 44 | data = {} 45 | 46 | try: 47 | with jsonlines.open(jsonl_fp) as reader: 48 | for obj in reader: 49 | obj_procsd = { 50 | k: _serialize(v) 51 | for k, v in obj.items() if k not in FLATTEN_KWS 52 | } 53 | for k in FLATTEN_KWS: 54 | 55 | if k not in obj: 56 | continue 57 | 58 | obj_procsd.update(_flatten_obj(obj[k], k)) 59 | 60 | if len(data) == 0: 61 | data = {k: [v] for k, v in obj_procsd.items()} 62 | continue 63 | 64 | for k in data.keys(): 65 | data[k].append(obj_procsd[k]) 66 | except Exception as e: 67 | print(f"Failed loading {jsonl_fp} with {e.__class__.__name__}:\n{e}") 68 | 69 | return df.from_dict(data) 70 | 71 | 72 | def do_merge(level: str, meta_dir: pathlib.Path): 73 | print(f"start generating {level}-level metadata file") 74 | fp_pattern = FP_PATTERNS[level] 75 | 76 | meta_files = list(meta_dir.glob(fp_pattern)) 77 | 78 | full_df = pd.DataFrame() 79 | 80 | full_df_fp = meta_dir / f"{level}.meta.parquet" 81 | append = False 82 | 83 | print(f"start generating {level}-level metadata file; " 84 | f"saving to {full_df_fp}") 85 | 86 | with concurrent.futures.ProcessPoolExecutor( 87 | max_workers=joblib.cpu_count() - 1 88 | ) as executor: 89 | for part_df in (pbar := tqdm( 90 | executor.map(_to_dataframe, meta_files), 91 | total=len(meta_files) 92 | )): 93 | full_df = pd.concat([full_df, part_df], ignore_index=True) 94 | rows_in_mem = len(full_df) 95 | 96 | if rows_in_mem > MAX_ROWS_IN_MEM: 97 | full_df.to_parquet( 98 | path=full_df_fp, append=append, engine="fastparquet" 99 | ) 100 | pbar.set_postfix_str( 101 |
f"wrote to {full_df_fp} with append={append}" 102 | ) 103 | append = True 104 | full_df = pd.DataFrame(columns=full_df.columns) 105 | 106 | if len(full_df) > 0: 107 | full_df.to_parquet( 108 | path=full_df_fp, append=append, engine="fastparquet" 109 | ) 110 | 111 | del full_df 112 | 113 | 114 | if __name__ == '__main__': 115 | do_merge(level="doc", meta_dir=pathlib.Path(args.meta_dir)) 116 | do_merge(level="page", meta_dir=pathlib.Path(args.meta_dir)) 117 | -------------------------------------------------------------------------------- /app/utilities/merge_sources_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | import joblib 4 | import pandas as pd 5 | import pathlib 6 | from tqdm import tqdm 7 | 8 | MAX_ROWS_IN_MEM = 100_000 9 | 10 | 11 | def get_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--meta_dir", type=str, default=None) 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def _load_parquet(meta_fp: pathlib.Path): 19 | return pd.read_parquet(meta_fp) 20 | 21 | 22 | def main(): 23 | args = get_args() 24 | data_dir = pathlib.Path(args.meta_dir) 25 | dump_id = data_dir.name 26 | 27 | meta_files = list(data_dir.glob("*.parquet")) 28 | 29 | print("Found", len(meta_files), "source meta files") 30 | 31 | full_df = pd.DataFrame() 32 | full_df_fp = data_dir.parent / f"sources_{dump_id}.meta.parquet" 33 | 34 | append = False 35 | 36 | with concurrent.futures.ProcessPoolExecutor( 37 | max_workers=joblib.cpu_count() - 2 38 | ) as executor: 39 | for part_df in (pbar := tqdm( 40 | executor.map(_load_parquet, meta_files), 41 | total=len(meta_files) 42 | )): 43 | full_df = pd.concat([full_df, part_df], ignore_index=True) 44 | rows_in_mem = len(full_df) 45 | 46 | if rows_in_mem > MAX_ROWS_IN_MEM: 47 | full_df.to_parquet( 48 | path=full_df_fp, append=append, engine="fastparquet" 49 | ) 50 | pbar.set_postfix_str( 51 | f"wrote to {full_df_fp} with append={append}" 52 | ) 53 | append = True 54 | full_df = pd.DataFrame(columns=full_df.columns) 55 | 56 | if len(full_df) > 0: 57 | full_df.to_parquet( 58 | path=full_df_fp, append=append, engine="fastparquet" 59 | ) 60 | 61 | del full_df 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /app/utilities/run_filter_tars.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import tarfile 4 | import multiprocessing as mp 5 | import json 6 | from typing import List, Tuple, Union 7 | from pathlib import Path 8 | 9 | import joblib 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--data_root", type=str, default=None) 13 | args = parser.parse_args() 14 | 15 | 16 | def get_page_id(fn: str) -> str: 17 | return fn[fn.find("doc_"):].replace("doc_", "") 18 | 19 | 20 | def filter_tar_file( 21 | inputs: Tuple[pathlib.Path, List[str]] 22 | ) -> Union[int, None]: 23 | src_tar_fp, whitelist_pages = inputs 24 | 25 | whitelist_pages = set(get_page_id(p) for p in whitelist_pages) 26 | 27 | filtered_tar_fn = src_tar_fp.name.replace(".tar.gz", ".filtered.tar.gz") 28 | filtered_tar_fp = src_tar_fp.parent / filtered_tar_fn 29 | 30 | src_tar = tarfile.open(src_tar_fp, 'r:gz') 31 | tgt_tar = tarfile.open(filtered_tar_fp, 'w:gz') 32 | 33 | try: 34 | all_jpg_members = set( 35 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 36 | if 
mem.name.endswith(".jpg") 37 | ) 38 | all_txt_members = set( 39 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 40 | if mem.name.startswith("text_doc_") 41 | ) 42 | all_ent_members = set( 43 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 44 | if mem.name.startswith("entities_doc_") 45 | ) 46 | all_wrd_members = set( 47 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 48 | if mem.name.startswith("words_doc_") 49 | ) 50 | 51 | all_page_ids = all_jpg_members & all_txt_members \ 52 | & all_ent_members & all_wrd_members 53 | 54 | filtered_pages = all_page_ids & whitelist_pages 55 | 56 | # write all matching members to target tar 57 | num_files = 0 58 | for mem in src_tar.getmembers(): 59 | num_files += 1 60 | page_id = get_page_id(Path(mem.name).stem) 61 | if page_id not in filtered_pages: 62 | continue 63 | 64 | fobj = src_tar.extractfile(mem) 65 | fobj.seek(0) 66 | 67 | # write to target tar 68 | tgt_tar.addfile(mem, fobj) 69 | 70 | except Exception as e: 71 | print("Error processing: ", src_tar_fp) 72 | tgt_tar.close() 73 | src_tar.close() 74 | filtered_tar_fp.unlink(missing_ok=True) 75 | print(e) 76 | return 0 77 | 78 | num_filtered_files = len(filtered_pages) * 4 79 | print("Processed: ", src_tar_fp) 80 | print(f"Total files: {num_files}, Filtered files: {num_filtered_files}") 81 | 82 | tgt_tar.close() 83 | src_tar.close() 84 | 85 | return 1 86 | 87 | 88 | def main(): 89 | data_root = pathlib.Path(args.data_root) 90 | annotations_dir = data_root / "multimodal" 91 | paths = list(annotations_dir.glob("*.tar.gz")) 92 | total_paths = len(paths) 93 | 94 | if total_paths == 0: 95 | print("No files found in: ", args.data_root) 96 | return 97 | 98 | # load whitelisted urls 99 | with open(data_root / "whitelist_pages.json", 'r') as f: 100 | whitelist_pages = json.load(f) 101 | 102 | # construct inputs 103 | inputs = list() 104 | for path in paths: 105 | shard_id = path.name.replace("docs_", "").replace(".tar.gz", "") 106 | try: 107 | inputs.append((path, whitelist_pages[shard_id])) 108 | except KeyError: 109 | print("No whitelist for: ", shard_id) 110 | continue 111 | 112 | with mp.Pool(processes=joblib.cpu_count() - 1) as pool: 113 | res_codes = pool.map(filter_tar_file, inputs) 114 | 115 | print("Total files: ", total_paths) 116 | print("Total filtered files: ", sum(res_codes)) 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/data/.gitkeep -------------------------------------------------------------------------------- /docs/wordscape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/docs/wordscape.png --------------------------------------------------------------------------------