├── .gitignore ├── LICENSE ├── README.md ├── README_ANNOTATION.md ├── README_CC_URL_PARSE.md ├── README_DOWNLOAD.md ├── README_EXTENSIONS.md ├── app ├── Dockerfile ├── alembic.ini ├── alembic │ ├── README │ ├── env.py │ ├── script.py.mako │ └── versions │ │ ├── 17cdff8379cb_initial_sources_record.py │ │ ├── 2de829bd1ca3_remove_sensitive_info_add_bytehash.py │ │ ├── d1fbae82c5fa_initial_metadata_model.py │ │ └── eb22a058c4c9_less_mandatory_fields_for_doc_sources.py ├── annotate_run.py ├── cc_parse_merge_and_recover_urls.py ├── cc_parse_partition_listings.py ├── cc_parse_snapshot.py ├── configs │ ├── default_config.yaml │ └── extensions │ │ ├── obj_detection │ │ └── ws_yolo │ │ │ ├── 1header.json │ │ │ ├── 3headers.json │ │ │ ├── baseline.json │ │ │ ├── baseline_quality.json │ │ │ ├── local │ │ │ ├── 1header_balanced_quality.json │ │ │ ├── 1header_balanced_quality_multilang.json │ │ │ ├── 1header_balanced_quality_report.json │ │ │ ├── 3headers_balanced_quality.json │ │ │ ├── 3headers_balanced_quality_report.json │ │ │ ├── tableonly_balanced.json │ │ │ └── tableonly_balanced_cut.json │ │ │ ├── spaceml │ │ │ ├── 1header.json │ │ │ ├── 1header_quality.json │ │ │ ├── 3headers.json │ │ │ ├── 3headers_quality.json │ │ │ ├── baseline.json │ │ │ ├── baseline_quality.json │ │ │ ├── tableonly.json │ │ │ └── tableonly_quality.json │ │ │ ├── tableonly.json │ │ │ └── tableonly_quality.json │ │ └── pretrain │ │ └── layoutlm │ │ ├── 1header_balanced_quality.json │ │ └── 1header_balanced_quality_test.json ├── download_dump_data.py ├── download_prepare_urls.py ├── download_run.py ├── orm │ ├── __init__.py │ ├── dbutils │ │ └── db_connection.py │ └── models.py ├── pp_compute_perplexity.py ├── requirements.txt ├── resources │ ├── fasttext-models │ │ └── .gitkeep │ └── wikipedia-models │ │ └── .gitkeep ├── scripts │ ├── annotation-kickoff.sh │ ├── annotation-launch.sbatch │ ├── cc-parse-launch.sbatch │ ├── download-launch.sbatch │ ├── install_libreoffice_centos.sh │ ├── pp-compute-perplexity.sbatch │ ├── run-filter-tars.sbatch │ └── run_single_node.sh ├── settings │ ├── __init__.py │ ├── annotation.py │ ├── bbox.py │ ├── colors.py │ ├── content_awareness.py │ ├── download.py │ ├── entities.py │ ├── entity_names.json │ └── filesystem.py ├── src │ ├── __init__.py │ ├── annotation │ │ ├── __init__.py │ │ ├── annotation_objects.py │ │ ├── annotation_quality.py │ │ ├── annotator_process.py │ │ ├── builtin_styles.py │ │ ├── colorization │ │ │ ├── __init__.py │ │ │ ├── colorization_handler.py │ │ │ ├── colorize_doc.py │ │ │ ├── entities │ │ │ │ ├── __init__.py │ │ │ │ ├── figure.py │ │ │ │ ├── form.py │ │ │ │ ├── header_footer.py │ │ │ │ ├── paragraph.py │ │ │ │ ├── tables │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── colorize_table.py │ │ │ │ │ ├── element_parsers.py │ │ │ │ │ ├── styles.py │ │ │ │ │ ├── table_colorization_handler.py │ │ │ │ │ └── utils.py │ │ │ │ ├── text_box.py │ │ │ │ └── toc.py │ │ │ ├── heuristics │ │ │ │ ├── __init__.py │ │ │ │ ├── build_heuristics.py │ │ │ │ ├── content_awareness.py │ │ │ │ └── utils.py │ │ │ └── mappings.py │ │ ├── config.py │ │ ├── entity_detection │ │ │ ├── __init__.py │ │ │ ├── detection.py │ │ │ ├── entity_detector.py │ │ │ └── utils.py │ │ ├── language_detection │ │ │ ├── __init__.py │ │ │ ├── inference.py │ │ │ └── utils.py │ │ ├── oxml_metadata.py │ │ ├── postprocessing │ │ │ ├── __init__.py │ │ │ ├── filters.py │ │ │ ├── postprocess.py │ │ │ └── table.py │ │ ├── preprocessing │ │ │ ├── __init__.py │ │ │ └── highlighting.py │ │ ├── sanity_checks.py │ │ ├── soffice │ │ │ ├── 
__init__.py │ │ │ ├── conversion_manager.py │ │ │ └── utils.py │ │ ├── text │ │ │ ├── __init__.py │ │ │ ├── text_entity_matching.py │ │ │ └── text_extraction.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── bbox_utils.py │ │ │ ├── color_utils.py │ │ │ ├── docx_utils.py │ │ │ ├── identifiers.py │ │ │ ├── pdf_utils.py │ │ │ ├── updateable_zipfile.py │ │ │ └── zip_bomb.py │ ├── cc_processing │ │ ├── __init__.py │ │ ├── cc_url_process.py │ │ ├── deduplicate.py │ │ └── preprocess_cc_urls.py │ ├── data_sources │ │ ├── __init__.py │ │ ├── download_exceptions.py │ │ ├── download_process.py │ │ ├── http_handlers.py │ │ └── maldoc_check.py │ ├── exceptions.py │ ├── extensions │ │ ├── __init__.py │ │ ├── obj_detection │ │ │ ├── __init__.py │ │ │ ├── data_prep │ │ │ │ ├── __init__.py │ │ │ │ ├── doclaynet_yolo_formatter.py │ │ │ │ ├── publaynet_yolo_formatter.py │ │ │ │ ├── pubtabnet_yolo_formatter.py │ │ │ │ ├── wordscape_yolo_config_handler.py │ │ │ │ ├── wordscape_yolo_formatter.py │ │ │ │ ├── wordscape_yolo_formatter_unzipped.py │ │ │ │ └── yolo_dataset_report.py │ │ │ └── spaceml │ │ │ │ ├── move_train_data.py │ │ │ │ ├── move_train_data_singlefiles.py │ │ │ │ ├── ws_yolo_dataprep.py │ │ │ │ └── ws_yolo_experimentrun.py │ │ └── pretrain │ │ │ └── layoutlmv3 │ │ │ └── data_prep │ │ │ ├── __init__.py │ │ │ ├── wordscape_layoutlmv3_config_handler.py │ │ │ ├── wordscape_layoutlmv3_dataprep.py │ │ │ ├── wordscape_layoutlmv3_datasetbuilder.py │ │ │ └── wordscape_layoutlmv3_formatter.py │ └── quality │ │ ├── __init__.py │ │ ├── perplexity.py │ │ └── text_normalizer.py ├── utilities │ ├── checksums.parquet │ ├── compute_checksums.py │ ├── merge_annotations_metadata.py │ ├── merge_sources_metadata.py │ ├── run_filter_tars.py │ └── run_whitelist_pages.py └── visualize_annotations.py ├── data └── .gitkeep └── docs └── wordscape.png /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | .vscode/* 3 | *.pyc 4 | *.DS_Store 5 | Pipfile 6 | Pipfile.lock 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | 11 | # data folders 12 | data/* 13 | !data/.gitkeep 14 | !data/tmp/.gitkeep 15 | annotated/* 16 | resources/*/*.bin 17 | resources/*/*.model 18 | resources/*/*.ftz 19 | 20 | # venv 21 | .venv -------------------------------------------------------------------------------- /README_ANNOTATION.md: -------------------------------------------------------------------------------- 1 | # Annotation of Word files 2 | 3 | All instructions are assumed to be executed from the root directory of the project, with the python virtual environment 4 | activated and all necessary dependencies installed. In this example, we will be processing the word files downloaded in 5 | the previous step of the WordScape pipeline. 6 | 7 | This part of WordScape assumes that you have input files (i.e. Word .doc or .docx files) stored in 8 | gzip compressed tar archives: 9 | 10 | ``` 11 | DATA_DIR 12 | ├── archive_1.tar.gz 13 | ├── file_1.doc 14 | ├── file_2.docx 15 | ... 16 | ├── file_k.doc 17 | ├── archive_2.tar.gz 18 | ├── file_1.doc 19 | ... 20 | ├── archive_n.tar.gz 21 | ``` 22 | 23 | The Word Scape pipeline will process each archive in parallel. The output will have the following components: 24 | 25 | - A `failed` directory with jsonl files that contain the filenames of Word files that failed to process, including the 26 | reason for 27 | failure. 28 | - A `logs` directory with log files from each worker. 
29 | - A `meta` directory with jsonl files that contain metadata on the document level and on the page level. 30 | - A `multimodal` directory with tar.gz files that contain multimodal data for each document. The multimodal data 31 | includes images of each page, and json files that contain OCR text, word bounding boxes, and entity bounding boxes. 32 | - A `text` directory with jsonl files that contain OCR text for each document and each page. 33 | - A `version_info.txt` file that contains the timestamp, git branch, and commit hash of the code used to process the 34 | data. 35 | - An `args.json` file that contains the arguments used to run the pipeline. 36 | - A `config.yaml` file that contains the configuration used to run the pipeline. 37 | 38 | The output directory structure will look like this: 39 | 40 | ``` 41 | ./data/annotated/// 42 | ├── failed 43 | ├── failed_.jsonl 44 | ... 45 | ├── logs 46 | ├── .log 47 | ... 48 | ├── meta 49 | ├── doc_meta_.jsonl 50 | ├── page_meta_.jsonl 51 | ... 52 | ├── multimodal 53 | ├── docs_.tar.gz 54 | ├── doc__p.jpg 55 | ├── entities__p.json 56 | ├── text__p.json 57 | ├── words__p.json 58 | ... 59 | ... 60 | ├── text 61 | ├── doc_text_.jsonl 62 | ├── page_text_.jsonl 63 | ... 64 | ├── version_info.txt 65 | ├── args.json 66 | ├── config.yaml 67 | ``` 68 | 69 | ## Running annotation scripts 70 | 71 | Here we describe how to run the annotation scripts. The scripts are designed to be run on a Slurm cluster, but can also 72 | be run locally. 73 | 74 | ### Running on a Slurm cluster 75 | 76 | To run WordScape on a Slurm cluster, you can use the `annotation-kickoff.sh` script from the `scripts` directory. 77 | This script will divide all files ending in `.tar.gz` into partitions. Each partition will be processed by a separate 78 | Slurm job. To run using Slurm, use 79 | 80 | ```bash 81 | bash scripts/annotation-kickoff.sh $CRAWL_ID $DATA_DIR 82 | ``` 83 | 84 | where the environment variable `$CRAWL_ID` corresponds to the ID of the crawl (e.g., "CC-MAIN-2022-49") and `$DATA_DIR` 85 | is the directory of the Word source files. After creating the partitions, the script submits the jobs to 86 | the Slurm cluster by calling the script `scripts/annotation-launch.sbatch`. 87 | 88 | ### Running locally 89 | 90 | Alternatively, you can also run the annotation script locally. To do so, you can directly call the `annotate_run.py` 91 | script: 92 | 93 | ```bash 94 | python annotate_run.py \ 95 | --data_dir $DATA_DIR \ 96 | --crawl_id $CRAWL_ID \ 97 | --max_docs -1 \ 98 | --output_dir $OUTPUT_DIR 99 | ``` 100 | 101 | ## Computing perplexity scores 102 | 103 | Perplexity scores can be computed using the `pp_compute_perplexity.py` script. This script will download the 5-gram 104 | Kneser-Ney models and SentencePiece tokenizers used in the [CCNet pipeline](https://github.com/facebookresearch/cc_net). 105 | You can run the script with the following command: 106 | 107 | ```bash 108 | python pp_compute_perplexity.py \ 109 | --lang $LANG \ 110 | --data $ANNOTATIONS_ROOT 111 | ``` 112 | 113 | After downloading the language model for the specified language, the script will compute the perplexity scores for each 114 | document in the annotations directory, and write the results to the `meta_ppl` directory, which contains the same data 115 | as the `meta` directory, but with the perplexity scores added to the document-level metadata. 
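As a quick sanity check after this step, you can scan the document-level metadata for low-perplexity documents. The sketch below is a minimal example that assumes the `doc_meta_*.jsonl` naming shown above and newline-delimited JSON records; the `doc_id` and `perplexity` field names are illustrative assumptions, so check them against the actual keys in your output.

```python
import json
import sys
from pathlib import Path

# Usage: python inspect_ppl.py <path to annotation output directory>
# NOTE: the "doc_id" and "perplexity" keys are assumptions for illustration;
# check the actual field names in your doc_meta_*.jsonl files.
meta_ppl_dir = Path(sys.argv[1]) / "meta_ppl"

for meta_file in sorted(meta_ppl_dir.glob("doc_meta_*.jsonl")):
    with meta_file.open(encoding="utf-8") as fh:
        for line in fh:
            record = json.loads(line)
            ppl = record.get("perplexity")
            if ppl is not None and ppl < 500:
                # keep only documents below an (arbitrary) perplexity threshold
                print(record.get("doc_id"), ppl)
```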
116 | -------------------------------------------------------------------------------- /README_CC_URL_PARSE.md: -------------------------------------------------------------------------------- 1 | # Getting DOC / DOCX URLs from CommonCrawl 2 | 3 | All instructions are assumed to be executed from the root directory of the project, with the python virtual environment 4 | activated. In this example, we will be processing the commoncrawl dump `CC-MAIN-2023-06`. It is recommended to use 5 | absolute paths for arguments wherever possible, as outlined in the example. 6 | 7 | ## Preparing WAT segment partitions for input to nodes 8 | 9 | Extraction of relevant URLs is based on metadata supplied by the WAT files of a given commoncrawl dump. 10 | In order to prepare for downloading URLs, we must therefore first split the responsibility of 11 | downloading dumps across slurm nodes. Each slurm node will in turn assign individual 12 | WAT files to be downloaded and processed by workers. 13 | 14 | In order to prepare these files, run 15 | 16 | ```shell 17 | python cc_parse_partition_listings.py --crawl CC-MAIN-2023-06 --partition-size 13 --num_nodes 180 18 | ``` 19 | 20 | The `crawl` argument specifies which cc dump to process. 21 | 22 | The `partition_size` argument influences the internal task distribution of one slurm node; ideally, it should be set to 23 | the number of cores on each node minus 3; if running locally, it should be set to the number of cores your CPU has. 24 | 25 | The `num_nodes` argument must be the same as the number of slurm nodes you intend to run the download job on. If running 26 | via the `cc-parse-launch.sbatch` 27 | script, it should be the same as the sbatch `array` pragma: e.g. with `array=1-180`, this argument should be set to `180`. 28 | If running locally, this argument should be set to `1`. 29 | 30 | Running this script will output the listings directory to which the results have been written; take note of this, as it 31 | will be needed in the next step: 32 | 33 | ```shell 34 | [2023-05-25 20:04:38] Downloading Common Crawl paths listings 35 | * crawl: CC-MAIN-2023-06 36 | * data-type: wat 37 | * partition-size: 13 38 | * listings dir: ./data/crawl-data/CC-MAIN-2023-06/listings 39 | ``` 40 | 41 | ## Running URL download process 42 | 43 | Now that the distribution of WAT files to slurm nodes and their respective worker processes has been set, we can run 44 | the download process. 45 | To run using sbatch, you can use the included script: 46 | 47 | ```shell 48 | sbatch ./scripts/cc-parse-launch.sbatch "./data/crawl-data/CC-MAIN-2023-06/listings" "CC-MAIN-2023-06" 49 | ``` 50 | 51 | The first argument must be the listings directory from the output in the last step, and the second the name of the cc 52 | dump (same as above). 53 | 54 | In order to run locally, you can use: 55 | 56 | ```shell 57 | python cc_parse_snapshot.py \ 58 | --input "./data/crawl-data/CC-MAIN-2023-06/listings/1" \ 59 | --cc_dump "CC-MAIN-2023-06" 60 | ``` 61 | 62 | Note that a `/1` must be added to the end of the listings directory in the local case, as your local machine will be 63 | operating analogously to a single slurm node. 64 | 65 | These processes will then begin outputting raw URL data to the `cc_urls` data folder. 66 | 67 | ## Cleanup, merge and recovery 68 | 69 | After the raw URL download job completes, the produced URLs must be cleaned and merged into a single parquet file in 70 | the `clean_urls` folder for the next steps of the pipeline. 
71 | 72 | To do this, you can run: 73 | 74 | ```shell 75 | python cc_parse_merge_and_recover_urls.py \ 76 | --input ./data/cc_urls/CC-MAIN-2023-06 \ 77 | --listings_dir ./data/crawl-data/CC-MAIN-2023-06/listings \ 78 | --cc_dump CC-MAIN-2023-06 \ 79 | --dedupe 1 80 | ``` 81 | 82 | The `input` argument must be the `cc_urls` directory being cleaned. 83 | 84 | The `listings_dir` and `cc_dump` arguments are the same as above. 85 | 86 | If the `dedupe` flag is set, the resulting parquet file will be globally deduplicated against all already processed 87 | dumps inside the `clean_urls` folder; it is recommended to set this flag if you intend to process multiple dumps. 88 | 89 | After completing these steps, you should have one parquet file with a list of cleaned 90 | URLS: `./data/clean_urls/CC-MAIN-2023-06.parquet`. 91 | 92 | Note that, due to contention on commoncrawl resources, it is possible that some WATs were not able to be processed. 93 | These will be written to `./data/clean_urls/CC-MAIN-2023-06_recovery_segments.txt`, and a report will be output by the 94 | script on how many (if any) segments were missed. Optionally, you may 95 | re-run the download job at a later time, using only these segments as input. 96 | 97 | After completing the above steps, you should be ready to move on to the download phase of the pipeline. 98 | -------------------------------------------------------------------------------- /app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM --platform=amd64 ubuntu:22.04 2 | 3 | WORKDIR /usr/app 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | python3.11 \ 7 | python3-pip \ 8 | wget \ 9 | curl \ 10 | default-jre \ 11 | libcairo2-dev 12 | 13 | # copy requirements.txt to the working directory 14 | COPY requirements.txt requirements.txt 15 | 16 | # install python dependencies 17 | RUN pip3 install --no-cache-dir --upgrade pip 18 | RUN pip3 install --no-cache-dir -r requirements.txt 19 | RUN pip3 install --no-cache-dir gdown 20 | 21 | # install libreoffice 22 | RUN wget https://downloadarchive.documentfoundation.org/libreoffice/old/7.4.6.2/deb/x86_64/LibreOffice_7.4.6.2_Linux_x86-64_deb.tar.gz 23 | RUN tar -xzvf LibreOffice_7.4.6.2_Linux_x86-64_deb.tar.gz && cd LibreOffice_7.4.6.2_Linux_x86-64_deb/DEBS && dpkg -i *.deb 24 | RUN rm -rf LibreOffice_7.4.6.2_Linux_x86-64_deb.tar.gz LibreOffice_7.4.6.2_Linux_x86-64_deb 25 | 26 | # add to path 27 | ENV PATH="${PATH}:/opt/libreoffice7.4/program/" 28 | 29 | RUN wget https://bootstrap.pypa.io/get-pip.py 30 | RUN /opt/libreoffice7.4/program/python get-pip.py 31 | RUN /opt/libreoffice7.4/program/python -m pip install --no-cache-dir unoserver==1.6 32 | 33 | # fix shebangs 34 | RUN sed -i '1s/python\.bin/python/' "/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoserver" 35 | RUN sed -i '1s/python\.bin/python/' "/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoconvert" 36 | ENV PATH="/opt/libreoffice7.4/program/python-core-3.8.16/bin:${PATH}" 37 | 38 | COPY resources resources 39 | RUN curl https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -o resources/fasttext-models/lid.176.ftz 40 | 41 | RUN apt-get install -y poppler-utils 42 | 43 | COPY . . 44 | 45 | ENTRYPOINT ["/bin/bash", "/usr/app/scripts/run_single_node.sh"] -------------------------------------------------------------------------------- /app/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 
2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # set to 'true' to search source files recursively 55 | # in each "version_locations" directory 56 | # new in Alembic version 1.10 57 | # recursive_version_locations = false 58 | 59 | # the output encoding used when revision files 60 | # are written from script.py.mako 61 | # output_encoding = utf-8 62 | 63 | # TODO: DO NOT DO THIS IN PRODUCTION; NEED A SECRET MANAGER! 64 | # this simply serves as an example for how you might set up your own test environment. 65 | sqlalchemy.url = postgresql://localtest:localtestpass@localhost:5432/docparser-dataset-testing 66 | 67 | 68 | [post_write_hooks] 69 | # post_write_hooks defines scripts or Python functions that are run 70 | # on newly generated revision scripts. 
See the documentation for further 71 | # detail and examples 72 | 73 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 74 | # hooks = black 75 | # black.type = console_scripts 76 | # black.entrypoint = black 77 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 78 | 79 | # Logging configuration 80 | [loggers] 81 | keys = root,sqlalchemy,alembic 82 | 83 | [handlers] 84 | keys = console 85 | 86 | [formatters] 87 | keys = generic 88 | 89 | [logger_root] 90 | level = WARN 91 | handlers = console 92 | qualname = 93 | 94 | [logger_sqlalchemy] 95 | level = WARN 96 | handlers = 97 | qualname = sqlalchemy.engine 98 | 99 | [logger_alembic] 100 | level = INFO 101 | handlers = 102 | qualname = alembic 103 | 104 | [handler_console] 105 | class = StreamHandler 106 | args = (sys.stderr,) 107 | level = NOTSET 108 | formatter = generic 109 | 110 | [formatter_generic] 111 | format = %(levelname)-5.5s [%(name)s] %(message)s 112 | datefmt = %H:%M:%S 113 | -------------------------------------------------------------------------------- /app/alembic/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /app/alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from sqlalchemy import engine_from_config 4 | from sqlalchemy import pool 5 | 6 | from alembic import context 7 | 8 | from orm import models 9 | 10 | # this is the Alembic Config object, which provides 11 | # access to the values within the .ini file in use. 12 | config = context.config 13 | 14 | # Interpret the config file for Python logging. 15 | # This line sets up loggers basically. 16 | if config.config_file_name is not None: 17 | fileConfig(config.config_file_name) 18 | 19 | # add your model's MetaData object here 20 | # for 'autogenerate' support 21 | target_metadata = models.Base.metadata 22 | 23 | # other values from the config, defined by the needs of env.py, 24 | # can be acquired: 25 | # my_important_option = config.get_main_option("my_important_option") 26 | # ... etc. 27 | 28 | 29 | def run_migrations_offline() -> None: 30 | """Run migrations in 'offline' mode. 31 | 32 | This configures the context with just a URL 33 | and not an Engine, though an Engine is acceptable 34 | here as well. By skipping the Engine creation 35 | we don't even need a DBAPI to be available. 36 | 37 | Calls to context.execute() here emit the given string to the 38 | script output. 39 | 40 | """ 41 | url = config.get_main_option("sqlalchemy.url") 42 | context.configure( 43 | url=url, 44 | target_metadata=target_metadata, 45 | literal_binds=True, 46 | dialect_opts={"paramstyle": "named"}, 47 | ) 48 | 49 | with context.begin_transaction(): 50 | context.run_migrations() 51 | 52 | 53 | def run_migrations_online() -> None: 54 | """Run migrations in 'online' mode. 55 | 56 | In this scenario we need to create an Engine 57 | and associate a connection with the context. 
58 | 59 | """ 60 | connectable = engine_from_config( 61 | config.get_section(config.config_ini_section, {}), 62 | prefix="sqlalchemy.", 63 | poolclass=pool.NullPool, 64 | ) 65 | 66 | with connectable.connect() as connection: 67 | context.configure( 68 | connection=connection, target_metadata=target_metadata 69 | ) 70 | 71 | with context.begin_transaction(): 72 | context.run_migrations() 73 | 74 | 75 | if context.is_offline_mode(): 76 | run_migrations_offline() 77 | else: 78 | run_migrations_online() 79 | -------------------------------------------------------------------------------- /app/alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /app/alembic/versions/17cdff8379cb_initial_sources_record.py: -------------------------------------------------------------------------------- 1 | """Initial sources_record 2 | 3 | Revision ID: 17cdff8379cb 4 | Revises: 5 | Create Date: 2023-03-15 01:52:21.177793 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '17cdff8379cb' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('sources_record', 22 | sa.Column('url', sa.String(length=10000), nullable=False), 23 | sa.Column('url_hash', sa.String(length=1000), nullable=False), 24 | sa.Column('crawl_id', sa.String(length=1000), nullable=False), 25 | sa.Column('shard_id', sa.String(length=1000), nullable=False), 26 | sa.Column('filename', sa.String(length=10000), nullable=False), 27 | sa.Column('status_code', sa.String(length=200), nullable=False), 28 | sa.Column('content_type', sa.String(length=1000), nullable=True), 29 | sa.Column('content_length', sa.String(length=1000), nullable=True), 30 | sa.Column('content_encoding', sa.String(length=1000), nullable=False), 31 | sa.Column('content_language', sa.ARRAY(sa.String()), nullable=True), 32 | sa.Column('last_modified', sa.DateTime(), nullable=True), 33 | sa.Column('source_filename', sa.String(length=10000), nullable=True), 34 | sa.Column('ip_address', sa.String(length=15), nullable=True), 35 | sa.Column('olet_ftype', sa.String(length=200), nullable=True), 36 | sa.Column('olet_container', sa.String(length=200), nullable=True), 37 | sa.Column('olet_appname', sa.String(length=200), nullable=True), 38 | sa.Column('olet_codepage', sa.String(length=200), nullable=True), 39 | sa.Column('olet_author', sa.String(length=400), nullable=True), 40 | sa.Column('olet_encrypted', sa.String(length=200), nullable=True), 41 | sa.Column('olet_vba', sa.String(length=400), nullable=True), 42 | sa.Column('olet_xlm', sa.String(length=400), nullable=True), 43 | sa.Column('olet_ext_rels', sa.String(length=200), nullable=True), 44 | sa.Column('olet_ObjectPool', sa.String(length=200), nullable=True), 45 | sa.Column('olet_flash', sa.String(length=200), nullable=True), 46 | sa.Column('olet_python_codec', sa.String(length=200), nullable=True), 47 | sa.Column('olet_pass', sa.String(length=200), nullable=True), 48 | sa.Column('timestamp', sa.DateTime(), nullable=False), 49 | sa.Column('exception', sa.String(length=1000), nullable=True), 50 | sa.PrimaryKeyConstraint('url_hash') 51 | ) 52 | # ### end Alembic commands ### 53 | 54 | 55 | def downgrade() -> None: 56 | # ### commands auto generated by Alembic - please adjust! ### 57 | op.drop_table('sources_record') 58 | # ### end Alembic commands ### 59 | -------------------------------------------------------------------------------- /app/alembic/versions/d1fbae82c5fa_initial_metadata_model.py: -------------------------------------------------------------------------------- 1 | """Initial metadata model 2 | 3 | Revision ID: d1fbae82c5fa 4 | Revises: 17cdff8379cb 5 | Create Date: 2023-04-11 06:11:03.232383 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'd1fbae82c5fa' 14 | down_revision = '17cdff8379cb' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('metadata_record', 22 | sa.Column('doc_id', sa.String(length=10000), nullable=False), 23 | sa.Column('url', sa.String(length=10000), nullable=False), 24 | sa.Column('url_hash', sa.String(length=1000), nullable=False), 25 | sa.Column('crawl_id', sa.String(length=1000), nullable=False), 26 | sa.Column('shard_id', sa.String(length=1000), nullable=False), 27 | sa.Column('filename', sa.String(length=10000), nullable=False), 28 | sa.Column('geo_location', sa.ARRAY(sa.String()), nullable=True), 29 | sa.Column('languages_fasttest', sa.ARRAY(sa.String()), nullable=True), 30 | sa.Column('languages_autocorrect', sa.ARRAY(sa.String()), nullable=True), 31 | sa.Column('doc_type', sa.String(length=1000), nullable=True), 32 | sa.Column('industry', sa.String(length=1000), nullable=True), 33 | sa.Column('word_count', sa.Integer(), nullable=False), 34 | sa.Column('num_figures', sa.Integer(), nullable=False), 35 | sa.Column('num_tables', sa.Integer(), nullable=False), 36 | sa.Column('num_table_cells', sa.Integer(), nullable=False), 37 | sa.Column('num_quotes', sa.Integer(), nullable=False), 38 | sa.Column('num_equations', sa.Integer(), nullable=False), 39 | sa.Column('num_sections_1', sa.Integer(), nullable=False), 40 | sa.Column('num_sections_2', sa.Integer(), nullable=False), 41 | sa.Column('num_sections_3', sa.Integer(), nullable=False), 42 | sa.Column('num_sections_4', sa.Integer(), nullable=False), 43 | sa.Column('num_sections_5', sa.Integer(), nullable=False), 44 | sa.Column('num_sections_6', sa.Integer(), nullable=False), 45 | sa.Column('num_sections_7', sa.Integer(), nullable=False), 46 | sa.Column('num_sections_8', sa.Integer(), nullable=False), 47 | sa.Column('num_sections_9', sa.Integer(), nullable=False), 48 | sa.Column('annotation_sources', sa.JSON(), nullable=False), 49 | sa.Column('form_hover_tags', sa.JSON(), nullable=True), 50 | sa.Column('template_name', sa.String(length=1000), nullable=True), 51 | sa.Column('creator_username', sa.String(length=1000), nullable=True), 52 | sa.PrimaryKeyConstraint('doc_id') 53 | ) 54 | # ### end Alembic commands ### 55 | 56 | 57 | def downgrade() -> None: 58 | # ### commands auto generated by Alembic - please adjust! ### 59 | op.drop_table('metadata_record') 60 | # ### end Alembic commands ### 61 | -------------------------------------------------------------------------------- /app/alembic/versions/eb22a058c4c9_less_mandatory_fields_for_doc_sources.py: -------------------------------------------------------------------------------- 1 | """Less mandatory fields for doc_sources 2 | 3 | Revision ID: eb22a058c4c9 4 | Revises: 2de829bd1ca3 5 | Create Date: 2023-05-29 21:25:05.888669 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'eb22a058c4c9' 14 | down_revision = '2de829bd1ca3' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.alter_column('sources_record', 'filename', 22 | existing_type=sa.VARCHAR(length=10000), 23 | nullable=True) 24 | op.alter_column('sources_record', 'bytehash', 25 | existing_type=sa.VARCHAR(length=10000), 26 | nullable=True) 27 | op.alter_column('sources_record', 'status_code', 28 | existing_type=sa.VARCHAR(length=200), 29 | nullable=True) 30 | op.alter_column('sources_record', 'content_encoding', 31 | existing_type=sa.VARCHAR(length=1000), 32 | nullable=True) 33 | # ### end Alembic commands ### 34 | 35 | 36 | def downgrade() -> None: 37 | # ### commands auto generated by Alembic - please adjust! ### 38 | op.alter_column('sources_record', 'content_encoding', 39 | existing_type=sa.VARCHAR(length=1000), 40 | nullable=False) 41 | op.alter_column('sources_record', 'status_code', 42 | existing_type=sa.VARCHAR(length=200), 43 | nullable=False) 44 | op.alter_column('sources_record', 'bytehash', 45 | existing_type=sa.VARCHAR(length=10000), 46 | nullable=False) 47 | op.alter_column('sources_record', 'filename', 48 | existing_type=sa.VARCHAR(length=10000), 49 | nullable=False) 50 | # ### end Alembic commands ### 51 | -------------------------------------------------------------------------------- /app/cc_parse_merge_and_recover_urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | Util used after downloading one URL-Batch from CC 3 | - Merges parquet files into one input file for DL script 4 | - Recovers any missed WAT segments into one TXT file 5 | - Deduplicates URLs within one CC Dump 6 | """ 7 | 8 | import argparse 9 | import pandas as pd 10 | from pathlib import Path 11 | import pyarrow as pa 12 | import pyarrow.parquet as pq 13 | import time 14 | 15 | import settings 16 | from src.cc_processing.preprocess_cc_urls import process_urls 17 | 18 | BASE_URL = "https://data.commoncrawl.org/" 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | "--input", "-i", default=None, type=str, 25 | help="directory containing URL parquets to merge and deduplicate" 26 | ) 27 | parser.add_argument( 28 | "--cc_dump", "-cc", default=None, type=str, 29 | help="cc dump being processed" 30 | ) 31 | parser.add_argument( 32 | "--listings_dir", "-ld", type=str, default=None, 33 | help="listings dir to compare against", 34 | ) 35 | parser.add_argument( 36 | "--dedupe", "-dd", type=bool, default=False, 37 | help="set to true in order to deduplicate current dump compared to " 38 | "already processed dumps" 39 | ) 40 | args = parser.parse_args() 41 | 42 | # end-results will be written to here 43 | write_dir = settings.filesystem.CLEAN_URLS_DIR 44 | 45 | if not Path(write_dir).exists(): 46 | Path(write_dir).mkdir(parents=True) 47 | 48 | pdir = Path(args.input) 49 | pqfiles = [i for i in pdir.glob('*.parquet')] 50 | with pq.ParquetWriter(str(pdir / (args.cc_dump + "_merged_raw.parquet")), 51 | schema=pa.schema([('url', pa.string())])) as writer: 52 | for item in pqfiles: 53 | pqtab = pq.read_table(item) 54 | # some parquets may be empty (no docx urls in segment) 55 | if pqtab.schema.equals(writer.schema): 56 | writer.write_table(pq.read_table(item)) 57 | 58 | time.sleep(5) 59 | 60 | # deduplicate parquet 61 | df = pd.read_parquet(str(pdir / (args.cc_dump + "_merged_raw.parquet"))) 62 | num_undupe_rows = len(df) 63 | df = df.drop_duplicates() 64 | num_rows = len(df) 65 | df.to_parquet(str(pdir / (args.cc_dump + "_merged.parquet"))) 66 | 67 | print("total unique URLs: " + str(num_rows) + " removed " + str( 68 | num_undupe_rows 
- num_rows) + " duplicates") 69 | 70 | # check if any segments need to be recovered 71 | lstdir = Path(args.listings_dir) 72 | lstfiles = [i for i in lstdir.glob('**/*.txt')] 73 | needed_segments = [] 74 | for item in lstfiles: 75 | with open(item) as file: 76 | for line in file: 77 | needed_segments.append(line.strip()) 78 | 79 | logfiles = [i for i in pdir.glob('worker_log_*')] 80 | gotten_segments = [] 81 | for item in logfiles: 82 | with open(item) as file: 83 | last_seen_seg = '' 84 | for line in file: 85 | if 'Fetching ' in line: 86 | last_seen_seg = line.split('Fetching ')[-1].strip() 87 | if 'Success! got URL list' in line: 88 | gotten_segments.append(last_seen_seg) 89 | 90 | missed_segments = [x for x in needed_segments if 91 | ((BASE_URL + x) not in gotten_segments)] 92 | 93 | # write the segments to recover to a txt file 94 | with open(str(write_dir / (args.cc_dump + "_recovery_segments.txt")), 95 | 'w') as file: 96 | for item in missed_segments: 97 | file.write(item + '\n') 98 | 99 | print("sucessfully parsed " + str( 100 | len(gotten_segments)) + " segments, missed " + str( 101 | len(missed_segments))) 102 | 103 | # do remaining processing and cleaning of urls 104 | process_urls(input=str(pdir / (args.cc_dump + "_merged.parquet")), 105 | output=str(write_dir / (args.cc_dump + ".parquet")), 106 | dedupe=args.dedupe) 107 | 108 | 109 | if __name__ == '__main__': 110 | main() 111 | -------------------------------------------------------------------------------- /app/cc_parse_partition_listings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import urllib.request 4 | import io 5 | import gzip 6 | from datetime import datetime 7 | from pathlib import Path 8 | import settings 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--crawl", default=None, type=str, 12 | help="Common Crawl crawl") 13 | parser.add_argument("--partition-size", default=13, type=int, 14 | help="Partition size") 15 | parser.add_argument("--num_nodes", default=180, type=int, 16 | help="number of nodes") 17 | args = parser.parse_args() 18 | 19 | DATA_TYPE = "wat" 20 | BASE_URL = "https://data.commoncrawl.org" 21 | 22 | LISTINGS_DIR = settings.filesystem.CC_SEGMENT_DIR / (args.crawl + "/listings") 23 | 24 | 25 | def get_timestamp(): 26 | return datetime.now().strftime("[%Y-%m-%d %H:%M:%S]") 27 | 28 | 29 | def get_idx(idx: int, n_digits: int = 9): 30 | return "0" * (n_digits - len(str(idx))) + str(idx) 31 | 32 | 33 | def main(): 34 | # commoncrawl params 35 | crawl = args.crawl 36 | partition_size = args.partition_size 37 | 38 | # directory structure 39 | listings_dir = LISTINGS_DIR 40 | 41 | print( 42 | "{} Downloading Common Crawl paths listings" 43 | "\n\t* crawl: {}" 44 | "\n\t* data-type: {}" 45 | "\n\t* partition-size: {}" 46 | "\n\t* listings dir: {}".format( 47 | get_timestamp(), crawl, DATA_TYPE, partition_size, listings_dir 48 | ) 49 | ) 50 | 51 | listings_url = os.path.join(BASE_URL, 52 | f"crawl-data/{crawl}/{DATA_TYPE}.paths.gz") 53 | 54 | # create dir to save partitioned listings 55 | if not os.path.exists(listings_dir): 56 | os.makedirs(listings_dir) 57 | 58 | # download listings 59 | response = urllib.request.urlopen(listings_url) 60 | compressed_file = io.BytesIO(response.read()) 61 | decompressed_file = gzip.GzipFile(fileobj=compressed_file) 62 | listings = decompressed_file.read().decode("utf-8").splitlines() 63 | 64 | # partition listings and save as txt files 65 | idx = 0 66 | for i in range(0, 
len(listings), int(partition_size)): 67 | save_as = os.path.join( 68 | listings_dir, f"wat.paths.part_{get_idx(idx, n_digits=4)}.txt" 69 | ) 70 | 71 | with open(save_as, "w") as f: 72 | f.write("\n".join(listings[i: i + int(partition_size)])) 73 | 74 | idx += 1 75 | 76 | # distribute accross node folders 77 | for i in range(1, args.num_nodes + 1): 78 | subdir = os.path.join(listings_dir, str(i)) 79 | os.mkdir(subdir) 80 | 81 | files = list(Path(listings_dir).glob('*.txt')) 82 | 83 | curr_subdir = 1 84 | for f in files: 85 | f.rename(Path(listings_dir) / str(curr_subdir) / f.name) 86 | curr_subdir = (curr_subdir % args.num_nodes) + 1 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /app/cc_parse_snapshot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import joblib 3 | import multiprocessing as mp 4 | import os 5 | from pathlib import Path 6 | 7 | from src.cc_processing.cc_url_process import CCURLProcess 8 | import settings 9 | 10 | BASE_URL = "https://data.commoncrawl.org/" 11 | 12 | 13 | class URIBatchProvider(mp.Process): 14 | def __init__(self, inputs_queue: mp.Queue, listings_dir: str, parts: int, 15 | num_workers: int): 16 | """ 17 | Provides warc URIs to cc_url processes input queue. 18 | 19 | @param inputs_queue: Queue to write uris to 20 | @param listings_dir: Directory containing wat file part listings to be 21 | distributed to worker processes 22 | @param parts: Number of parts to proccess 23 | @param num_workers: Number of workers to supply with uris 24 | """ 25 | 26 | super(URIBatchProvider, self).__init__() 27 | self.inputs_queue = inputs_queue 28 | self.listings_dir = listings_dir 29 | self.parts = parts 30 | self.num_workers = num_workers 31 | 32 | def run(self): 33 | """ 34 | Core process loop required by python multiprocessing 35 | """ 36 | 37 | parts_dir = Path(self.listings_dir) 38 | 39 | for parts_file in parts_dir.glob('*.txt'): 40 | with parts_file.open() as file: 41 | contents = file.read() 42 | processed_contents = list( 43 | map(lambda x: BASE_URL + x, contents.split('\n'))) 44 | # split individually over all cores --> better DL parallelism 45 | for s in processed_contents: 46 | self.inputs_queue.put([s]) 47 | 48 | for _ in range(self.num_workers): 49 | self.inputs_queue.put(None) 50 | 51 | 52 | def get_args() -> argparse.Namespace: 53 | arg_parser = argparse.ArgumentParser() 54 | arg_parser.add_argument("--input", "-i", 55 | help="path to folder containing URL listing parts", 56 | type=str, default=None) 57 | arg_parser.add_argument("--parts", "-p", 58 | help="number of parts files to process, -1 for all", 59 | type=str, default=-1) 60 | arg_parser.add_argument("--cc_dump", "-cc", help="cc dump being processed", 61 | type=str, default="CC-MAIN-2023-06") 62 | args = arg_parser.parse_args() 63 | return args 64 | 65 | 66 | def main(): 67 | args = get_args() 68 | 69 | num_cpus = int(os.environ.get("SLURM_CPUS_PER_TASK", joblib.cpu_count())) 70 | 71 | # make cc dir if not yet existant 72 | if not Path.exists(settings.filesystem.CC_DIR): 73 | Path.mkdir(settings.filesystem.CC_DIR) 74 | if not Path.exists(settings.filesystem.CC_DIR / args.cc_dump): 75 | Path.mkdir(settings.filesystem.CC_DIR / args.cc_dump) 76 | 77 | num_worker_processes = num_cpus - 2 78 | 79 | queue_buffer_size = 4 * num_worker_processes 80 | inputs_queue = mp.Queue(maxsize=queue_buffer_size) 81 | 82 | cc_processes = [] 83 | for i in 
range(num_worker_processes): 84 | cc_process = CCURLProcess(inputs_queue, BASE_URL, args.cc_dump) 85 | cc_process.start() 86 | print("started cc_url parser") 87 | cc_processes.append(cc_process) 88 | 89 | # provide URI batches 90 | provider_process = URIBatchProvider(inputs_queue, args.input, 10, 1) 91 | provider_process.start() 92 | provider_process.join() 93 | 94 | # wait for workers to finish 95 | for cc in cc_processes: 96 | cc.join() 97 | 98 | 99 | if __name__ == '__main__': 100 | main() 101 | -------------------------------------------------------------------------------- /app/configs/default_config.yaml: -------------------------------------------------------------------------------- 1 | IMAGES: 2 | IMAGE_FORMAT: jpg 3 | IMAGE_HEIGHT: null 4 | IMAGE_WIDTH: null 5 | IMAGE_DPI: 90 6 | 7 | DECOMPRESS_BOMB_CHECKS: 8 | MAX_DECOMPRESS_RATIO: 20 9 | MAX_IMAGE_PIXELS: 22369621 10 | 11 | DOCUMENTS: 12 | MAX_DOC_BYTES: 10485760 13 | MAX_DOC_PAGES: 150 14 | 15 | TIME_LIMITS: 16 | ANNOTATION_TIMEOUT_SECS: 180 17 | ANNOTATION_CLEANUP_SECS: 180 18 | 19 | DATA_ORG: 20 | MAX_BYTES_IN_SHARD: 5368709120 21 | 22 | LANGUAGES: 23 | TOP_K_LANGUAGES: 5 24 | 25 | LIBREOFFICE: 26 | UNOSERVER_START_TIMEOUT: 60 27 | UNOCONVERT_TIMEOUT: 60 28 | SOFFICE_LAUNCH_TIMEOUT: 120 29 | SOFFICE_LAUNCH_PING_INTERVAL: 0.1 30 | 31 | ENTITY_DEFINITION: 32 | MAX_HEADING_LEN: 150 33 | FORM_FIELD_MIN_LENGTH: 4 34 | 35 | ENTITY_RELATIONS: 36 | BBOX_RELATION_OVERLAP_THRESHOLD: 0.45 37 | BBOX_RELATION_SCALE_THRESHOLD: 1.2 38 | BBOX_RELATION_CLOSENESS_THRESHOLD: 10 39 | WORD_2_ENTITY_OVERLAP_THRESHOLD: 0.8 40 | 41 | ANNOTATION_FILTER: 42 | MIN_TEXT_CHARS: 200 43 | 44 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/1header.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global" 11 | }, 12 | "mapping": { 13 | "2": "1", 14 | "3": "1", 15 | "4": "1", 16 | "5": "1", 17 | "6": "1", 18 | "7": "1", 19 | "8": "1", 20 | "9": "1" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_global" 36 | }, 37 | "mapping": { 38 | "2": "1", 39 | "3": "1", 40 | "4": "1", 41 | "5": "1", 42 | "6": "1", 43 | "7": "1", 44 | "8": "1", 45 | "9": "1" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/3headers.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "5": "heading_789" 13 | }, 14 | "mapping": 
{ 15 | "2": "1", 16 | "3": "1", 17 | "5": "4", 18 | "6": "4", 19 | "8": "7", 20 | "9": "7" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_123", 36 | "4": "heading_456", 37 | "5": "heading_789" 38 | }, 39 | "mapping": { 40 | "2": "1", 41 | "3": "1", 42 | "5": "4", 43 | "6": "4", 44 | "8": "7", 45 | "9": "7" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": -1, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": -1, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/baseline_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": 0.75, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": 0.75, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/1header_balanced_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global", 11 | "10": "text_merged", 12 | "11": "list_merged" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "4": "1", 18 | "5": "1", 19 | "6": "1", 20 | "7": "1", 21 | "8": "1", 22 | "9": "1", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2000, 30 | "1": 5000, 31 | "10": 5000, 32 | "11": 5000, 33 | "12": 5000, 34 | "13": 5000, 35 | "14": 2000, 36 | "15": 2000, 37 | "16": 15000, 
38 | "17": 20000, 39 | "21": 2000, 40 | "22": 5000, 41 | "23": 5000, 42 | "26": 5000, 43 | "28": 10000, 44 | "29": 10000 45 | }, 46 | "scanify": false, 47 | "quality_threshold": 0.7, 48 | "language_codes": ["en"], 49 | "language_code_threshold": 0.75 50 | }, 51 | "val_settings": { 52 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 53 | "is_validation": true, 54 | "max_img": 40000, 55 | "elem_drops": [24, 25, 27, 30], 56 | "elem_mergings": { 57 | "masters": { 58 | "1": "heading_global", 59 | "10": "text_merged", 60 | "11": "list_merged" 61 | }, 62 | "mapping": { 63 | "2": "1", 64 | "3": "1", 65 | "4": "1", 66 | "5": "1", 67 | "6": "1", 68 | "7": "1", 69 | "8": "1", 70 | "9": "1", 71 | "18": "11", 72 | "19": "11", 73 | "20": "10" 74 | } 75 | }, 76 | "elem_mins": { 77 | "0": 300, 78 | "1": 1000, 79 | "10": 1000, 80 | "11": 1000, 81 | "12": 1000, 82 | "13": 1000, 83 | "14": 300, 84 | "15": 300, 85 | "16": 3000, 86 | "17": 4000, 87 | "21": 300, 88 | "22": 1000, 89 | "23": 1000, 90 | "26": 1000, 91 | "28": 2000, 92 | "29": 2000 93 | }, 94 | "scanify": false, 95 | "quality_threshold": 0.7, 96 | "language_codes": ["en"], 97 | "language_code_threshold": 0.75 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/1header_balanced_quality_multilang.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality_multilang", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [14, 28, 29, 24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_title_global", 11 | "10": "text_merged", 12 | "11": "list_merged", 13 | "17": "table_cell_merged" 14 | }, 15 | "mapping": { 16 | "0": "1", 17 | "2": "1", 18 | "3": "1", 19 | "4": "1", 20 | "5": "1", 21 | "6": "1", 22 | "7": "1", 23 | "8": "1", 24 | "9": "1", 25 | "15": "17", 26 | "18": "11", 27 | "19": "11", 28 | "20": "10", 29 | "23": "10" 30 | } 31 | }, 32 | "elem_mins": { 33 | "1": 20000, 34 | "10": 20000, 35 | "11": 20000, 36 | "12": 20000, 37 | "13": 20000, 38 | "16": 20000, 39 | "17": 20000, 40 | "21": 20000, 41 | "22": 20000, 42 | "26": 20000 43 | }, 44 | "scanify": false, 45 | "quality_threshold": 0.7, 46 | "language_codes": ["es", "fr", "it", "de", "pt", "en"], 47 | "language_code_threshold": 0.75 48 | }, 49 | "val_settings": { 50 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 51 | "is_validation": true, 52 | "max_img": 40000, 53 | "elem_drops": [14, 28, 29, 24, 25, 27, 30], 54 | "elem_mergings": { 55 | "masters": { 56 | "1": "heading_title_global", 57 | "10": "text_merged", 58 | "11": "list_merged", 59 | "17": "table_cell_merged" 60 | }, 61 | "mapping": { 62 | "0": "1", 63 | "2": "1", 64 | "3": "1", 65 | "4": "1", 66 | "5": "1", 67 | "6": "1", 68 | "7": "1", 69 | "8": "1", 70 | "9": "1", 71 | "15": "17", 72 | "18": "11", 73 | "19": "11", 74 | "20": "10", 75 | "23": "10" 76 | } 77 | }, 78 | "elem_mins": { 79 | "1": 4000, 80 | "10": 4000, 81 | "11": 4000, 82 | "12": 4000, 83 | "13": 4000, 84 | "16": 4000, 85 | "17": 4000, 86 | "21": 4000, 87 | "22": 4000, 88 | "26": 4000 89 | }, 90 | "scanify": false, 91 | "quality_threshold": 0.7, 92 | "language_codes": ["es", "fr", "it", "de", "pt", "en"], 93 | "language_code_threshold": 0.75 94 | } 95 | } 96 | -------------------------------------------------------------------------------- 
/app/configs/extensions/obj_detection/ws_yolo/local/1header_balanced_quality_report.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_counts": { 3 | "title": 14372, 4 | "heading_global": 275224, 5 | "text_merged": 698583, 6 | "list_merged": 907053, 7 | "header": 59964, 8 | "footer": 66648, 9 | "table_header": 363, 10 | "table_header_cell": 1529, 11 | "table": 7415, 12 | "table_cell": 36691, 13 | "equation": 1884, 14 | "figure": 83010, 15 | "table_caption": 1170, 16 | "form_field": 55934, 17 | "table_row": 16061, 18 | "table_column": 12296, 19 | "empty_labels": 7 20 | }, 21 | "val_counts": { 22 | "title": 2929, 23 | "heading_global": 55435, 24 | "text_merged": 145300, 25 | "list_merged": 174718, 26 | "header": 13161, 27 | "footer": 13625, 28 | "table_header": 139, 29 | "table_header_cell": 600, 30 | "table": 2554, 31 | "table_cell": 13031, 32 | "equation": 655, 33 | "figure": 15416, 34 | "table_caption": 338, 35 | "form_field": 10153, 36 | "table_row": 5521, 37 | "table_column": 4433, 38 | "empty_labels": 2 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/3headers_balanced_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers_balanced_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [24, 25, 27], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "7": "heading_789", 13 | "10": "text_merged", 14 | "11": "list_merged" 15 | }, 16 | "mapping": { 17 | "2": "1", 18 | "3": "1", 19 | "5": "4", 20 | "6": "4", 21 | "8": "7", 22 | "9": "7", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2000, 30 | "1": 5000, 31 | "4": 5000, 32 | "7": 5000, 33 | "10": 5000, 34 | "11": 5000, 35 | "12": 5000, 36 | "13": 5000, 37 | "14": 2000, 38 | "15": 2000, 39 | "16": 5000, 40 | "17": 5000, 41 | "21": 2000, 42 | "22": 5000, 43 | "23": 5000, 44 | "26": 5000, 45 | "28": 5000, 46 | "29": 5000, 47 | "30": 5000 48 | }, 49 | "scanify": false, 50 | "quality_threshold": 0.7, 51 | "language_codes": ["en"], 52 | "language_code_threshold": 0.75 53 | }, 54 | "val_settings": { 55 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 56 | "is_validation": true, 57 | "max_img": 40000, 58 | "elem_drops": [24, 25, 27], 59 | "elem_mergings": { 60 | "masters": { 61 | "1": "heading_123", 62 | "4": "heading_456", 63 | "7": "heading_789", 64 | "10": "text_merged", 65 | "11": "list_merged" 66 | }, 67 | "mapping": { 68 | "2": "1", 69 | "3": "1", 70 | "5": "4", 71 | "6": "4", 72 | "8": "7", 73 | "9": "7", 74 | "18": "11", 75 | "19": "11", 76 | "20": "10" 77 | } 78 | }, 79 | "elem_mins": { 80 | "0": 300, 81 | "1": 1000, 82 | "4": 1000, 83 | "7": 1000, 84 | "10": 1000, 85 | "11": 1000, 86 | "12": 1000, 87 | "13": 1000, 88 | "14": 300, 89 | "15": 300, 90 | "16": 1000, 91 | "17": 1000, 92 | "21": 300, 93 | "22": 1000, 94 | "23": 1000, 95 | "26": 1000, 96 | "28": 1000, 97 | "29": 1000, 98 | "30": 1000 99 | }, 100 | "scanify": false, 101 | "quality_threshold": 0.7, 102 | "language_codes": ["en"], 103 | "language_code_threshold": 0.75 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/3headers_balanced_quality_report.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "train_counts": { 3 | "title": 14158, 4 | "heading_123": 222622, 5 | "heading_456": 48883, 6 | "heading_789": 18720, 7 | "text_merged": 711231, 8 | "list_merged": 894044, 9 | "header": 59981, 10 | "footer": 68337, 11 | "table_header": 363, 12 | "table_header_cell": 1529, 13 | "table": 7154, 14 | "table_cell": 36691, 15 | "equation": 1884, 16 | "figure": 82880, 17 | "table_caption": 1170, 18 | "form_field": 55734, 19 | "table_row": 16061, 20 | "table_column": 12296, 21 | "table_header_row": 0 22 | }, 23 | "val_counts": { 24 | "title": 2918, 25 | "heading_123": 45048, 26 | "heading_456": 9700, 27 | "heading_789": 3808, 28 | "text_merged": 148337, 29 | "list_merged": 178763, 30 | "header": 12695, 31 | "footer": 13428, 32 | "table_header": 139, 33 | "table_header_cell": 600, 34 | "table": 1948, 35 | "table_cell": 10588, 36 | "equation": 655, 37 | "figure": 16139, 38 | "table_caption": 338, 39 | "form_field": 10131, 40 | "table_row": 4470, 41 | "table_column": 3558, 42 | "table_header_row": 0 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/tableonly_balanced.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_balanced", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 24, 25, 26, 27, 30 10 | ], 11 | "elem_mins": { 12 | "14": 1000, 13 | "15": 3000, 14 | "16": 20000, 15 | "17": 150000, 16 | "23": 2000, 17 | "28": 30000, 18 | "29": 30000 19 | }, 20 | "elem_mergings": {}, 21 | "scanify": false, 22 | "quality_threshold": -1, 23 | "language_codes": ["en", "de"], 24 | "language_code_threshold": 0.75 25 | }, 26 | "val_settings": { 27 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 28 | "is_validation": true, 29 | "max_img": 40000, 30 | "elem_drops": [ 31 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 32 | 24, 25, 26, 27, 30 33 | ], 34 | "elem_mins": { 35 | "14": 200, 36 | "15": 600, 37 | "16": 4000, 38 | "17": 30000, 39 | "23": 400, 40 | "28": 6000, 41 | "29": 6000 42 | }, 43 | "elem_mergings": {}, 44 | "scanify": false, 45 | "quality_threshold": -1, 46 | "language_codes": ["en", "de"], 47 | "language_code_threshold": 0.75 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/local/tableonly_balanced_cut.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_balanced_cut", 3 | "train_settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 9 | 22, 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mins": { 12 | "16": 190000 13 | }, 14 | "elem_mergings": { 15 | "masters": { 16 | "17": "table_cell_merged" 17 | }, 18 | "mapping": { 19 | "15": "17" 20 | } 21 | }, 22 | "scanify": false, 23 | "quality_threshold": -1, 24 | "language_codes": ["en", "de"], 25 | "language_code_threshold": 0.75 26 | }, 27 | "val_settings": { 28 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/val", 29 | "is_validation": true, 30 | "max_img": 40000, 31 | "elem_drops": 
[ 32 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 18, 19, 20, 21, 33 | 22, 23, 24, 25, 26, 27, 28, 29, 30 34 | ], 35 | "elem_mins": { 36 | "16": 38000 37 | }, 38 | "elem_mergings": { 39 | "masters": { 40 | "17": "table_cell_merged" 41 | }, 42 | "mapping": { 43 | "15": "17" 44 | } 45 | }, 46 | "scanify": false, 47 | "quality_threshold": -1, 48 | "language_codes": ["en", "de"], 49 | "language_code_threshold": 0.75 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/1header.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global" 11 | }, 12 | "mapping": { 13 | "2": "1", 14 | "3": "1", 15 | "4": "1", 16 | "5": "1", 17 | "6": "1", 18 | "7": "1", 19 | "8": "1", 20 | "9": "1" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_global" 36 | }, 37 | "mapping": { 38 | "2": "1", 39 | "3": "1", 40 | "4": "1", 41 | "5": "1", 42 | "6": "1", 43 | "7": "1", 44 | "8": "1", 45 | "9": "1" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/1header_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global" 11 | }, 12 | "mapping": { 13 | "2": "1", 14 | "3": "1", 15 | "4": "1", 16 | "5": "1", 17 | "6": "1", 18 | "7": "1", 19 | "8": "1", 20 | "9": "1" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": 0.75, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_global" 36 | }, 37 | "mapping": { 38 | "2": "1", 39 | "3": "1", 40 | "4": "1", 41 | "5": "1", 42 | "6": "1", 43 | "7": "1", 44 | "8": "1", 45 | "9": "1" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": 0.75, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/3headers.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | 
"elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "7": "heading_789" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "5": "4", 18 | "6": "4", 19 | "8": "7", 20 | "9": "7" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": -1, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_123", 36 | "4": "heading_456", 37 | "7": "heading_789" 38 | }, 39 | "mapping": { 40 | "2": "1", 41 | "3": "1", 42 | "5": "4", 43 | "6": "4", 44 | "8": "7", 45 | "9": "7" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": -1, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/3headers_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "3headers_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_123", 11 | "4": "heading_456", 12 | "7": "heading_789" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "5": "4", 18 | "6": "4", 19 | "8": "7", 20 | "9": "7" 21 | } 22 | }, 23 | "scanify": false, 24 | "quality_threshold": 0.75, 25 | "language_codes": ["en"], 26 | "language_code_threshold": 0.75 27 | }, 28 | "val_settings": { 29 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 30 | "is_validation": true, 31 | "max_img": 40000, 32 | "elem_drops": [], 33 | "elem_mergings": { 34 | "masters": { 35 | "1": "heading_123", 36 | "4": "heading_456", 37 | "7": "heading_789" 38 | }, 39 | "mapping": { 40 | "2": "1", 41 | "3": "1", 42 | "5": "4", 43 | "6": "4", 44 | "8": "7", 45 | "9": "7" 46 | } 47 | }, 48 | "scanify": false, 49 | "quality_threshold": 0.75, 50 | "language_codes": ["en"], 51 | "language_code_threshold": 0.75 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": -1, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": -1, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/baseline_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "baseline_quality", 3 | "train_settings": { 4 | "raw_path": 
"/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [], 8 | "elem_mergings": {}, 9 | "scanify": false, 10 | "quality_threshold": 0.75, 11 | "language_codes": ["en"], 12 | "language_code_threshold": 0.75 13 | }, 14 | "val_settings": { 15 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 16 | "is_validation": true, 17 | "max_img": 40000, 18 | "elem_drops": [], 19 | "elem_mergings": {}, 20 | "scanify": false, 21 | "quality_threshold": 0.75, 22 | "language_codes": ["en"], 23 | "language_code_threshold": 0.75 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/tableonly.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | "quality_threshold": -1, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": -1, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/spaceml/tableonly_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_quality", 3 | "train_settings": { 4 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | "quality_threshold": 0.75, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/mnt/scratch/thannerv/msc-data/wordscape-2023-14/val", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": 0.75, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/tableonly.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | 
"quality_threshold": -1, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": -1, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/obj_detection/ws_yolo/tableonly_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tableonly_quality", 3 | "train_settings": { 4 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [ 8 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 9 | 23, 24, 25, 26, 27, 28, 29, 30 10 | ], 11 | "elem_mergings": {}, 12 | "scanify": false, 13 | "quality_threshold": 0.75, 14 | "language_codes": ["en"], 15 | "language_code_threshold": 0.75 16 | }, 17 | "val_settings": { 18 | "raw_path": "/home/valde/GitHub/msc-thesis/data/raw/format_test_new_train", 19 | "is_validation": true, 20 | "max_img": 40000, 21 | "elem_drops": [ 22 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23 | 23, 24, 25, 26, 27, 28, 29, 30 24 | ], 25 | "elem_mergings": {}, 26 | "scanify": false, 27 | "quality_threshold": 0.75, 28 | "language_codes": ["en"], 29 | "language_code_threshold": 0.75 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /app/configs/extensions/pretrain/layoutlm/1header_balanced_quality.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality_layoutlm", 3 | "settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 200000, 7 | "elem_drops": [24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global", 11 | "10": "text_merged", 12 | "11": "list_merged" 13 | }, 14 | "mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "4": "1", 18 | "5": "1", 19 | "6": "1", 20 | "7": "1", 21 | "8": "1", 22 | "9": "1", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2300, 30 | "1": 6000, 31 | "10": 6000, 32 | "11": 6000, 33 | "12": 6000, 34 | "13": 6000, 35 | "14": 2300, 36 | "15": 2300, 37 | "16": 15000, 38 | "17": 20000, 39 | "21": 2300, 40 | "22": 6000, 41 | "23": 6000, 42 | "26": 6000, 43 | "28": 10000, 44 | "29": 10000 45 | }, 46 | "scanify": false, 47 | "quality_threshold": 0.7, 48 | "language_codes": ["en"], 49 | "language_code_threshold": 0.75 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /app/configs/extensions/pretrain/layoutlm/1header_balanced_quality_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "1header_balanced_quality_layoutlm", 3 | "settings": { 4 | "raw_path": "/mnt/DATA/msc-data/cc_main_2023_14/train", 5 | "is_validation": false, 6 | "max_img": 50, 7 | "elem_drops": [24, 25, 27, 30], 8 | "elem_mergings": { 9 | "masters": { 10 | "1": "heading_global", 11 | "10": "text_merged", 12 | "11": "list_merged" 13 | }, 14 | 
"mapping": { 15 | "2": "1", 16 | "3": "1", 17 | "4": "1", 18 | "5": "1", 19 | "6": "1", 20 | "7": "1", 21 | "8": "1", 22 | "9": "1", 23 | "18": "11", 24 | "19": "11", 25 | "20": "10" 26 | } 27 | }, 28 | "elem_mins": { 29 | "0": 2, 30 | "1": 2, 31 | "10": 2, 32 | "11": 2, 33 | "12": 2, 34 | "13": 2, 35 | "14": 2, 36 | "15": 2, 37 | "16": 2, 38 | "17": 2, 39 | "21": 2, 40 | "22": 2, 41 | "23": 2, 42 | "26": 2, 43 | "28": 2, 44 | "29": 2 45 | }, 46 | "scanify": false, 47 | "quality_threshold": 0.7, 48 | "language_codes": ["en"], 49 | "language_code_threshold": 0.75 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /app/download_prepare_urls.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import pathlib 5 | import settings 6 | import argparse 7 | 8 | arg_parser = argparse.ArgumentParser() 9 | arg_parser.add_argument("--cc_dump", "-cc", type=str, default=None, 10 | help="cc dump being processed") 11 | arg_parser.add_argument("--clean_urls_dir", type=str, default=None) 12 | arg_parser.add_argument("--num_nodes", type=int, default=25, 13 | help="number of nodes") 14 | args = arg_parser.parse_args() 15 | 16 | 17 | def main(): 18 | if args.clean_urls_dir is None: 19 | clean_urls_dir = settings.filesystem.CLEAN_URLS_DIR 20 | else: 21 | clean_urls_dir = pathlib.Path(args.clean_urls_dir) 22 | 23 | # make folder 24 | write_folder = clean_urls_dir / args.cc_dump 25 | if not (os.path.exists(write_folder)): 26 | os.mkdir(write_folder) 27 | 28 | # read parquet file 29 | clean_list = pd.read_parquet( 30 | clean_urls_dir / (args.cc_dump + ".parquet") 31 | ) 32 | 33 | # split accross num_nodes 34 | df_split = np.array_split(clean_list, args.num_nodes) 35 | for i in range(1, args.num_nodes + 1): 36 | df_split[i - 1].to_parquet(str(write_folder / (str(i) + ".parquet"))) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /app/orm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/orm/__init__.py -------------------------------------------------------------------------------- /app/orm/dbutils/db_connection.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from sqlalchemy import create_engine, Engine 4 | from sqlalchemy import pool 5 | import settings 6 | import configparser 7 | 8 | 9 | def connect_to_db() -> Engine: 10 | config = configparser.ConfigParser() 11 | config.read(settings.filesystem.ALEMBIC_INI_LOC) 12 | key = config.get('alembic', 'sqlalchemy.url') 13 | engine = create_engine(key) 14 | return engine 15 | 16 | -------------------------------------------------------------------------------- /app/pp_compute_perplexity.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import joblib 4 | import jsonlines 5 | import multiprocessing as mp 6 | import os 7 | from pathlib import Path 8 | from typing import Dict, Union 9 | import warnings 10 | import subprocess 11 | 12 | from src.quality.perplexity import LanguageModel 13 | 14 | WIKI_LM_URL = "http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin" 15 | WIKI_SP_URL = "http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model" 16 
| 17 | 18 | def parse_args() -> argparse.Namespace: 19 | args = argparse.ArgumentParser() 20 | args.add_argument("--lang", "-l", type=str, required=True) 21 | args.add_argument("--data", "-d", type=str, required=True, 22 | help="Path to data directory containing output of the " 23 | "annotation step.") 24 | return args.parse_args() 25 | 26 | 27 | def _compute_ppl( 28 | text_rec: dict, meta_rec: dict, lm: LanguageModel, lang: str 29 | ) -> Union[float, None]: 30 | content = text_rec["text"] 31 | 32 | # identify top lang 33 | langs: Dict[str, float] = meta_rec["languages_fasttext"] 34 | top_lang = max(langs, key=langs.get) 35 | top_lang = top_lang.replace("__label__", "") 36 | 37 | if top_lang == lang: 38 | # compute perplexity 39 | perplexity = lm.compute_perplexity(content=content) 40 | else: 41 | perplexity = meta_rec.get("perplexity", None) 42 | 43 | return perplexity 44 | 45 | 46 | def _compute_doclaynet_score() -> Union[float, None]: 47 | warnings.warn("doclaynet similarity score not implemented yet") 48 | return None 49 | 50 | 51 | def process_shard(shard_id: str, data_dir, args: argparse.Namespace): 52 | print(f"(worker_id={os.getpid()}) start processing shard {shard_id}...") 53 | 54 | # get file paths 55 | text_fp = data_dir / "text" / f"doc_text_{shard_id}.jsonl" 56 | meta_fp = data_dir / "meta" / f"doc_meta_{shard_id}.jsonl" 57 | 58 | # make temporary file to store results 59 | ppl_meta_fp = data_dir / "meta_ppl" / f"temp_doc_meta_{shard_id}.jsonl" 60 | 61 | if not (data_dir / "meta_ppl").exists(): 62 | (data_dir / "meta_ppl").mkdir() 63 | print(f"(worker_id={os.getpid()}) created directory " 64 | f"{str(data_dir / 'meta_ppl')}") 65 | 66 | # load models 67 | sp_fp = Path("resources", "wikipedia-models", f"{args.lang}.sp.model") 68 | lm_fp = Path("resources", "wikipedia-models", f"{args.lang}.arpa.bin") 69 | lm = LanguageModel(sp_model=sp_fp, lm_model=lm_fp) 70 | 71 | num_records = 0 72 | 73 | # load data 74 | with jsonlines.open(ppl_meta_fp, "w") as res_writer: 75 | with jsonlines.open(text_fp) as text_reader, \ 76 | jsonlines.open(meta_fp) as meta_reader: 77 | for text, meta in zip(text_reader, meta_reader): 78 | # compute perplexity 79 | perplexity = _compute_ppl(text, meta, lm, args.lang) 80 | meta["perplexity"] = perplexity 81 | 82 | # add to results 83 | res_writer.write(meta) 84 | 85 | num_records += 1 86 | 87 | print(f"[worker_id={os.getpid()}] done with {shard_id}; " 88 | f"num_recs: {num_records:<6}") 89 | 90 | 91 | def _prepare_models(args: argparse.Namespace): 92 | def _dl_model(url, out_dir: Path): 93 | subprocess.run(["wget", "-c", "-P", out_dir, url]) 94 | 95 | sp_fp = Path("resources", "wikipedia-models", f"{args.lang}.sp.model") 96 | if not sp_fp.is_file(): 97 | print(f"downloading {args.lang} sentencepiece model...") 98 | _dl_model(WIKI_SP_URL.format(lang=args.lang), sp_fp.parent) 99 | 100 | lm_fp = Path("resources", "wikipedia-models", f"{args.lang}.arpa.bin") 101 | if not lm_fp.is_file(): 102 | print(f"downloading {args.lang} Kneser-Ney model...") 103 | _dl_model(WIKI_LM_URL.format(lang=args.lang), lm_fp.parent) 104 | 105 | 106 | def main(): 107 | args = parse_args() 108 | 109 | # check if models exist -- if not, download them.
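# note: _prepare_models (defined above) fetches the missing {lang}.sp.model and
# {lang}.arpa.bin files via wget from the cc_net URLs (WIKI_SP_URL / WIKI_LM_URL)
# into resources/wikipedia-models/, and leaves files that already exist untouched.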
110 | _prepare_models(args) 111 | 112 | data_root = Path(args.data) 113 | 114 | if not data_root.exists(): 115 | raise FileNotFoundError(f"could not find data directory: {data_root}") 116 | 117 | text_dir = data_root / "text" 118 | 119 | shard_ids = list( 120 | s.stem.replace("doc_text_", "") for s in text_dir.glob("*.jsonl") 121 | if s.is_file() and s.stem.startswith("doc_text_") 122 | ) 123 | 124 | num_workers = joblib.cpu_count() // 2 125 | print(f"num_workers: {num_workers}") 126 | 127 | with mp.Pool(processes=num_workers) as pool: 128 | pool.starmap( 129 | process_shard, 130 | itertools.product(shard_ids, [data_root], [args]) 131 | ) 132 | 133 | 134 | if __name__ == '__main__': 135 | main() 136 | -------------------------------------------------------------------------------- /app/requirements.txt: -------------------------------------------------------------------------------- 1 | aiosignal==1.3.1 2 | anyio==3.6.2 3 | attrs==22.2.0 4 | certifi==2022.12.7 5 | cffi==1.15.1 6 | charset-normalizer==3.1.0 7 | click==8.1.3 8 | colorclass==2.2.2 9 | contourpy==1.0.7 10 | cramjam==2.6.2 11 | cryptography==40.0.1 12 | cycler==0.11.0 13 | deprecation==2.1.0 14 | dill==0.3.6 15 | distlib==0.3.6 16 | easygui==0.98.3 17 | fastparquet==2023.4.0 18 | fasttext==0.9.2 19 | filelock==3.11.0 20 | fonttools==4.39.3 21 | frozenlist==1.3.3 22 | fsspec==2023.4.0 23 | greenlet==2.0.2 24 | grpcio==1.53.0 25 | idna==3.4 26 | img2pdf==0.4.4 27 | importlib-resources==5.12.0 28 | iso639==0.1.4 29 | joblib==1.2.0 30 | jsonlines==3.1.0 31 | jsonschema==4.17.3 32 | kiwisolver==1.4.4 33 | lxml==4.9.2 34 | matplotlib==3.7.1 35 | msgpack==1.0.5 36 | msoffcrypto-tool==5.0.1 37 | numpy==1.21.5 38 | olefile==0.46 39 | oletools==0.60.1 40 | opencv-python==4.5.5.64 41 | packaging==23.0 42 | pandas==1.5.1 43 | pcodedmp==1.2.6 44 | pdf2image==1.16.0 45 | pdfminer.six==20221105 46 | pdfplumber==0.8.1 47 | pikepdf==7.1.2 48 | Pillow==9.4.0 49 | pkgutil_resolve_name==1.3.10 50 | platformdirs==3.2.0 51 | protobuf==4.22.1 52 | psutil==5.9.4 53 | pybind11==2.10.4 54 | pycparser==2.21 55 | pyparsing==2.4.7 56 | pyrsistent==0.19.3 57 | python-dateutil==2.8.2 58 | python-docx==0.8.11 59 | pytz==2023.3 60 | PyYAML==6.0 61 | regex==2021.11.10 62 | requests==2.28.2 63 | six==1.16.0 64 | sniffio==1.3.0 65 | SQLAlchemy==2.0.9 66 | tabulate==0.9.0 67 | tqdm==4.64.1 68 | typing_extensions==4.5.0 69 | urllib3==1.26.15 70 | zipp==3.15.0 71 | py4j==0.10.9.5 72 | warcio==1.7.4 73 | pyarrow==12.0.0 74 | gitpython==3.1.32 75 | -------------------------------------------------------------------------------- /app/resources/fasttext-models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/resources/fasttext-models/.gitkeep -------------------------------------------------------------------------------- /app/resources/wikipedia-models/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/resources/wikipedia-models/.gitkeep -------------------------------------------------------------------------------- /app/scripts/annotation-kickoff.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | WORKERS=25 6 | TIMESTAMP=$(date +"%Y%m%d_%H%M%S") 7 | CRAWL_ID="$1" 8 | DATA_ROOT="$2" 9 | 
OUTPUT_DIR="data/annotated/${CRAWL_ID}/${TIMESTAMP}" 10 | PARTITIONS_DIR="${OUTPUT_DIR}/partitions" 11 | mkdir -p "$PARTITIONS_DIR" 12 | 13 | echo "CRAWL_ID: $CRAWL_ID" 14 | echo "DATA_ROOT: $DATA_ROOT" 15 | echo "PARTITIONS_DIR: $PARTITIONS_DIR" 16 | echo "OUTPUT_DIR: $OUTPUT_DIR" 17 | 18 | TMP_FILE="${PARTITIONS_DIR}/tmp.txt" 19 | 20 | echo $(find "$DATA_ROOT" -type f -name "*.tar.gz") | tr " " "\n" >"$TMP_FILE" 21 | 22 | # split into partitions 23 | N_FILES=$(wc -l <"$TMP_FILE") 24 | N_FILES_PER_PARTITION=$((N_FILES / WORKERS + 1)) 25 | split -d -l $N_FILES_PER_PARTITION "$TMP_FILE" "${PARTITIONS_DIR}/part_" 26 | 27 | # remove tmp file 28 | rm "$TMP_FILE" 29 | 30 | # rename partitions to have .txt extension 31 | for f in "${PARTITIONS_DIR}/part_"*; do 32 | mv "$f" "${f}.txt" 33 | echo "created partition ${f}.txt" 34 | done 35 | 36 | sbatch scripts/annotation-launch.sbatch "$CRAWL_ID" "$OUTPUT_DIR" "$PARTITIONS_DIR" 37 | -------------------------------------------------------------------------------- /app/scripts/annotation-launch.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=93:59:00 4 | #SBATCH --job-name=annotate 5 | #SBATCH --cpus-per-task=24 6 | #SBATCH --mem-per-cpu=4GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --array=1-25 11 | #SBATCH --output="logs/annotation/mp_annotation-docs-%j.out" 12 | #SBATCH --error="logs/annotation/mp_annotation-docs-%j.err" 13 | 14 | set -e 15 | 16 | mkdir -p logs/annotation 17 | 18 | # load modules 19 | # [... placeholder ...] 20 | 21 | # activate virtual environment 22 | source .venv/bin/activate 23 | 24 | # read args 25 | CRAWL_ID="$1" 26 | OUTPUT_DIR="$2" 27 | PARTITIONS_DIR="$3" 28 | 29 | # export env variables 30 | export SLURM_CPUS_PER_TASK 31 | 32 | echo "SLURM_CPUS_PER_TASK: ${SLURM_CPUS_PER_TASK}" 33 | echo "SLURM_MEM_PER_CPU: ${SLURM_MEM_PER_CPU}" 34 | 35 | if [ -z "$PARTITIONS_DIR" ]; then 36 | echo "PARTITIONS_DIR is not set" 37 | exit 1 38 | fi 39 | 40 | if [ -z "$CRAWL_ID" ]; then 41 | echo "CRAWL_ID is not set" 42 | exit 1 43 | fi 44 | 45 | if [ -z "$OUTPUT_DIR" ]; then 46 | echo "OUTPUT_DIR is not set" 47 | exit 1 48 | fi 49 | 50 | INPUT_FILE=$(ls ${PARTITIONS_DIR}/part_*.txt | sed -n "${SLURM_ARRAY_TASK_ID}p") 51 | echo "starting annotation on $(hostname) with inputs from ${INPUT_FILE}; using ${SLURM_CPUS_PER_TASK} cpu cores." 52 | 53 | python -u annotate_run.py \ 54 | --input_files "$INPUT_FILE" \ 55 | --crawl_id "$CRAWL_ID" \ 56 | --output_dir "$OUTPUT_DIR" \ 57 | --soffice_executable ".apps/libreoffice/opt/libreoffice7.4/program/soffice" \ 58 | --max_docs -1 59 | -------------------------------------------------------------------------------- /app/scripts/cc-parse-launch.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=cc_docs 5 | #SBATCH --cpus-per-task=16 6 | #SBATCH --mem-per-cpu=2GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --array=1-180 11 | #SBATCH --output="logs/cc_docs/mp_cc-docs-%j.out" 12 | #SBATCH --error="logs/cc_docs/mp_cc-docs-%j.err" 13 | 14 | set -e 15 | 16 | mkdir -p logs/cc_docs 17 | 18 | # load modules 19 | # [... placeholder ...] 
20 | 21 | # activate virtual environment 22 | source .venv/bin/activate 23 | 24 | export SLURM_CPUS_PER_TASK 25 | 26 | INPUT_DIR="$1"/"${SLURM_ARRAY_TASK_ID}" 27 | CC_DUMP="$2" 28 | 29 | echo "starting url parsing on ${HOSTNAME} with inputs from ${INPUT_DIR} for dump ${CC_DUMP}; using ${SLURM_CPUS_PER_TASK} cpu cores." 30 | 31 | python -u cc_parse_snapshot.py \ 32 | --input "$INPUT_DIR" \ 33 | --cc_dump "$CC_DUMP" 34 | -------------------------------------------------------------------------------- /app/scripts/download-launch.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=download 5 | #SBATCH --cpus-per-task=64 6 | #SBATCH --mem-per-cpu=2GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --array=1-25 11 | #SBATCH --output="logs/download/mp_download-docs-%j.out" 12 | #SBATCH --error="logs/download/mp_download-docs-%j.err" 13 | 14 | set -e 15 | 16 | mkdir -p logs/download 17 | 18 | # load modules 19 | # [... placeholder ...] 20 | 21 | # activate virtual environment 22 | source .venv/bin/activate 23 | 24 | INPUT_FILE="$1/${SLURM_ARRAY_TASK_ID}.parquet" 25 | OUTPUT="$3" 26 | 27 | echo "starting download on ${HOSTNAME} with inputs from ${INPUT_FILE}, outputting to ${OUTPUT}; using ${SLURM_CPUS_PER_TASK} cpu cores" 28 | python -u download_run.py -i "$INPUT_FILE" -ss $2 -wd "$OUTPUT" 29 | -------------------------------------------------------------------------------- /app/scripts/install_libreoffice_centos.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # exit when any command fails 4 | set -e 5 | 6 | # echo an error message before exiting 7 | trap 'echo "\"${last_command}\" command exited with code $?."' EXIT 8 | 9 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then 10 | distro=$(cat /etc/os-release | grep "^ID=" | cut -d "=" -f 2 | tr -d '"') 11 | else 12 | echo "This script does not handle OS $OSTYPE ! Check the README.md for installation instructions." 13 | exit 1 14 | fi 15 | 16 | if [[ "$distro" == "centos" ]]; then 17 | echo "installing libreoffice on centos..." 18 | lo_pkg=LibreOffice_7.4.7_Linux_x86-64_rpm.tar.gz 19 | lo_path=/libreoffice/stable/7.4.7/rpm/x86_64/${lo_pkg} 20 | 21 | OPENDOC_ROOT=$(dirname "$(dirname "$(readlink -f "$0")")") 22 | APP_LOCATION=${OPENDOC_ROOT}/.apps/libreoffice 23 | 24 | # create directory for libreoffice app 25 | mkdir -p "${APP_LOCATION}" 26 | 27 | # download and unpack package 28 | wget https://download.documentfoundation.org/${lo_path} -P "${APP_LOCATION}" 29 | tar xvzf ${APP_LOCATION}/LibreOffice_7.4.7_Linux_x86-64_rpm.tar.gz --directory "${APP_LOCATION}" 30 | 31 | # unpack rpm files 32 | for i in ${APP_LOCATION}/LibreOffice_7.4.7.2_Linux_x86-64_rpm/RPMS/*.rpm; do 33 | rpm2cpio $i | ( 34 | cd $APP_LOCATION 35 | cpio -id 36 | ) 37 | done 38 | 39 | # cleanup 40 | echo "cleaning up..." 41 | rm -rv ${APP_LOCATION}/LibreOffice_7.4.7.2_Linux_x86-64_rpm/ 42 | rm -v ${APP_LOCATION}/LibreOffice_7.4.7_Linux_x86-64_rpm.tar.gz 43 | 44 | # install unoserver 45 | echo "pip installing unoserver..." 
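# unoserver must run under LibreOffice's bundled Python so that it can import the
# UNO bindings shipped with the suite; pip is therefore bootstrapped into that
# interpreter via get-pip.py below instead of using the system Python.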
46 | wget https://bootstrap.pypa.io/get-pip.py 47 | ${APP_LOCATION}/opt/libreoffice7.4/program/python get-pip.py 48 | ${APP_LOCATION}/opt/libreoffice7.4/program/python -m pip install unoserver 49 | 50 | # fix shebangs in unoserver and unoconvert (when install with pip the shebangs get messed up) 51 | sed -i '1s/python\.bin/python/' ${APP_LOCATION}/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoserver 52 | sed -i '1s/python\.bin/python/' ${APP_LOCATION}/opt/libreoffice7.4/program/python-core-3.8.16/bin/unoconvert 53 | 54 | # add unoserver and unoconvert to path 55 | echo "export PATH=${APP_LOCATION}/opt/libreoffice7.4/program/python-core-3.8.16/bin:\$PATH" >>~/.bashrc 56 | echo "added unoserver and unoconvert to path. To test it, run 'unoserver -h' and 'unoconvert -h'." 57 | 58 | else 59 | echo "this script does not support distro $distro" 60 | exit 1 61 | fi 62 | -------------------------------------------------------------------------------- /app/scripts/pp-compute-perplexity.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=annotate 5 | #SBATCH --cpus-per-task=32 6 | #SBATCH --mem-per-cpu=4GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --output="logs/postprocess/quality-indicators-%j.out" 11 | #SBATCH --error="logs/postprocess/quality-indicators-%j.err" 12 | 13 | set -e 14 | 15 | # load modules 16 | module load eth_proxy jdk gcc/6.3.0 python/3.8.5 17 | 18 | # activate virtual environment 19 | source .venv/bin/activate 20 | 21 | DATA_ROOT="/cluster/project/zhang/opendoc/data/annotated/cc_main_2022_49/20230531_144800" 22 | LANGS=("ru" "en" "uk" "pl" "es" "fr" "it" "pt" "cs" "hu" "de" "bg" "tr" "nl" "el") 23 | for lang in "${LANGS[@]}"; do 24 | echo "computing perplexity values for $lang" 25 | 26 | python pp_compute_perplexity.py \ 27 | --data "$DATA_ROOT" \ 28 | --lang "$lang" 29 | 30 | # remove language models 31 | rm resources/wikipedia-models/${lang}.arpa.bin 32 | rm resources/wikipedia-models/${lang}.sp.model 33 | done 34 | -------------------------------------------------------------------------------- /app/scripts/run-filter-tars.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2206 3 | #SBATCH --time=23:59:00 4 | #SBATCH --job-name=annotate 5 | #SBATCH --cpus-per-task=64 6 | #SBATCH --mem-per-cpu=2GB 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks=1 9 | #SBATCH --tasks-per-node=1 10 | #SBATCH --output="logs/postprocess/quality-indicators-%j.out" 11 | #SBATCH --error="logs/postprocess/quality-indicators-%j.err" 12 | 13 | set -e 14 | 15 | mkdir -p logs/postprocess 16 | 17 | # load modules 18 | # [... placeholder ...] 19 | 20 | # activate virtual environment 21 | source .venv/bin/activate 22 | 23 | python utilties/run_filter_tars.py --data_root "$1" 24 | -------------------------------------------------------------------------------- /app/scripts/run_single_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | trap cleanup_on_error ERR SIGINT SIGTERM 5 | 6 | cleanup_on_error() { 7 | echo "Error: $0:$LINENO: command \`$BASH_COMMAND\` failed with exit code $?" 
8 | exit 1 9 | } 10 | 11 | help() { 12 | echo "Usage: run_single_node.sh [ -d | --dump_id ] [-m | --max_docs]" 13 | exit 2 14 | } 15 | 16 | while [[ $# -gt 0 ]]; do 17 | key="$1" 18 | case $key in 19 | -d | --dump_id) 20 | DUMP_ID="$2" 21 | shift 2 22 | ;; 23 | -m | --max_docs) 24 | MAX_DOCS="$2" 25 | shift 2 26 | ;; 27 | -h | --help) 28 | help 29 | ;; 30 | --) 31 | shift 32 | break 33 | ;; 34 | *) 35 | echo "Invalid option: -$1" 36 | help 37 | ;; 38 | esac 39 | done 40 | 41 | # generate random run id 42 | RUN_ID=$(openssl rand -hex 12) 43 | 44 | CLEAN_URLS_DIR="/mnt/data/${RUN_ID}/clean_urls" 45 | SOURCES_DIR="/mnt/data/${RUN_ID}/download/${DUMP_ID}" 46 | OUTPUT_DIR="/mnt/data/${RUN_ID}/annotated/${DUMP_ID}" 47 | 48 | # create directories 49 | mkdir -p "$CLEAN_URLS_DIR" 50 | mkdir -p "$SOURCES_DIR" 51 | mkdir -p "$OUTPUT_DIR" 52 | 53 | printf "Created directories:\n" 54 | printf " * CLEAN_URLS_DIR: %s\n" "$CLEAN_URLS_DIR" 55 | printf " * SOURCES_DIR: %s\n" "$SOURCES_DIR" 56 | printf " * OUTPUT_DIR: %s\n" "$OUTPUT_DIR" 57 | 58 | if [ -z "${MAX_DOCS}" ]; then 59 | MAX_DOCS=-1 60 | fi 61 | 62 | # get file fid 63 | case $DUMP_ID in 64 | "CC-MAIN-2013-48") 65 | FID="1359HSlQighPkMV3iEf_z6pO5rdknZhJ_" 66 | ;; 67 | "CC-MAIN-2016-50") 68 | FID="14_YuQeu6S0u2lKYKOcpEy5AUjmvSeQdE" 69 | ;; 70 | "CC-MAIN-2020-40") 71 | FID="1hKFv4gkUqV_cJcR-02J7rbVm2vJ8HRHH" 72 | ;; 73 | "CC-MAIN-2021-43") 74 | FID="1wuXzQ6RKmV56RldqRImbbbHnnza7GSpF" 75 | ;; 76 | "CC-MAIN-2023-06") 77 | FID="1mKWK79_M_ENGJy781tPUCtsNJtuoxu5d" 78 | ;; 79 | "CC-MAIN-2023-14") 80 | FID="15Od3TdMrkondhfyCNCBSxijXbuyq5rz3" 81 | ;; 82 | *) 83 | echo "Invalid dump id: $DUMP_ID" 84 | exit 1 85 | ;; 86 | esac 87 | 88 | # download urls 89 | printf "\n================================\nFetching URL List...\n" 90 | gdown "https://drive.google.com/uc?id=$FID" -O "$CLEAN_URLS_DIR/$DUMP_ID.parquet" 91 | 92 | mkdir -p /usr/app/data/tmp 93 | 94 | # 1) Prepare urls for download 95 | printf "\n================================\nURL prep...\n" 96 | python3 download_prepare_urls.py \ 97 | --cc_dump "$DUMP_ID" \ 98 | --clean_urls_dir "$CLEAN_URLS_DIR" \ 99 | --num_nodes 1 100 | 101 | # 2) Download documents 102 | printf "\n================================\nDownloading documents...\n" 103 | python3 download_run.py \ 104 | --input "${CLEAN_URLS_DIR}/${DUMP_ID}/1.parquet" \ 105 | --subset_size $MAX_DOCS \ 106 | --write_dir "$SOURCES_DIR" 107 | 108 | # 3) Annotate documents 109 | printf "\n================================\nAnnotating documents...\n" 110 | python3 annotate_run.py \ 111 | --data_dir "$SOURCES_DIR" \ 112 | --crawl_id "$DUMP_ID" \ 113 | --max_docs $MAX_DOCS \ 114 | --output_dir "$OUTPUT_DIR" \ 115 | --soffice_executable "soffice" 116 | 117 | printf "\n---------------------------------\n" 118 | printf "WordScape pipeline complete.\n" 119 | printf "Dataset is in %s\n" "$OUTPUT_DIR" 120 | -------------------------------------------------------------------------------- /app/settings/__init__.py: -------------------------------------------------------------------------------- 1 | from . import annotation 2 | from . import bbox 3 | from . import colors 4 | from . import content_awareness 5 | from . import entities 6 | from . import filesystem 7 | from . 
import download 8 | -------------------------------------------------------------------------------- /app/settings/annotation.py: -------------------------------------------------------------------------------- 1 | # possible sources of colorization decision 2 | 3 | ANNOTATION_BUILTIN = "builtin" 4 | ANNOTATION_XML_PATTERN = "xml_pattern" 5 | ANNOTATION_CONTENT_AWARE_HEURISTIC = "content_aware_heuristic" 6 | ANNOTATION_BODY_HEADING_HEURISTIC_USINGBUILTIN = "body_heading_heuristic_usingbuiltin" 7 | ANNOTATION_BODY_HEADING_HEURISTIC_BASE = "body_heading_heuristic_base" 8 | 9 | DECISION_SOURCES = [ 10 | ANNOTATION_BUILTIN, 11 | ANNOTATION_XML_PATTERN, 12 | ANNOTATION_CONTENT_AWARE_HEURISTIC, 13 | ANNOTATION_BODY_HEADING_HEURISTIC_USINGBUILTIN, 14 | ANNOTATION_BODY_HEADING_HEURISTIC_BASE 15 | ] 16 | # builtins vs heuristics 17 | BUILTIN_SOURCES = [ 18 | ANNOTATION_BUILTIN, 19 | ANNOTATION_XML_PATTERN 20 | ] 21 | HEURISTIC_SOURCES = [ 22 | ANNOTATION_CONTENT_AWARE_HEURISTIC, 23 | ANNOTATION_BODY_HEADING_HEURISTIC_USINGBUILTIN, 24 | ANNOTATION_BODY_HEADING_HEURISTIC_BASE, 25 | ] 26 | -------------------------------------------------------------------------------- /app/settings/bbox.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains all basic settings related to bounding boxes. 3 | """ 4 | import settings.entities as entities 5 | 6 | # tolerance for bbox color detection 7 | BBOX_COLOR_TOL = 1 8 | 9 | # minimum size of bounding boxes (expressed as a fraction of page width and 10 | # page height) 11 | DEFAULT_FRACTION_NORMAL = 1e-2 12 | DEFAULT_FRACTION_SMALL = 1e-3 13 | DEFAULT_FRACTION_TINY = 5e-4 14 | 15 | BBOX_MIN_FRACTIONS = { 16 | entities.ENTITY_TITLE_ID: DEFAULT_FRACTION_NORMAL, 17 | entities.ENTITY_HEADING_1_ID: DEFAULT_FRACTION_NORMAL, 18 | entities.ENTITY_HEADING_2_ID: DEFAULT_FRACTION_NORMAL, 19 | entities.ENTITY_HEADING_3_ID: DEFAULT_FRACTION_NORMAL, 20 | entities.ENTITY_HEADING_4_ID: DEFAULT_FRACTION_NORMAL, 21 | entities.ENTITY_HEADING_5_ID: DEFAULT_FRACTION_NORMAL, 22 | entities.ENTITY_HEADING_6_ID: DEFAULT_FRACTION_NORMAL, 23 | entities.ENTITY_HEADING_7_ID: DEFAULT_FRACTION_NORMAL, 24 | entities.ENTITY_HEADING_8_ID: DEFAULT_FRACTION_NORMAL, 25 | entities.ENTITY_HEADING_9_ID: DEFAULT_FRACTION_NORMAL, 26 | entities.ENTITY_TEXT_ID: DEFAULT_FRACTION_NORMAL, 27 | entities.ENTITY_LIST_ID: DEFAULT_FRACTION_NORMAL, 28 | entities.ENTITY_HEADER_ID: DEFAULT_FRACTION_NORMAL, 29 | entities.ENTITY_FOOTER_ID: DEFAULT_FRACTION_NORMAL, 30 | entities.ENTITY_TABLE_HEADER_ID: DEFAULT_FRACTION_NORMAL, 31 | entities.ENTITY_TABLE_HEADER_CELL_ID: DEFAULT_FRACTION_NORMAL, 32 | entities.ENTITY_TABLE_ID: DEFAULT_FRACTION_NORMAL, 33 | entities.ENTITY_TABLE_CELL_ID: DEFAULT_FRACTION_NORMAL, 34 | entities.ENTITY_TABLE_CAPTION_ID: DEFAULT_FRACTION_NORMAL, 35 | entities.ENTITY_TOC_ID: DEFAULT_FRACTION_NORMAL, 36 | entities.ENTITY_BIBLIOGRAPHY_ID: DEFAULT_FRACTION_NORMAL, 37 | entities.ENTITY_QUOTE_ID: DEFAULT_FRACTION_NORMAL, 38 | entities.ENTITY_EQUATION_ID: DEFAULT_FRACTION_NORMAL, 39 | entities.ENTITY_FIGURE_ID: DEFAULT_FRACTION_NORMAL, 40 | entities.ENTITY_FOOTNOTE_ID: DEFAULT_FRACTION_NORMAL, 41 | entities.ENTITY_ANNOTATION_ID: DEFAULT_FRACTION_NORMAL, 42 | entities.ENTITY_FORM_FIELD_ID: DEFAULT_FRACTION_TINY, 43 | entities.ENTITY_FORM_TAG_ID: DEFAULT_FRACTION_TINY, 44 | } 45 | -------------------------------------------------------------------------------- /app/settings/content_awareness.py: 
-------------------------------------------------------------------------------- 1 | # settings for content-aware heuristics 2 | 3 | # symbols we consider to constitute a possible form field 4 | # note the special triple-period symbol, which word likes to auto-create 5 | FORM_FIELD_SYMBOLS = ['_', '.', '…'] 6 | 7 | # symbols we consider to indicate a quote; must be at start and end. 8 | QUOTE_SYMBOLS = ["\"", "\'"] 9 | 10 | # symbols we consider to constitute a possible numbering 11 | # ! warning: we also include any number followed by a '.', there are infinite 12 | # ! such possibilities. 13 | # here we only list single symbols that we consider to indicate a list entry 14 | # also, the check for builtin numbering indicators is handled separately 15 | NUMBERING_SYMBOLS = [ 16 | '-', '\u2022', '\u27A2', '\u25E6', '\u25AA', '\u25AB', '\u25CF', '\u25CB', 17 | '\u25A0', '\u25A1', '\u25B6', '\u2043', '\u25C6', '\u25C7', '\u25D0', 18 | '\u25D1' 19 | ] 20 | 21 | NUMBERING_FOLLOWERS = ['\.', ':', '\)'] 22 | -------------------------------------------------------------------------------- /app/settings/download.py: -------------------------------------------------------------------------------- 1 | import regex 2 | 3 | # constants 4 | MAX_FILESIZE = 90 * 1024 * 1024 # 90 MB 5 | 6 | # string patterns 7 | DOC_FN_PATTERN = "doc_{url_hash}{ext}" 8 | TAR_PATTERN = "docs_{part_id}-shard_{shard_num:05d}.tar.gz" 9 | META_DATA_FN_PATTERN = "meta_{part_id}.parquet" 10 | LOG_FN_PATTERN = "info_{part_id}.log" 11 | LOG_FORMAT = "[%(asctime)s]::%(name)s::%(levelname)s::%(message)s" 12 | 13 | VALID_CT_REGEX = pattern = regex.compile( 14 | r'(application|text)/.*(openxml|word|doc|msword|msdownload|rtf).*', 15 | flags=regex.IGNORECASE | regex.DOTALL 16 | ) 17 | 18 | # header fields 19 | HEADER_FIELDS = [ 20 | "content-type", 21 | "content-length", 22 | "content-encoding", 23 | "content-language", 24 | "last-modified" 25 | ] 26 | 27 | # mapping from olet library names to DB olet fields 28 | OLET_DB_MAPPING = { 29 | 'File format': 'olet_ftype', 30 | 'Container format': 'olet_container', 31 | 'Properties code page': 'olet_codepage', 32 | 'Python codec': 'olet_python_codec', 33 | 'Application name': 'olet_appname', 34 | 'Author': 'olet_author', 35 | 'Encrypted': 'olet_encrypted', 36 | 'VBA Macros': 'olet_vba', 37 | 'XLM Macros': 'olet_xlm', 38 | 'External Relationships': 'olet_ext_rels', 39 | 'ObjectPool': 'olet_ObjectPool', 40 | 'Flash objects': 'olet_flash' 41 | } 42 | -------------------------------------------------------------------------------- /app/settings/entities.py: -------------------------------------------------------------------------------- 1 | TOTAL_BASE_LABELS = 31 2 | LABEL_NUMS = [i for i in range(0, TOTAL_BASE_LABELS)] 3 | 4 | ENTITY_TITLE_NAME = "title" 5 | ENTITY_TITLE_ID = 0 6 | 7 | ENTITY_HEADING_1_NAME = "heading_1" 8 | ENTITY_HEADING_1_ID = 1 9 | 10 | ENTITY_HEADING_2_NAME = "heading_2" 11 | ENTITY_HEADING_2_ID = 2 12 | 13 | ENTITY_HEADING_3_NAME = "heading_3" 14 | ENTITY_HEADING_3_ID = 3 15 | 16 | ENTITY_HEADING_4_NAME = "heading_4" 17 | ENTITY_HEADING_4_ID = 4 18 | 19 | ENTITY_HEADING_5_NAME = "heading_5" 20 | ENTITY_HEADING_5_ID = 5 21 | 22 | ENTITY_HEADING_6_NAME = "heading_6" 23 | ENTITY_HEADING_6_ID = 6 24 | 25 | ENTITY_HEADING_7_NAME = "heading_7" 26 | ENTITY_HEADING_7_ID = 7 27 | 28 | ENTITY_HEADING_8_NAME = "heading_8" 29 | ENTITY_HEADING_8_ID = 8 30 | 31 | ENTITY_HEADING_9_NAME = "heading_9" 32 | ENTITY_HEADING_9_ID = 9 33 | 34 | ENTITY_TEXT_NAME = "text" 35 | ENTITY_TEXT_ID = 10 36 
| 37 | ENTITY_LIST_NAME = "list" 38 | ENTITY_LIST_ID = 11 39 | 40 | ENTITY_HEADER_NAME = "header" 41 | ENTITY_HEADER_ID = 12 42 | 43 | ENTITY_FOOTER_NAME = "footer" 44 | ENTITY_FOOTER_ID = 13 45 | 46 | ENTITY_TABLE_HEADER_NAME = "table_header" 47 | ENTITY_TABLE_HEADER_ID = 14 48 | 49 | ENTITY_TABLE_HEADER_CELL_NAME = "table_header_cell" 50 | ENTITY_TABLE_HEADER_CELL_ID = 15 51 | 52 | ENTITY_TABLE_NAME = "table" 53 | ENTITY_TABLE_ID = 16 54 | 55 | ENTITY_TABLE_CELL_NAME = "table_cell" 56 | ENTITY_TABLE_CELL_ID = 17 57 | 58 | ENTITY_TOC_NAME = "toc" 59 | ENTITY_TOC_ID = 18 60 | 61 | ENTITY_BIBLIOGRAPHY_NAME = "bibliography" 62 | ENTITY_BIBLIOGRAPHY_ID = 19 63 | 64 | ENTITY_QUOTE_NAME = "quote" 65 | ENTITY_QUOTE_ID = 20 66 | 67 | ENTITY_EQUATION_NAME = "equation" 68 | ENTITY_EQUATION_ID = 21 69 | 70 | ENTITY_FIGURE_NAME = "figure" 71 | ENTITY_FIGURE_ID = 22 72 | 73 | ENTITY_TABLE_CAPTION_NAME = "table_caption" 74 | ENTITY_TABLE_CAPTION_ID = 23 75 | 76 | ENTITY_FOOTNOTE_NAME = "footnote" 77 | ENTITY_FOOTNOTE_ID = 24 78 | 79 | ENTITY_ANNOTATION_NAME = "annotation" 80 | ENTITY_ANNOTATION_ID = 25 81 | 82 | ENTITY_FORM_FIELD_NAME = "form_field" 83 | ENTITY_FORM_FIELD_ID = 26 84 | 85 | ENTITY_FORM_TAG_NAME = "form_tag" 86 | ENTITY_FORM_TAG_ID = 27 87 | 88 | ENTITY_TABLE_ROW_NAME = "table_row" 89 | ENTITY_TABLE_ROW_ID = 28 90 | 91 | ENTITY_TABLE_COLUMN_NAME = "table_column" 92 | ENTITY_TABLE_COLUMN_ID = 29 93 | 94 | ENTITY_TABLE_HEADER_ROW_NAME = "table_header_row" 95 | ENTITY_TABLE_HEADER_ROW_ID = 30 96 | 97 | # put all entity names in a list 98 | ALL_ENTITY_NAMES = [ 99 | eval(entity_name_var) 100 | for entity_name_var in dir() 101 | if ( 102 | entity_name_var.startswith("ENTITY_") 103 | and entity_name_var.endswith("_NAME") 104 | and isinstance(eval(entity_name_var), str) 105 | ) 106 | ] 107 | 108 | # put all entity ids in a list 109 | ALL_ENTITY_IDS = [ 110 | eval(entity_id_var) 111 | for entity_id_var in dir() 112 | if ( 113 | entity_id_var.startswith("ENTITY_") 114 | and entity_id_var.endswith("_ID") 115 | and isinstance(eval(entity_id_var), int) 116 | ) 117 | ] 118 | 119 | ENTITY_ID_TO_NAME = { 120 | ENTITY_TITLE_ID: ENTITY_TITLE_NAME, 121 | ENTITY_HEADING_1_ID: ENTITY_HEADING_1_NAME, 122 | ENTITY_HEADING_2_ID: ENTITY_HEADING_2_NAME, 123 | ENTITY_HEADING_3_ID: ENTITY_HEADING_3_NAME, 124 | ENTITY_HEADING_4_ID: ENTITY_HEADING_4_NAME, 125 | ENTITY_HEADING_5_ID: ENTITY_HEADING_5_NAME, 126 | ENTITY_HEADING_6_ID: ENTITY_HEADING_6_NAME, 127 | ENTITY_HEADING_7_ID: ENTITY_HEADING_7_NAME, 128 | ENTITY_HEADING_8_ID: ENTITY_HEADING_8_NAME, 129 | ENTITY_HEADING_9_ID: ENTITY_HEADING_9_NAME, 130 | ENTITY_TEXT_ID: ENTITY_TEXT_NAME, 131 | ENTITY_LIST_ID: ENTITY_LIST_NAME, 132 | ENTITY_HEADER_ID: ENTITY_HEADER_NAME, 133 | ENTITY_FOOTER_ID: ENTITY_FOOTER_NAME, 134 | ENTITY_TABLE_HEADER_ID: ENTITY_TABLE_HEADER_NAME, 135 | ENTITY_TABLE_HEADER_CELL_ID: ENTITY_TABLE_HEADER_CELL_NAME, 136 | ENTITY_TABLE_ID: ENTITY_TABLE_NAME, 137 | ENTITY_TABLE_CELL_ID: ENTITY_TABLE_CELL_NAME, 138 | ENTITY_TOC_ID: ENTITY_TOC_NAME, 139 | ENTITY_BIBLIOGRAPHY_ID: ENTITY_BIBLIOGRAPHY_NAME, 140 | ENTITY_QUOTE_ID: ENTITY_QUOTE_NAME, 141 | ENTITY_EQUATION_ID: ENTITY_EQUATION_NAME, 142 | ENTITY_FIGURE_ID: ENTITY_FIGURE_NAME, 143 | ENTITY_TABLE_CAPTION_ID: ENTITY_TABLE_CAPTION_NAME, 144 | ENTITY_FOOTNOTE_ID: ENTITY_FOOTNOTE_NAME, 145 | ENTITY_ANNOTATION_ID: ENTITY_ANNOTATION_NAME, 146 | ENTITY_FORM_FIELD_ID: ENTITY_FORM_FIELD_NAME, 147 | ENTITY_FORM_TAG_ID: ENTITY_FORM_TAG_NAME, 148 | ENTITY_TABLE_ROW_ID: ENTITY_TABLE_ROW_NAME, 149 | 
ENTITY_TABLE_COLUMN_ID: ENTITY_TABLE_COLUMN_NAME, 150 | ENTITY_TABLE_HEADER_ROW_ID: ENTITY_TABLE_HEADER_ROW_NAME, 151 | } 152 | 153 | ENTITY_NAME_TO_ID = {} 154 | for k, v in ENTITY_ID_TO_NAME.items(): 155 | ENTITY_NAME_TO_ID[v] = k 156 | -------------------------------------------------------------------------------- /app/settings/entity_names.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": 0, 3 | "heading_1": 1, 4 | "heading_2": 2, 5 | "heading_3": 3, 6 | "heading_4": 4, 7 | "heading_5": 5, 8 | "heading_6": 6, 9 | "heading_7": 7, 10 | "heading_8": 8, 11 | "heading_9": 9, 12 | "text": 10, 13 | "list": 11, 14 | "header": 12, 15 | "footer": 13, 16 | "table_header": 14, 17 | "table_header_cell": 15, 18 | "table": 16, 19 | "table_cell": 17, 20 | "toc": 18, 21 | "bibliography": 19, 22 | "quote": 20, 23 | "equation": 21, 24 | "figure": 22, 25 | "table_caption": 23, 26 | "footnote": 24, 27 | "annotation": 25, 28 | "form_field": 26, 29 | "form_tag": 27, 30 | "table_row": 28, 31 | "table_column": 29, 32 | "table_header_row": 30 33 | } -------------------------------------------------------------------------------- /app/settings/filesystem.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains basic directory structures for the project. 3 | """ 4 | from pathlib import Path 5 | 6 | ROOT = Path(__file__).parent.parent 7 | 8 | # resources 9 | RESOURCES_DIR = ROOT / "resources" 10 | FASTTEXT_CLASSIFIERS_DIR = Path(RESOURCES_DIR, "fasttext-models") 11 | 12 | # data dirs 13 | DATA_ROOT = ROOT / "data" 14 | DOC_SOURCES_DIR = DATA_ROOT / "doc_sources" 15 | CC_SEGMENT_DIR = DATA_ROOT / "crawl-data" 16 | CC_DIR = DATA_ROOT / "cc_urls" 17 | CLEAN_URLS_DIR = DATA_ROOT / "clean_urls" 18 | DOWNLOAD_DIR = DATA_ROOT / "download" 19 | 20 | # tmp dirs 21 | TMP_DIR = DATA_ROOT / "tmp" 22 | 23 | # fixed-location files 24 | ALEMBIC_INI_LOC = ROOT / "alembic.ini" 25 | 26 | # for pipeline extensions 27 | RAW_DIR = DATA_ROOT / "raw" 28 | EXPERIMENT_DIR = DATA_ROOT / "experiments" 29 | 30 | # structure of wordscape annotated files 31 | WS_MULTIMODAL = "multimodal" 32 | WS_META = "meta" 33 | -------------------------------------------------------------------------------- /app/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/annotation_quality.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | import settings.entities as entity_settings 4 | import settings.annotation as annotation_settings 5 | from src.annotation.colorization import ColorizationDecision 6 | 7 | __all__ = [ 8 | "calc_annotation_quality_score" 9 | ] 10 | 11 | IGNORE_ENTITY_IDS = [ 12 | entity_settings.ENTITY_TABLE_ROW_ID, 13 | entity_settings.ENTITY_TABLE_CELL_ID, 14 | entity_settings.ENTITY_TABLE_COLUMN_ID 15 | ] 16 | 17 | 18 | def calc_annotation_quality_score( 19 | colorization_decisions: 
List[ColorizationDecision], 20 | entity_counts: Dict[int, int], 21 | ) -> Tuple[float, Dict[int, float]]: 22 | r""" Calculate the annotation quality score for a document 23 | 24 | @param colorization_decisions: the colorization decisions for a document; 25 | this is a list of ColorizationDecision objects with the attributes: 26 | - text (str): the text of the element 27 | - decision_source (str): the source of the decision 28 | - entity_decision (int): the id of the entity category 29 | @param entity_counts: dictionary with the number of entities for each 30 | entity category 31 | 32 | @return: the annotation quality score for the document, and the proportion 33 | of builtin characters for each entity 34 | """ 35 | # count the number of characters for each entity 36 | char_counter = { 37 | k: {'builtin': 0, 'heuristic': 0} 38 | for k in entity_settings.ALL_ENTITY_IDS 39 | } 40 | 41 | for col_decision in colorization_decisions: 42 | category_id = col_decision.entity_decision 43 | 44 | if col_decision.text is None: 45 | # we assign text length 1 to entity categories that do not have 46 | # text (this only concerns tables and figures which are always 47 | # builtins) 48 | text_len = 1.0 49 | else: 50 | text_len = len(col_decision.text) 51 | 52 | if col_decision.decision_source in annotation_settings.BUILTIN_SOURCES: 53 | char_counter[category_id]['builtin'] += text_len 54 | else: 55 | char_counter[category_id]['heuristic'] += text_len 56 | 57 | # compute proportion of builtin characters for each entity 58 | builtin_props = dict.fromkeys(entity_settings.ALL_ENTITY_IDS, 0.0) 59 | 60 | for cat_id, char_counts in char_counter.items(): 61 | total_chars = char_counts['builtin'] + char_counts['heuristic'] 62 | 63 | if total_chars == 0: 64 | prop = 0.0 65 | else: 66 | prop = char_counts['builtin'] / total_chars 67 | 68 | builtin_props[cat_id] = prop 69 | 70 | # compute final score 71 | num_entities = sum(entity_counts.values()) 72 | 73 | if num_entities == 0: 74 | return 0.0, builtin_props 75 | 76 | quality_score = 0.0 77 | for entity_id, count in entity_counts.items(): 78 | if entity_id not in IGNORE_ENTITY_IDS: 79 | quality_score += count * builtin_props[entity_id] 80 | 81 | quality_score /= num_entities 82 | 83 | return quality_score, builtin_props 84 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/__init__.py: -------------------------------------------------------------------------------- 1 | # handlers 2 | from .colorization_handler import ColorizationHandler, ColorizationDecision 3 | from .heuristics.build_heuristics import ParagraphHeuristic 4 | 5 | # entities modules 6 | from .entities import colorize_builtin_form_elements 7 | from .entities import colorize_builtin_toc_elements 8 | from .entities import colorize_figures 9 | from .entities import colorize_header_and_footer 10 | from .entities import colorize_paragraph 11 | from .entities import colorize_table 12 | from .entities import colorize_text_boxes 13 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/colorize_doc.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Union 3 | from docx.document import Document as DocxDocument 4 | 5 | from src.annotation.colorization import ( 6 | ColorizationHandler, 7 | ParagraphHeuristic, 8 | colorize_builtin_form_elements, 9 | colorize_builtin_toc_elements, 10 | colorize_figures, 11 | 
colorize_header_and_footer, 12 | colorize_paragraph, 13 | colorize_text_boxes, 14 | colorize_table 15 | ) 16 | from src.annotation.config import AnnotationConfig 17 | from src.annotation.utils.color_utils import sanitize_figure_settings 18 | 19 | import settings.colors as color_settings 20 | 21 | 22 | def colorize_word_doc( 23 | word_doc: DocxDocument, 24 | colorization_handler: ColorizationHandler, 25 | config: AnnotationConfig, 26 | temp_dir: Union[pathlib.Path, None] = None, 27 | ) -> DocxDocument: 28 | r""" Colorize a word document, and return the colorized document 29 | 30 | @param word_doc: the word document to colorize 31 | @param colorization_handler: the colorization handler to use for 32 | colorization 33 | @param config: the annotation config to use for colorization 34 | @param temp_dir: directory to use for storing temporary files 35 | 36 | @return: the colorized word document 37 | """ 38 | # some elements do not have builtin styles, or styles we do not recognize. 39 | # For these cases, we build heuristics as a fallback option 40 | paragraph_heuristics = ParagraphHeuristic(word_doc, config) 41 | 42 | # sanitization step: change figure settings so that no preset styles are 43 | # applied which could change the color of figures 44 | sanitize_figure_settings(document=word_doc) 45 | 46 | # 1) colorize headers and footers 47 | colorize_header_and_footer( 48 | word_doc, colorization_handler=colorization_handler 49 | ) 50 | 51 | # 2) colorize text boxes 52 | colorize_text_boxes( 53 | word_doc, hsv_color=color_settings.COLOR_TEXT, 54 | colorization_handler=colorization_handler 55 | ) 56 | 57 | # 3) colorize tables 58 | for table in word_doc.tables: 59 | colorize_table(table, colorization_handler=colorization_handler) 60 | 61 | # 4) colorize paragraph elements 62 | for paragraph in word_doc.paragraphs: 63 | colorize_paragraph( 64 | paragraph, 65 | colorization_handler=colorization_handler, 66 | paragraph_heuristics=paragraph_heuristics 67 | ) 68 | 69 | # 5) colorize table of contents elements 70 | # ! 
this has to be done before forms, due to XML overlaps 71 | colorize_builtin_toc_elements( 72 | word_doc, colorization_handler=colorization_handler 73 | ) 74 | 75 | # 6) colorize built-in form elements 76 | # !this has to be done after regular colorization, because form fields may 77 | # !overlap with other entity types, therefore being overcolored if this is 78 | # !done first 79 | colorize_builtin_form_elements( 80 | word_doc, colorization_handler=colorization_handler 81 | ) 82 | 83 | # 6) colorize figures 84 | word_doc = colorize_figures( 85 | word_doc, temp_dir=temp_dir, colorization_handler=colorization_handler 86 | ) 87 | 88 | return word_doc 89 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/__init__.py: -------------------------------------------------------------------------------- 1 | from .figure import colorize_figures 2 | from .form import colorize_builtin_form_elements 3 | from .tables import colorize_table 4 | from .paragraph import colorize_paragraph 5 | from .text_box import colorize_text_boxes 6 | from .toc import colorize_builtin_toc_elements 7 | from .header_footer import colorize_header_and_footer 8 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/figure.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as _Document 2 | from docx import Document 3 | import io 4 | import numpy as np 5 | import os 6 | import pathlib 7 | from PIL import Image 8 | import tempfile 9 | import zipfile 10 | 11 | from src.annotation.colorization import ColorizationHandler 12 | from src.annotation.utils.color_utils import hsv_to_rgb 13 | from src.annotation.utils.updateable_zipfile import UpdateableZipFile 14 | 15 | import settings 16 | 17 | IMG_EXT = ( 18 | '.bmp', 19 | '.gif', 20 | '.jpeg', 21 | '.jpg', 22 | '.png', 23 | '.tiff', 24 | '.ico', 25 | '.pcx', 26 | '.ppm', 27 | '.pgm', 28 | '.pbm', 29 | '.pnm', 30 | '.webp', 31 | '.hdr', 32 | '.dds', 33 | '.im', 34 | '.eps', 35 | '.svg' 36 | ) 37 | 38 | 39 | def colorize_figures( 40 | word_doc: _Document, 41 | temp_dir: pathlib.Path, 42 | colorization_handler: ColorizationHandler, 43 | ) -> _Document: 44 | r""" Colorizes figures in word document. It does so by first creating a 45 | temporary word document, then extracting all images from the document, 46 | colorizing them, and finally overwriting the images in the temporary 47 | document. The temporary document is then loaded into memory, destroyed 48 | on disk and returned. 49 | 50 | @param word_doc: word document to colorize 51 | @param temp_dir: directory to use for storing temporary files 52 | @param colorization_handler: colorization handler; here, this is only used 53 | to keep track of annotation sources. 
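# --- Editor's usage sketch (not part of the original file): one plausible way to
# drive colorize_word_doc() defined above. The config path, the input document and
# the no-argument ColorizationHandler() constructor are assumptions for illustration.
import pathlib
from docx import Document
from src.annotation.colorization import ColorizationHandler
from src.annotation.colorization.colorize_doc import colorize_word_doc
from src.annotation.config import load_config

config = load_config(pathlib.Path("configs/default_config.yaml"))
handler = ColorizationHandler()          # assumed default constructor
word_doc = Document("example.docx")      # hypothetical input .docx
colorized = colorize_word_doc(
    word_doc=word_doc,
    colorization_handler=handler,
    config=config,
    temp_dir=pathlib.Path("/tmp"),
)
colorized.save("example_colorized.docx")
# The colorization decisions accumulated in `handler` can afterwards feed the
# annotation quality score computed in annotation_quality.py.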
54 | 55 | 56 | @return: colorized word document instance 57 | """ 58 | 59 | # create temporary file 60 | temp_doc_fp = tempfile.NamedTemporaryFile( 61 | mode="w+b", suffix=".docx", dir=temp_dir 62 | ) 63 | 64 | # save doc to temp file 65 | word_doc.save(path_or_stream=temp_doc_fp) 66 | 67 | # we raise an error if something has gone wrong and the document is not a 68 | # valid zip file 69 | if not zipfile.is_zipfile(temp_doc_fp): 70 | raise ValueError(f"document is not a valid zip file") 71 | 72 | # convert hsv to rgb 73 | rgb_color = tuple(hsv_to_rgb(hsv_color=settings.colors.COLOR_FIGURES)) 74 | 75 | # extract image files, overwrite them with images color 76 | with UpdateableZipFile(temp_doc_fp, "a") as archive: 77 | for fp in archive.namelist(): 78 | if ( 79 | not fp.startswith("word/media") or 80 | not fp.lower().endswith(IMG_EXT) 81 | ): 82 | continue 83 | 84 | # extract image to temp dir 85 | img_bytes = archive.read(fp) 86 | 87 | # read and overwrite image 88 | try: 89 | img = Image.open(io.BytesIO(img_bytes)) 90 | except Exception as e: 91 | print(f"[WARNING] reading image {fp} " 92 | f"failed with {e.__class__.__name__}: {e}") 93 | continue 94 | img = Image.new("RGB", img.size) 95 | img.putdata([rgb_color] * np.prod(img.size)) 96 | 97 | _, ext = os.path.splitext(fp) 98 | ext = ext.lower().strip(".") 99 | ext = 'jpeg' if ext == 'jpg' else ext 100 | with io.BytesIO() as temp_img: 101 | try: 102 | img.save(temp_img, format=ext) 103 | except IOError: 104 | continue 105 | except Exception as e: 106 | # could not write file, skip 107 | print(f"unknown exception while writing image {fp};\n{e}") 108 | continue 109 | 110 | temp_img.seek(0) 111 | archive.write(temp_img, fp) 112 | 113 | # add annotation source 114 | colorization_handler.update_colorization_decisions( 115 | text=None, 116 | decision_source=settings.annotation.ANNOTATION_BUILTIN, 117 | entity_decision=settings.entities.ENTITY_FIGURE_ID 118 | ) 119 | 120 | word_doc = Document(temp_doc_fp.name) 121 | temp_doc_fp.close() 122 | 123 | return word_doc 124 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/header_footer.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as _Document 2 | 3 | from src.annotation.colorization import ColorizationHandler 4 | from src.annotation.colorization.entities.tables import colorize_table 5 | 6 | import settings 7 | 8 | 9 | def colorize_header_and_footer( 10 | document: _Document, colorization_handler: ColorizationHandler 11 | ): 12 | r""" Colorize header and footer of the document. 13 | 14 | @param document: the document to colorize 15 | @param colorization_handler: the colorization handler to use for 16 | colorization of the header and footer 17 | """ 18 | # colorize document header 19 | header_name = settings.colors.get_entity_name(settings.colors.COLOR_HEADER) 20 | _colorize( 21 | document, entity_name=header_name, 22 | colorization_handler=colorization_handler 23 | ) 24 | 25 | # colorize document footer 26 | footer_name = settings.colors.get_entity_name(settings.colors.COLOR_FOOTER) 27 | _colorize( 28 | document, entity_name=footer_name, 29 | colorization_handler=colorization_handler 30 | ) 31 | 32 | 33 | def _colorize( 34 | document: _Document, 35 | entity_name: str, 36 | colorization_handler: ColorizationHandler 37 | ): 38 | r""" Colorize header or footer of the document. 
39 | 40 | @param document: the document to colorize 41 | @param entity_name: either "header" or "footer" 42 | @param colorization_handler: the colorization handler to use for 43 | colorization of the header and footer 44 | """ 45 | assert entity_name in [ 46 | settings.entities.ENTITY_HEADER_NAME, 47 | settings.entities.ENTITY_FOOTER_NAME 48 | ] 49 | 50 | color = settings.colors.ENTITY_NAME_TO_COLOR[entity_name] 51 | 52 | for section in document.sections: 53 | header_or_footer_obj = getattr(section, entity_name) 54 | 55 | # skip if obj is linked to previous section 56 | if header_or_footer_obj.is_linked_to_previous: 57 | continue 58 | 59 | # colorize paragraphs 60 | for par in header_or_footer_obj.paragraphs: 61 | if len(par.text) == 0: 62 | continue 63 | 64 | colorization_handler.assign_par_color( 65 | par=par, 66 | base_color=color, 67 | decision_source=settings.annotation.ANNOTATION_BUILTIN 68 | ) 69 | 70 | # colorize tables as footer / header 71 | for table in header_or_footer_obj.tables: 72 | colorize_table( 73 | table=table, 74 | base_color_table=color, 75 | base_color_table_header=color, 76 | colorization_handler=colorization_handler, 77 | sat_val_step=0 78 | ) 79 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/paragraph.py: -------------------------------------------------------------------------------- 1 | from docx.oxml.xmlchemy import serialize_for_reading 2 | from docx.text.paragraph import Paragraph 3 | 4 | import settings 5 | from src.annotation.builtin_styles import BUILTIN_STYLES 6 | from src.annotation.colorization import ColorizationHandler 7 | from src.annotation.colorization import ParagraphHeuristic 8 | from src.annotation.colorization.mappings import MAP_BUILTIN_TO_ENTITY_COLOR 9 | from src.annotation.utils.color_utils import check_if_par_is_numbered 10 | 11 | 12 | def colorize_paragraph( 13 | paragraph: Paragraph, 14 | colorization_handler: ColorizationHandler, 15 | paragraph_heuristics: ParagraphHeuristic 16 | ): 17 | r""" Colorize a paragraph. This function relies primarily on builtin styles 18 | to identify which category a paragraph belongs to. If no builtin style is 19 | found, we fall back to heuristics. 20 | 21 | @param paragraph: the paragraph to colorize 22 | @param colorization_handler: the colorization handler 23 | @param paragraph_heuristics: the paragraph heuristics 24 | """ 25 | # skip paragraph if it has no style associated 26 | if paragraph.style is None: 27 | return 28 | 29 | # skip paragraph if it is empty 30 | par_style = paragraph.style.name.lower() 31 | par_text = "".join(s for s in paragraph.text if s not in ["\n", "\t"]) 32 | if len(par_text) == 0 and "toc" not in par_style: 33 | return 34 | 35 | # if no built-in style, we can try to fall back to heuristics 36 | if par_style not in BUILTIN_STYLES: 37 | colorization_handler.assign_par_color_considering_runs( 38 | paragraph, paragraph_heuristics, 39 | original_was_builtin=False, 40 | original_builtin_entity_id=settings.entities.ENTITY_TEXT_ID 41 | ) 42 | return 43 | 44 | # check the builtin --> entity mapping 45 | entity_color_found_for_builtin = None 46 | for possible_start in MAP_BUILTIN_TO_ENTITY_COLOR: 47 | if par_style.startswith(possible_start): 48 | entity_color_found_for_builtin = \ 49 | MAP_BUILTIN_TO_ENTITY_COLOR[possible_start] 50 | 51 | # ! some entity types we want to deal with specially 52 | # ! 
this may include run-checking or detecting other entity signals 53 | if entity_color_found_for_builtin == settings.colors.COLOR_TEXT: 54 | attributes = set( 55 | paragraph._p.xml._attr_seq(serialize_for_reading(paragraph._p)) 56 | ) 57 | 58 | if "" in attributes or "" in attributes: 59 | colorization_handler.assign_par_color( 60 | par=paragraph, 61 | base_color=settings.colors.COLOR_EQUATION, 62 | decision_source=settings.annotation.ANNOTATION_XML_PATTERN 63 | ) 64 | elif check_if_par_is_numbered(paragraph): 65 | colorization_handler.assign_par_color( 66 | par=paragraph, 67 | base_color=settings.colors.COLOR_LIST, 68 | decision_source=settings.annotation.ANNOTATION_XML_PATTERN 69 | ) 70 | else: 71 | colorization_handler.assign_par_color_considering_runs( 72 | par=paragraph, 73 | para_heuristics=paragraph_heuristics, 74 | original_was_builtin=True, 75 | original_builtin_entity_id=settings.entities.ENTITY_TEXT_ID 76 | ) 77 | 78 | elif entity_color_found_for_builtin is not None: 79 | colorization_handler.assign_par_color( 80 | par=paragraph, 81 | base_color=entity_color_found_for_builtin, 82 | decision_source=settings.annotation.ANNOTATION_BUILTIN 83 | ) 84 | 85 | else: 86 | print(f"unrecognized style {par_style}") 87 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/tables/__init__.py: -------------------------------------------------------------------------------- 1 | from . import styles 2 | from .table_colorization_handler import TableColorizationHandler 3 | from .colorize_table import colorize_table 4 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/tables/colorize_table.py: -------------------------------------------------------------------------------- 1 | from docx.table import Table 2 | 3 | import settings 4 | from src.annotation.colorization import ColorizationHandler 5 | from src.annotation.colorization.entities.tables import \ 6 | TableColorizationHandler 7 | 8 | 9 | def colorize_table( 10 | table: Table, 11 | colorization_handler: ColorizationHandler = None, 12 | base_color_table=settings.colors.COLOR_TABLE, 13 | base_color_table_header=settings.colors.COLOR_TABLE_HEADER, 14 | sat_val_step=settings.colors.SAT_VAL_STEP, 15 | ): 16 | ct_tbl_ref_style = getattr(table.style, "_element", None) 17 | 18 | # record table in the colorization decisions 19 | colorization_handler.update_colorization_decisions( 20 | text=None, 21 | decision_source=settings.annotation.ANNOTATION_BUILTIN, 22 | entity_decision=settings.entities.ENTITY_TABLE_ID 23 | ) 24 | 25 | # initialize table colorization handler 26 | tbl_col_handler = TableColorizationHandler( 27 | ct_tbl=table._tbl, ct_tbl_ref_style=ct_tbl_ref_style, 28 | colorization_handler=colorization_handler, 29 | base_color_table=base_color_table, 30 | base_color_header=base_color_table_header, 31 | sat_val_step=sat_val_step 32 | ) 33 | 34 | # colorize table 35 | tbl_col_handler.colorize_table() 36 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/entities/text_box.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as _Document 2 | from docx.text.paragraph import Paragraph 3 | from typing import Tuple 4 | 5 | from src.annotation.colorization import ColorizationHandler 6 | 7 | import settings 8 | 9 | 10 | def colorize_text_boxes( 11 | document: _Document, 12 | 
hsv_color: Tuple[int, int, int], 13 | colorization_handler: ColorizationHandler 14 | ): 15 | r""" 16 | Colorize all text boxes in the document. 17 | Currently, the basic assumption is that any text box near to a table 18 | or figure should be viewed as a caption; the only default behavior 19 | of word which creates text-boxes is when inserting captions. 20 | 21 | @param document: the document to colorize 22 | @param hsv_color: the color to use for text boxes in hsv color space 23 | @param colorization_handler: global tracker for colorization information 24 | """ 25 | text_box_elements = document.element.body.xpath(".//w:txbxContent//w:p") 26 | for par_xml in text_box_elements: 27 | colorization_handler.assign_par_color( 28 | par=Paragraph(par_xml, document), 29 | base_color=hsv_color, 30 | decision_source=settings.annotation.ANNOTATION_XML_PATTERN 31 | ) 32 | -------------------------------------------------------------------------------- /app/src/annotation/colorization/heuristics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/colorization/heuristics/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/colorization/mappings.py: -------------------------------------------------------------------------------- 1 | r""" 2 | In order to nicely support headers, the heuristic mapping uses ints as values: 3 | 1 through 9 for known headers 4 | -1 for unknown 5 | special values for various builtins / other special properties 6 | 7 | Here is also provided a mapping from builtins to constants that 8 | work with the run-level mapping strategy. 9 | """ 10 | 11 | import settings 12 | 13 | HEURISTIC_LEVEL_BODY = -10 14 | HEURISTIC_LEVEL_TITLE = -20 15 | HEURISTIC_LEVEL_LIST = -30 16 | 17 | HEURISTIC_FONT_UNKNOWN = -1.0 18 | 19 | CONSIDER_RUN_COLORING_FOR = [settings.colors.COLOR_TEXT] 20 | 21 | # if style starts with one of the following names, it 22 | # should map to that color 23 | MAP_BUILTIN_TO_ENTITY_COLOR = { 24 | # BODY 25 | "body": settings.colors.COLOR_TEXT, 26 | "normal": settings.colors.COLOR_TEXT, 27 | "plain text": settings.colors.COLOR_TEXT, 28 | "no spacing": settings.colors.COLOR_TEXT, 29 | "default": settings.colors.COLOR_TEXT, 30 | 31 | # TITLE 32 | "title": settings.colors.COLOR_DOCUMENT_TITLE, 33 | 34 | # HEADINGS 35 | "heading 1": settings.colors.COLOR_SECTION_HEADING_1, 36 | "heading 2": settings.colors.COLOR_SECTION_HEADING_2, 37 | "heading 3": settings.colors.COLOR_SECTION_HEADING_3, 38 | "heading 4": settings.colors.COLOR_SECTION_HEADING_4, 39 | "heading 5": settings.colors.COLOR_SECTION_HEADING_5, 40 | "heading 6": settings.colors.COLOR_SECTION_HEADING_6, 41 | "heading 7": settings.colors.COLOR_SECTION_HEADING_7, 42 | "heading 8": settings.colors.COLOR_SECTION_HEADING_8, 43 | "heading 9": settings.colors.COLOR_SECTION_HEADING_9, 44 | 45 | # HEADERS AND FOOTERS 46 | "header": settings.colors.COLOR_HEADER, 47 | "footer": settings.colors.COLOR_FOOTER, 48 | 49 | # LIST 50 | "list": settings.colors.COLOR_LIST, 51 | 52 | # TOC 53 | "toc": settings.colors.COLOR_TOC, 54 | 55 | # BIBLIOGRAPHY 56 | "bibliography": settings.colors.COLOR_BIBLIOGRAPHY, 57 | 58 | # QUOTE 59 | "quote": settings.colors.COLOR_QUOTE, 60 | "intense quote": settings.colors.COLOR_QUOTE, 61 | 62 | # CAPTIONS 63 | "caption": settings.colors.COLOR_TABLE_CAPTIONS, 64 | 65 | # FOOTNOTES 66 | "footnote": 
settings.colors.COLOR_FOOTNOTE, 67 | 68 | # ANNOTATION 69 | "annotation": settings.colors.COLOR_ANNOTATION, 70 | } 71 | -------------------------------------------------------------------------------- /app/src/annotation/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import pathlib 3 | import yaml 4 | 5 | 6 | @dataclass 7 | class AnnotationConfig: 8 | # image config 9 | image_format: str 10 | image_height: int 11 | image_width: int 12 | image_dpi: int 13 | 14 | # decompression bomb checks 15 | max_decompress_ratio: float 16 | max_image_pixels: int 17 | 18 | # documents 19 | max_doc_bytes: int 20 | max_doc_pages: int 21 | 22 | # time limits 23 | annotation_timeout_secs: int 24 | annotation_cleanup_secs: int 25 | 26 | # data org 27 | max_bytes_in_shard: int 28 | 29 | # language 30 | top_k_languages: int 31 | 32 | # libreoffice 33 | unoserver_start_timeout: int 34 | unoconvert_timeout: int 35 | soffice_launch_timeout: int 36 | soffice_launch_ping_interval: float 37 | 38 | # entity detection 39 | max_heading_len: int 40 | form_field_min_length: int 41 | 42 | # entity relations 43 | bbox_relation_overlap_threshold: float 44 | bbox_relation_scale_threshold: float 45 | bbox_relation_closeness_threshold: float 46 | word_2_entity_overlap_threshold: float 47 | 48 | # annotation config 49 | min_text_chars: int 50 | 51 | 52 | def load_config(fp: pathlib.Path) -> AnnotationConfig: 53 | with fp.open(mode='r') as f: 54 | data = yaml.safe_load(f) 55 | 56 | kwargs = {} 57 | for d in data.values(): 58 | kwargs.update({k.lower(): v for k, v in d.items()}) 59 | 60 | return AnnotationConfig(**kwargs) 61 | -------------------------------------------------------------------------------- /app/src/annotation/entity_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .entity_detector import EntityDetector 2 | from .detection import detect_entities_in_document 3 | -------------------------------------------------------------------------------- /app/src/annotation/entity_detection/detection.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os 4 | from typing import List, Dict, Tuple, Union 5 | import pathlib 6 | 7 | from src.annotation.annotation_objects import Entity 8 | from src.annotation.colorization import ColorizationHandler 9 | from src.annotation.entity_detection import EntityDetector 10 | from src.annotation.utils.identifiers import get_page_id 11 | from src.annotation.utils.pdf_utils import pdf_to_page_images_iterator 12 | 13 | 14 | def detect_entities_in_document( 15 | doc_id: str, 16 | temp_pdf_fp: Union[str, pathlib.Path], 17 | colorization_handler: ColorizationHandler, 18 | debug_dir: Union[str, pathlib.Path] = None, 19 | word_doc_fp: Union[str, pathlib.Path] = None, 20 | dpi: int = 100, 21 | size: Tuple[Union[int, None], Union[int, None]] = (None, None) 22 | ) -> Dict[str, Dict[int, List[Entity]]]: 23 | r"""Detect entities in a document. 
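# --- Editor's sketch (not part of the original file): load_config() above merges
# every top-level section of the YAML into a single kwargs dict with lowercased
# keys before constructing AnnotationConfig. The two-section YAML below is a
# hypothetical fragment that only illustrates the key handling; a real config
# must supply every AnnotationConfig field.
import yaml

_data = yaml.safe_load("""
image:
  IMAGE_FORMAT: png
  IMAGE_DPI: 100
documents:
  MAX_DOC_PAGES: 20
""")
_kwargs = {}
for _section in _data.values():
    _kwargs.update({k.lower(): v for k, v in _section.items()})
assert _kwargs == {"image_format": "png", "image_dpi": 100, "max_doc_pages": 20}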
24 | 25 | @param doc_id: id of the document 26 | @param temp_pdf_fp: path to pdf file 27 | @param colorization_handler: colorization handler containts the colors used 28 | for colorization 29 | @param debug_dir: path to save colorized image pages for debugging 30 | @param word_doc_fp: path to word document; this is only used for debugging 31 | @param dpi: resolution of the output image(s) 32 | @param size: size of the output image(s), uses the Pillow (width, height) 33 | standard. If one of width or height is set to None, the image 34 | aspect ratio is preserved. 35 | 36 | @return: Dict with page number as key and as value a dictionary with 37 | entity_category_id as key and list of entity objects for detected 38 | entities as value 39 | """ 40 | pages_entities = {} 41 | page_number = 1 # page number starts at 1 42 | 43 | # extract pages from pdf as images 44 | # ! important: output format needs to use lossless compression when 45 | # ! converting the colorized pdf to images. Otherwise, the entity 46 | # ! detection will be inaccurate. ALWAYS USE fmt="png"! 47 | for pages_block in pdf_to_page_images_iterator( 48 | pdf_fp=temp_pdf_fp, 49 | fmt="png", 50 | size=size, 51 | dpi=dpi, 52 | output_folder=None 53 | ): 54 | for page in pages_block: 55 | # convert to cv2 format with HSV color space 56 | page = np.array(page).astype(np.uint8) 57 | page_cv2 = cv2.cvtColor(page, cv2.COLOR_RGB2HSV) 58 | 59 | if debug_dir is not None: 60 | fn_root = os.path.splitext(os.path.split(word_doc_fp)[-1])[0] 61 | debug_save_as = os.path.join( 62 | debug_dir, f"colorized_{fn_root}_p{page_number}.png" 63 | ) 64 | cv2.imwrite( 65 | debug_save_as, cv2.cvtColor(page, cv2.COLOR_RGB2BGR) 66 | ) 67 | 68 | # detect entities in page: this function returns a dictionary with 69 | # entity_category as key and list of bounding boxes for detected 70 | # entities as value 71 | page_id = get_page_id(doc_id, page_number) 72 | entities = _detect_entities_on_page( 73 | doc_id=doc_id, 74 | page_id=page_id, 75 | page_num=page_number, 76 | page_image=page_cv2, 77 | colorization_handler=colorization_handler 78 | ) 79 | pages_entities[page_id] = entities 80 | page_number += 1 81 | 82 | return pages_entities 83 | 84 | 85 | def _detect_entities_on_page( 86 | doc_id: str, 87 | page_id: str, 88 | page_num: int, 89 | page_image: np.ndarray, 90 | colorization_handler: ColorizationHandler 91 | ) -> Dict[int, List[Entity]]: 92 | r"""Detect entities in a page. 93 | 94 | @param page_image: page to detect entities in; ! 
this needs to be a cv2 95 | image in HSV color space 96 | @param colorization_handler: colorization handler containts the colors used 97 | for colorization 98 | 99 | @return: Dictionary with entity_category_id as key and list of entity 100 | objects for detected entities as value 101 | """ 102 | entity_detector = EntityDetector( 103 | doc_id=doc_id, page_id=page_id, page_num=page_num, 104 | image_numpy=page_image, colorization_handler=colorization_handler 105 | ) 106 | return entity_detector.detect_entities() 107 | -------------------------------------------------------------------------------- /app/src/annotation/entity_detection/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /app/src/annotation/language_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import predict_lang_per_page 2 | from .inference import predict_lang 3 | -------------------------------------------------------------------------------- /app/src/annotation/language_detection/inference.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | import fasttext 3 | 4 | from src.annotation.annotation_objects import Word 5 | import src.annotation.language_detection.utils as lang_utils 6 | 7 | 8 | def predict_lang_per_page( 9 | pages_words: Dict[str, List[Word]], 10 | k: int, 11 | lm: fasttext.FastText._FastText = None 12 | ) -> Dict[str, Dict[str, float]]: 13 | r""" Detects top-k languages occuring in text using the fasttext model 14 | trained on trained on data from Wikipedia, Tatoeba and SETimes. 15 | 16 | Reference: https://fasttext.cc/docs/en/language-identification.html 17 | 18 | @param pages_words: dictionary mapping page ids to list of words 19 | @param k: number of predictions to return, defaults to 5 20 | @param lm: language model, defaults to None, in which case it is loaded in 21 | the function 22 | 23 | @return: dictionary mapping page ids to list of predicted languages and 24 | list of corresponding confidence scores 25 | """ 26 | if lm is None: 27 | lm = lang_utils.load_lang_model(version="ftz") 28 | 29 | pages_langs = {} 30 | for page_id, page_words in pages_words.items(): 31 | page_text = " ".join([word.text for word in page_words]) 32 | pages_langs[page_id] = predict_lang(page_text, k=k, lm=lm) 33 | 34 | return pages_langs 35 | 36 | 37 | def _clean_text(text: str) -> str: 38 | return " ".join(text.strip().lower().split()) 39 | 40 | 41 | def predict_lang(text: str, k: int, lm=None) -> Dict[str, float]: 42 | # clean text 43 | text = _clean_text(text) 44 | 45 | if len(text) == 0: 46 | return {"__label__unknown": 1.0} 47 | 48 | if lm is None: 49 | lm = lang_utils.load_lang_model(version="ftz") 50 | 51 | # predict language 52 | tags, confs = lm.predict(text, k=k) 53 | 54 | # convert predictions to dictionary 55 | langs: Dict[str, float] = { 56 | lang: float(conf) for lang, conf in zip(tags, confs) 57 | } 58 | 59 | return langs 60 | -------------------------------------------------------------------------------- /app/src/annotation/language_detection/utils.py: -------------------------------------------------------------------------------- 1 | import iso639 2 | import fasttext 3 | from pathlib import Path 4 | from settings.filesystem import FASTTEXT_CLASSIFIERS_DIR 5 | 6 | # suppress fasttext warning 7 | fasttext.FastText.eprint = lambda x: 
None 8 | 9 | 10 | def lang_code_to_name(lang_code: str) -> str: 11 | r""" Convert language iso639 code to human readable language name. """ 12 | try: 13 | return iso639.to_name(lang_code) 14 | except iso639.NonExistentLanguageError: 15 | return "unknown" 16 | 17 | 18 | def load_lang_model(version: str = "bin") -> fasttext.FastText._FastText: 19 | r""" Load language model. """ 20 | if version.lower() == "bin": 21 | return fasttext.load_model( 22 | path=str(Path(FASTTEXT_CLASSIFIERS_DIR, "lid.176.bin")) 23 | ) 24 | elif version.lower() == "ftz": 25 | return fasttext.load_model( 26 | path=str(Path(FASTTEXT_CLASSIFIERS_DIR, "lid.176.ftz")) 27 | ) 28 | else: 29 | raise ValueError(f"Invalid fasttext model version {version}") 30 | -------------------------------------------------------------------------------- /app/src/annotation/oxml_metadata.py: -------------------------------------------------------------------------------- 1 | r""" 2 | Module to get metadata that can be acquired when viewing the XML of a word 3 | document 4 | """ 5 | 6 | from typing import List 7 | from docx.document import Document 8 | 9 | 10 | class OXMLMetadata: 11 | r""" 12 | Class for metadata not directly originating from annotation, but intrinsic 13 | to one document. 14 | """ 15 | languages_autocorrect: List[str] 16 | template_name: str 17 | 18 | # !!! IMPORTANT INFO (also has category, subject, title, status) 19 | # https://python-docx.readthedocs.io/en/latest/api/document.html#coreproperties-objects 20 | core_category: str 21 | core_comments: str 22 | core_content_status: str 23 | core_created: str 24 | core_identifier: str 25 | core_keywords: str 26 | core_last_printed: str 27 | core_modified: str 28 | core_subject: str 29 | core_title: str 30 | core_version: str 31 | 32 | 33 | def get_langs(doc: Document) -> List[str]: 34 | # get w:lang tags 35 | lang_tags = doc.element.body.xpath("//w:lang") 36 | lang_list = [] 37 | for tag in lang_tags: 38 | for k, v in tag.items(): 39 | lang_list.append(v) 40 | return list(set(lang_list)) 41 | 42 | 43 | def get_oxml_metadata(doc: Document) -> OXMLMetadata: 44 | data = OXMLMetadata() 45 | data.languages_autocorrect = get_langs(doc) 46 | 47 | core = doc.core_properties 48 | data.core_category = core.category 49 | data.core_comments = core.comments 50 | data.core_content_status = core.content_status 51 | data.core_created = core.created 52 | data.core_identifier = core.identifier 53 | data.core_keywords = core.keywords 54 | data.core_last_printed = core.last_printed 55 | data.core_modified = core.modified 56 | data.core_subject = core.subject 57 | data.core_title = core.title 58 | data.core_version = core.version 59 | 60 | return data 61 | -------------------------------------------------------------------------------- /app/src/annotation/postprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .postprocess import postprocess_words 2 | from .postprocess import postprocess_entities 3 | from .postprocess import postprocess_entities_content_based 4 | from .postprocess import postprocess_tables 5 | -------------------------------------------------------------------------------- /app/src/annotation/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/preprocessing/__init__.py -------------------------------------------------------------------------------- 
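# --- Editor's usage sketch (not part of the original file): running the fasttext
# language detector from language_detection/ shown above. Assumes lid.176.ftz has
# been downloaded into FASTTEXT_CLASSIFIERS_DIR (see resources/fasttext-models);
# the sample sentence is arbitrary.
from src.annotation.language_detection import predict_lang
from src.annotation.language_detection.utils import lang_code_to_name

langs = predict_lang("Dies ist ein kurzer Beispieltext.", k=2)
for tag, conf in langs.items():
    # fasttext returns tags of the form "__label__de"
    print(lang_code_to_name(tag.replace("__label__", "")), round(conf, 3))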
/app/src/annotation/preprocessing/highlighting.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as DocxDocument 2 | from docx.table import _Cell 3 | 4 | 5 | def sanitize_highlighting(word_doc: DocxDocument) -> DocxDocument: 6 | r"""Remove highlighting from a word document, as this interferes with our 7 | colorization based annotation process. 8 | 9 | @param word_doc: word document instance 10 | 11 | @return: sanitized word document instance 12 | """ 13 | for para in word_doc.paragraphs: 14 | # Iterate over all runs in the paragraph 15 | for run in para.runs: 16 | # Check if the run has highlighting 17 | if run.font.highlight_color is not None: 18 | # Remove the highlighting 19 | run.font.highlight_color = None 20 | 21 | # Iterate over all tables in the document 22 | for table in word_doc.tables: 23 | for row in table.rows: 24 | try: 25 | row_cells = row.cells 26 | except IndexError: 27 | row_cells = [_Cell(tc, table) for tc in row._tr.tc_lst] 28 | for cell in row_cells: 29 | for para in cell.paragraphs: 30 | for run in para.runs: 31 | # remove highlighting 32 | if run.font.highlight_color is not None: 33 | run.font.highlight_color = None 34 | 35 | return word_doc 36 | -------------------------------------------------------------------------------- /app/src/annotation/sanity_checks.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Dict, Tuple, Set 3 | 4 | from src.exceptions import * 5 | 6 | ASPECT_RATIO_TOL = 1e-2 7 | 8 | 9 | def pages_aspect_ratios( 10 | page_dims_pdf_parser: Dict[str, Tuple[int, int]], 11 | page_dims_renderings: Dict[str, Tuple[int, int]] 12 | ): 13 | r""" checks that the aspect ratios of the pages in the PDF file are 14 | consistent with the aspect ratios of the pages in the rendered page images. 15 | 16 | @param page_dims_pdf_parser: dictionary mapping page_ids to tuples 17 | containing the width and height of the page 18 | @param page_dims_renderings: dictionary mapping page_ids to tuples 19 | containing the width and height of the page 20 | 21 | @raises InconsistentAspectRatiosError: if the aspect ratios are not 22 | consistent 23 | """ 24 | for pg_key in page_dims_pdf_parser.keys(): 25 | # compute aspect ratios 26 | aspect_ratio_pdf = \ 27 | page_dims_pdf_parser[pg_key][0] / page_dims_pdf_parser[pg_key][1] 28 | aspect_ratio_renderings = \ 29 | page_dims_renderings[pg_key][0] / page_dims_renderings[pg_key][1] 30 | 31 | if not math.isclose( 32 | aspect_ratio_renderings, aspect_ratio_pdf, 33 | rel_tol=ASPECT_RATIO_TOL 34 | ): 35 | raise InconsistentAspectRatiosError( 36 | aspect_ratio_pdf, aspect_ratio_renderings 37 | ) 38 | 39 | 40 | def page_counts_consistency( 41 | pages_from_entity_detection: Set, 42 | pages_from_pdf_parser: Set 43 | ): 44 | r""" checks that the page numberings are consistent between the entity 45 | detection and the pdf parser. 
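# --- Editor's worked example (not part of the original file) for the check in
# pages_aspect_ratios() above: a US-Letter page reported as 612x792 pt by the PDF
# parser and rendered at 850x1100 px has the same aspect ratio, so the isclose()
# test with rel_tol=1e-2 passes. The page sizes are illustrative.
import math

assert math.isclose(612 / 792, 850 / 1100, rel_tol=1e-2)
# a mismatch (e.g. a rendering cropped to 850x1000 px) would trigger
# InconsistentAspectRatiosError
assert not math.isclose(612 / 792, 850 / 1000, rel_tol=1e-2)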
46 | 47 | @param pages_from_entity_detection: dictionary mapping page numbers to 48 | dictionaries mapping entity ids to lists of bounding boxes 49 | @param pages_from_pdf_parser: dictionary mapping page numbers to lists of 50 | words 51 | 52 | @raises InconsistentPageCountError: if the page numberings are inconsistent 53 | """ 54 | if pages_from_entity_detection != pages_from_pdf_parser: 55 | raise InconsistentPageCountError( 56 | expected=pages_from_entity_detection, 57 | actual=pages_from_pdf_parser 58 | ) 59 | -------------------------------------------------------------------------------- /app/src/annotation/soffice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/soffice/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/soffice/utils.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | import psutil 4 | from typing import Union 5 | 6 | __all__ = [ 7 | "get_soffice_process_on_port", 8 | "get_free_port" 9 | ] 10 | 11 | 12 | def get_soffice_process_on_port(port) -> Union[psutil.Process, None]: 13 | """ function returns the soffice process object on the given port or 14 | None if no process is running on the given port. 15 | """ 16 | for proc in psutil.process_iter(): 17 | try: 18 | name = proc.name() 19 | except ( 20 | psutil.NoSuchProcess, 21 | psutil.AccessDenied, 22 | psutil.ZombieProcess 23 | ): 24 | continue 25 | 26 | if not name.startswith("soffice"): 27 | continue 28 | 29 | try: 30 | connections = proc.connections() 31 | except ( 32 | psutil.NoSuchProcess, 33 | psutil.AccessDenied, 34 | psutil.ZombieProcess 35 | ): 36 | continue 37 | 38 | for conn in connections: 39 | if ( 40 | conn.status == psutil.CONN_LISTEN and 41 | conn.laddr.port == port 42 | ): 43 | return proc 44 | 45 | return None 46 | 47 | 48 | def get_free_port(): 49 | r""" function returns a free port on the current machine """ 50 | with socket.socket() as s: 51 | s.bind(("", 0)) 52 | return s.getsockname()[1] 53 | -------------------------------------------------------------------------------- /app/src/annotation/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/annotation/text/__init__.py -------------------------------------------------------------------------------- /app/src/annotation/text/text_entity_matching.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Dict, List 3 | 4 | from src.annotation.utils.bbox_utils import area_of_overlap 5 | from src.annotation.annotation_objects import Entity, Word, BoundingBox 6 | 7 | 8 | def assign_entities_to_words( 9 | pages_entities: Dict[str, Dict[int, List[Entity]]], 10 | pages_words: Dict[str, List[Word]], 11 | threshold: float 12 | ) -> Dict[str, List[Word]]: 13 | r""" Assigns entities to words based on on whether the word bounding 14 | box is overlapping with the entity bounding box by at least the threshold. 15 | 16 | @param pages_entities: dictionary with page_ids as keys and dictionary of 17 | entities as value. The dictionary of entities is indexed by entity 18 | category and contains a list of entity objects. 
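# --- Editor's usage sketch (not part of the original file) for the helpers in
# soffice/utils.py above: pick a free port and check whether a LibreOffice process
# is already listening on it. Purely illustrative; no server is started here.
from src.annotation.soffice.utils import get_free_port, get_soffice_process_on_port

port = get_free_port()
proc = get_soffice_process_on_port(port)
if proc is None:
    print(f"port {port} is free; a soffice/unoserver instance could be launched here")
else:
    print(f"soffice already listening on port {port} (pid {proc.pid})")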
19 | @param pages_words: dictionary with page_ids as keys and list of words as 20 | value. 21 | @param threshold: threshold for overlap between word and entity bounding 22 | boxes. 23 | 24 | @return: dictionary with page_ids as keys and list of words as value. 25 | """ 26 | for page_id in pages_words.keys(): 27 | words = pages_words[page_id] 28 | entities = pages_entities[page_id] 29 | for word in words: 30 | # find candidate entities for word 31 | candidate_entities = list(itertools.chain(*[ 32 | _find_candidate_entities( 33 | word=word, entities=entity_lst, threshold=threshold 34 | ) for entity_lst in entities.values() 35 | ])) 36 | 37 | # assign candidate entities to word 38 | for entity in candidate_entities: 39 | word.entity_ids.append(entity.id) 40 | word.entity_categories.append(entity.entity_category) 41 | 42 | return pages_words 43 | 44 | 45 | def _find_candidate_entities( 46 | word: Word, entities: List[Entity], threshold: float 47 | ) -> List[Entity]: 48 | r"""Finds candidate entities for a word based on whether the word bounding 49 | box is overlapping with the entity bounding box by at least threshold. 50 | 51 | @param word: word object to find candidate entities for 52 | @param entities: list of entity objects 53 | @param threshold: threshold for overlap 54 | 55 | @return: list of candidate entities 56 | """ 57 | return list(filter( 58 | lambda e: is_contained_in(word.bbox, e.bbox, threshold=threshold), 59 | entities 60 | )) 61 | 62 | 63 | def is_contained_in( 64 | bbox1: BoundingBox, bbox2: BoundingBox, threshold: float 65 | ) -> bool: 66 | r""" Checks whether bbox1 is contained in bbox2 by at least the threshold. 67 | 68 | @param bbox1: first bounding box 69 | @param bbox2: second bounding box 70 | @param threshold: threshold for overlap 71 | 72 | @return: True if the two bounding boxes are overlapping by at least the 73 | threshold, False otherwise 74 | """ 75 | if bbox1.area <= 0: 76 | return False 77 | 78 | overlap = area_of_overlap(bbox1, bbox2) 79 | ratio = overlap / bbox1.area 80 | 81 | return ratio >= threshold 82 | -------------------------------------------------------------------------------- /app/src/annotation/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .color_utils import hsv_to_rgb, hsv_to_bgr 2 | -------------------------------------------------------------------------------- /app/src/annotation/utils/bbox_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import math 3 | import numpy as np 4 | from typing import Tuple, List 5 | 6 | from src.annotation.annotation_objects import BoundingBox 7 | 8 | 9 | def area_of_overlap( 10 | bbox1: BoundingBox, 11 | bbox2: BoundingBox 12 | ) -> float: 13 | r"""calculates the area of overlap between two bounding boxes 14 | 15 | @param bbox1: tuple of floats (x, y, w, h) indicating top-left corner 16 | (x, y), height h, and width w of the first bounding box 17 | @param bbox2: tuple of floats (x, y, w, h) indicating top-left corner 18 | (x, y), height h, and width w of the second bounding box 19 | 20 | returns: a float indicating the area of intersection between the two 21 | bounding boxes 22 | """ 23 | x1, y1, w1, h1 = bbox1.box 24 | x2, y2, w2, h2 = bbox2.box 25 | 26 | # determine coordinates of intersection triangle 27 | x_left = max(x1, x2) 28 | x_right = min(x1 + w1, x2 + w2) 29 | y_top = max(y1, y2) 30 | y_bottom = min(y1 + h1, y2 + h2) 31 | 32 | if x_right < x_left or y_bottom < y_top: 33 | return 
0.0 34 | 35 | # The intersection of two axis-aligned bounding boxes is always an 36 | # axis-aligned bounding box 37 | intersection_area = (x_right - x_left) * (y_bottom - y_top) 38 | 39 | return intersection_area 40 | 41 | 42 | def euclidean_distance( 43 | bbox1: Tuple[float, float, float, float], 44 | bbox2: Tuple[float, float, float, float] 45 | ) -> float: 46 | r"""calculates the euclidean distance between two bounding boxes 47 | 48 | @param bbox1: tuple of floats (x, y, w, h) indicating top-left corner 49 | (x, y), height h, and width w of the first bounding box 50 | @param bbox2: tuple of floats (x, y, w, h) indicating top-left corner 51 | (x, y), height h, and width w of the second bounding box 52 | 53 | returns: a float indicating the euclidean distance between the two 54 | bounding boxes 55 | """ 56 | 57 | x1, y1, w1, h1 = bbox1 58 | x2, y2, w2, h2 = bbox2 59 | 60 | left = x1 + w1 < x2 61 | right = x2 + w2 < x1 62 | bottom = y2 + h2 < y1 63 | top = y1 + h1 < y2 64 | 65 | if top and left: 66 | return math.dist([x1 + w1, y1 + h1], [x2, y2]) 67 | elif left and bottom: 68 | return math.dist([x1 + w1, y1], [x2, y2 + h2]) 69 | elif bottom and right: 70 | return math.dist([x1, y1], [x2 + w2, y2 + h2]) 71 | elif right and top: 72 | return math.dist([x1, y1 + h1], [x2 + w2, y2]) 73 | elif left: 74 | return x2 - (x1 + w1) 75 | elif right: 76 | return x1 - (x2 + w2) 77 | elif bottom: 78 | return y1 - (y2 + h2) 79 | elif top: 80 | return y2 - (y1 + h1) 81 | else: # rectangles intersect 82 | return 0. 83 | 84 | 85 | def is_contained_in( 86 | bbox1: BoundingBox, 87 | bbox2: BoundingBox, 88 | ) -> bool: 89 | r"""determines whether bbox1 is contained in bbox2 90 | 91 | @param bbox1: tuple of floats (x, y, w, h) indicating top-left corner 92 | (x, y), height h, and width w of the first bounding box 93 | @param bbox2: tuple of floats (x, y, w, h) indicating top-left corner 94 | (x, y), height h, and width w of the second bounding box 95 | 96 | @return: True if bbox1 is contained in bbox2, False otherwise 97 | """ 98 | # determine the area of the first bounding box 99 | _, _, w1, h1 = bbox1.box 100 | area_bbox1 = w1 * h1 101 | 102 | intersection_area = area_of_overlap(bbox1, bbox2) 103 | 104 | # bbox1 is contained in bbox2 if the area of intersection is equal to the 105 | # area of bbox 1 106 | return math.isclose(intersection_area, area_bbox1) 107 | 108 | 109 | def detect_contours( 110 | image: np.array, 111 | lowerb: Tuple[int, int, int], 112 | upperb: Tuple[int, int, int] 113 | ) -> Tuple[List, List]: 114 | r""" utility function: detects contours in the image for values that fall 115 | in the range lowerb, upperb. 
116 | 117 | @param image: image where contours are detected 118 | @param lowerb: lower bound of the range 119 | @param upperb: upper bound of the range 120 | 121 | @return: a tuple of two lists: the first list contains the contours, the 122 | second list contains the hierarchy 123 | """ 124 | # create mask where values are in range 125 | mask = cv2.inRange(image, lowerb=lowerb, upperb=upperb) 126 | 127 | # get contours in mask 128 | contours, hierarchy = cv2.findContours( 129 | mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE 130 | ) 131 | 132 | return contours, hierarchy 133 | -------------------------------------------------------------------------------- /app/src/annotation/utils/color_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from docx.document import Document as DocxDocument 4 | from docx.oxml import OxmlElement 5 | from docx.oxml.ns import qn 6 | from docx.text.paragraph import Paragraph 7 | 8 | from typing import Tuple 9 | 10 | 11 | def rgb_to_hex(rgb_color: Tuple[int, int, int]) -> str: 12 | r"""convert rgb colors to hex 13 | 14 | @param rgb_color: a tuple of 3 values (r, g, b) 15 | 16 | @return: a hex string 17 | """ 18 | rgb_color = tuple(int(c) % 256 for c in np.squeeze(rgb_color)) 19 | 20 | if len(rgb_color) != 3: 21 | raise ValueError( 22 | "rgb color must consist of 3 positive numbers! " 23 | "got {}".format(len(rgb_color)) 24 | ) 25 | 26 | return "#%02x%02x%02x" % rgb_color 27 | 28 | 29 | def hsv_to_rgb(hsv_color: Tuple[int, int, int]) -> Tuple[int, int, int]: 30 | r"""convert hsv colors to rgb 31 | 32 | @param hsv_color: a tuple of 3 values (h, s, v) 33 | 34 | @return: a tuple of 3 values (r, g, b) 35 | """ 36 | hsv_color_uint8 = np.uint8(hsv_color) 37 | 38 | # TODO: remove this ambiguity: in the future only a tuple of 3 values is 39 | # accepted 40 | if len(hsv_color_uint8.shape) == 1: 41 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=[0, 1]) 42 | elif len(hsv_color_uint8.shape) == 2: 43 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=0) 44 | else: 45 | raise ValueError( 46 | "! Warning: hsv color has shape {}; this function " 47 | "excpects hsv_color to be a tuple of 3 values".format( 48 | hsv_color_uint8.shape) 49 | ) 50 | 51 | return tuple( 52 | cv2.cvtColor(hsv_color_uint8, cv2.COLOR_HSV2RGB) 53 | .squeeze() 54 | .astype(int) 55 | .tolist() 56 | ) 57 | 58 | 59 | def hsv_to_bgr(hsv_color: Tuple[int, int, int]) -> Tuple[int, int, int]: 60 | r"""convert hsv colors to bgr 61 | 62 | @param hsv_color: a tuple of 3 values (h, s, v) 63 | 64 | @return: a tuple of 3 values (r, g, b) 65 | """ 66 | hsv_color_uint8 = np.uint8(hsv_color) 67 | 68 | # TODO: remove this ambiguity: in the future only a tuple of 3 values is 69 | # accepted 70 | if len(hsv_color_uint8.shape) == 1: 71 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=[0, 1]) 72 | elif len(hsv_color_uint8.shape) == 2: 73 | hsv_color_uint8 = np.expand_dims(hsv_color_uint8, axis=0) 74 | else: 75 | raise ValueError( 76 | "! 
Warning: hsv color has shape {}; this function " 77 | "excpects hsv_color to be a tuple of 3 values".format( 78 | hsv_color_uint8.shape) 79 | ) 80 | 81 | return tuple( 82 | cv2.cvtColor(hsv_color_uint8, cv2.COLOR_HSV2BGR) 83 | .squeeze() 84 | .astype(int) 85 | .tolist() 86 | ) 87 | 88 | 89 | def sanitize_figure_settings(document: DocxDocument): 90 | r""" 91 | Removing all child entries of the `a:blip xml` element 92 | ensures that all figures are loaded as-is with no rendering mods, 93 | enabling our figure-detection method to work 94 | 95 | @param document: the document to sanitize 96 | """ 97 | fig_blip_elements = document.element.body.xpath(".//pic:blipFill//a:blip") 98 | # delete the child elements of this 99 | for blip_wrapper in fig_blip_elements: 100 | for img_mod_child in blip_wrapper.getchildren(): 101 | blip_wrapper.remove(img_mod_child) 102 | 103 | 104 | def shade_element(prop, color_hex): 105 | r""" Apply shading to an element """ 106 | color_hex = color_hex.replace('#', '').upper() 107 | shd = OxmlElement("w:shd") 108 | shd.set(qn("w:fill"), color_hex) 109 | prop.append(shd) 110 | 111 | 112 | def check_if_par_is_numbered(par: Paragraph) -> bool: 113 | r""" 114 | Check if a par is numbered, which we assume to indicate a list. 115 | 116 | @param par: the paragraph to check 117 | 118 | @return: True if the paragraph is numbered, False otherwise 119 | """ 120 | 121 | # a list style (even within a normal paragraph!) means numbering has 122 | # occured. 123 | par_xml_numbering = par._p.xpath(".//w:pPr//w:numPr") 124 | 125 | if len(par_xml_numbering) > 0: 126 | return True 127 | 128 | return False 129 | -------------------------------------------------------------------------------- /app/src/annotation/utils/docx_utils.py: -------------------------------------------------------------------------------- 1 | from docx.document import Document as DocxDocument 2 | from lxml import etree 3 | from typing import Union 4 | 5 | from src.exceptions import UnknownPageCountException 6 | 7 | 8 | def get_page_count(doc: DocxDocument) -> Union[int, None]: 9 | r""" Get page count from docx file. 10 | 11 | @param doc: docx document 12 | 13 | @return: page count or None if not found 14 | """ 15 | for part in doc._part.package.iter_parts(): 16 | if part.partname.endswith("app.xml"): 17 | app_etree = etree.fromstring(part._blob) 18 | break 19 | else: 20 | raise UnknownPageCountException("app.xml not found") 21 | 22 | # get pages from app.xml 23 | for child in app_etree: 24 | if child.tag.endswith("Pages"): 25 | if child.text is None: 26 | break 27 | pages = int(child.text) 28 | return pages 29 | 30 | raise UnknownPageCountException("`Pages` tag not found") 31 | -------------------------------------------------------------------------------- /app/src/annotation/utils/identifiers.py: -------------------------------------------------------------------------------- 1 | def get_page_id(doc_id: str, page_number: int) -> str: 2 | """ Generate a page id. """ 3 | return f"{doc_id}_p{page_number:05d}" 4 | 5 | 6 | def get_page_num_from_page_id(page_id: str) -> int: 7 | """ Extract the page number from a page id. """ 8 | return int(page_id.split("_p")[-1]) 9 | 10 | 11 | def get_doc_id(cc_dump_id: str, doc_number: int) -> str: 12 | """ Generate a document id. 
""" 13 | return f"doc_{cc_dump_id}_{doc_number:08d}" 14 | -------------------------------------------------------------------------------- /app/src/annotation/utils/pdf_utils.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import cv2 4 | import numpy as np 5 | import os 6 | from pdf2image import pdfinfo_from_path, convert_from_path 7 | from typing import Union, Tuple, Dict 8 | 9 | from src.annotation.utils.identifiers import get_page_id 10 | 11 | PDF2IMG_BLOCKSIZE = 4 12 | 13 | __all__ = [ 14 | "get_page_count_from_pdf", 15 | "pdf_to_page_images_iterator", 16 | "extract_page_images_and_dimensions_from_pdf" 17 | ] 18 | 19 | 20 | def get_page_count_from_pdf(pdf_fp: pathlib.Path) -> int: 21 | r"""Get number of pages from pdf file. 22 | 23 | @param pdf_fp: path to pdf file 24 | 25 | @return: number of pages 26 | """ 27 | pdf_info = pdfinfo_from_path(pdf_fp, userpw=None, poppler_path=None) 28 | return int(pdf_info["Pages"]) 29 | 30 | 31 | def pdf_to_page_images_iterator( 32 | pdf_fp: str, 33 | fmt: str, 34 | dpi: int, 35 | size: Tuple[Union[int, None], Union[int, None]], 36 | output_folder: Union[str, None] 37 | ): 38 | r"""Iterate over pages of a pdf file. The function creates pages in batches 39 | of `block_size` pages. This is to avoid memory issues when converting 40 | large pdf files. 41 | 42 | @param pdf_fp: path to pdf file 43 | @param fmt: output format; this should be a lossless format when the 44 | function is used for entity detection. 45 | @param dpi: resolution of the output image(s) 46 | @param size: size of the output image(s), uses the Pillow (width, height) 47 | standard. If one of width or height is set to None, the image 48 | aspect ratio is preserved. 49 | @param output_folder: path to output folder 50 | 51 | @return: iterator over pages 52 | """ 53 | pdf_info = pdfinfo_from_path(pdf_fp, userpw=None, poppler_path=None) 54 | num_pages = pdf_info["Pages"] 55 | for page in range(1, num_pages + 1, PDF2IMG_BLOCKSIZE): 56 | # ! important: output format needs to use lossless compression when 57 | # ! converting the colorized pdf to images. Otherwise, the entity 58 | # ! detection will be inaccurate. 59 | yield convert_from_path( 60 | pdf_path=pdf_fp, 61 | size=size, 62 | dpi=dpi, 63 | first_page=page, 64 | thread_count=4, 65 | last_page=min(page + PDF2IMG_BLOCKSIZE - 1, num_pages), fmt=fmt, 66 | output_folder=output_folder 67 | ) 68 | 69 | 70 | def extract_page_images_and_dimensions_from_pdf( 71 | doc_id: str, 72 | pdf_fp: Union[str, pathlib.Path], 73 | target_dir: Union[str, pathlib.Path], 74 | fmt: str, 75 | dpi: int, 76 | size: Tuple[Union[int, None], Union[int, None]] 77 | ) -> Tuple[Dict[str, str], Dict[str, Tuple[int, int]]]: 78 | r"""Extract page images and dimensions from a pdf file. 79 | 80 | Note: Currently, this function saves individual page images to the 81 | target_dir directory. This will be removed in the future as want to 82 | write the images directly from memory into tar archives. 83 | 84 | @param doc_id: document id 85 | @param pdf_fp: path to pdf file 86 | @param target_dir: path to target directory 87 | @param fmt: output format; this should be a lossless format when the 88 | function is used for entity detection. 89 | @param dpi: resolution of the output image(s) 90 | @param size: size of the output image(s), uses the Pillow (width, height) 91 | standard. If one of width or height is set to None, the image 92 | aspect ratio is preserved. 
93 | 94 | @return: dict with page_id as keys and paths to extracted images as 95 | value, dict with page_id as keys and dimensions for each 96 | image/page as value 97 | """ 98 | image_paths = {} 99 | image_dimensions = {} 100 | page_number = 1 # page number starts at 1 101 | 102 | # extract pages from pdf as images 103 | for pages_block in pdf_to_page_images_iterator( 104 | pdf_fp=pdf_fp, 105 | fmt=fmt, 106 | dpi=dpi, 107 | size=size, 108 | output_folder=None 109 | ): 110 | for page_img in pages_block: 111 | # get page id 112 | page_id = get_page_id(doc_id, page_number) 113 | 114 | # convert to cv2 format with HSV color space 115 | page_img = np.array(page_img).astype(np.uint8) 116 | page_img = cv2.cvtColor(page_img, code=cv2.COLOR_RGB2BGR) 117 | 118 | # extract dimensions 119 | height, width, _ = page_img.shape 120 | image_dimensions[page_id] = (width, height) 121 | 122 | fp = os.path.join( 123 | target_dir, f"{page_id}.{fmt}" 124 | ) 125 | image_paths[page_id] = fp 126 | cv2.imwrite(fp, page_img) 127 | 128 | page_number += 1 129 | 130 | return image_paths, image_dimensions 131 | -------------------------------------------------------------------------------- /app/src/annotation/utils/zip_bomb.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | import zipfile 3 | from pathlib import Path 4 | from PIL import Image 5 | from PIL import UnidentifiedImageError 6 | 7 | from src.exceptions import * 8 | 9 | __all__ = [ 10 | "get_uncompressed_file_size", 11 | "detect_image_decompression_bombs", 12 | "zip_bomb_check" 13 | ] 14 | 15 | # Limit images to around 32MB for a 24-bit (3 bpp) image 16 | MAX_IMAGE_PIXELS = int(1024 * 1024 * 1024 // 32 // 3) 17 | 18 | IMG_EXT = ( 19 | '.bmp', 20 | '.gif', 21 | '.jpeg', 22 | '.jpg', 23 | '.png', 24 | '.tiff', 25 | '.ico', 26 | '.pcx', 27 | '.ppm', 28 | '.pgm', 29 | '.pbm', 30 | '.pnm', 31 | '.webp', 32 | '.hdr', 33 | '.dds', 34 | '.im', 35 | '.eps', 36 | '.svg' 37 | ) 38 | 39 | 40 | def _compression_ratio(zip_file: zipfile.ZipFile): 41 | uncompressed_size = sum(zp.file_size for zp in zip_file.infolist()) 42 | compressed_size = sum(zp.compress_size for zp in zip_file.infolist()) 43 | 44 | if compressed_size == 0: 45 | return 0 46 | 47 | return uncompressed_size / compressed_size 48 | 49 | 50 | def get_uncompressed_file_size(doc_bytes: bytes, doc_fn: Path): 51 | # check if file is a valid zip file 52 | with BytesIO(doc_bytes) as f: 53 | if not zipfile.is_zipfile(f): 54 | raise NoZipFileException(f"{doc_fn} is not a valid zip file") 55 | 56 | # calculate uncompressed size 57 | with zipfile.ZipFile(f) as zf: 58 | uncompressed_size = sum(zp.file_size for zp in zf.infolist()) 59 | 60 | return uncompressed_size 61 | 62 | 63 | def detect_image_decompression_bombs(doc_bytes: bytes, doc_fn: Path): 64 | with BytesIO(doc_bytes) as f: 65 | if not zipfile.is_zipfile(f): 66 | raise NoZipFileException(f"{doc_fn} is not a valid zip file") 67 | 68 | # check if one of the images is a decompression bomb 69 | with zipfile.ZipFile(f) as zf: 70 | # check images in zip file 71 | for fp in zf.namelist(): 72 | if not fp.lower().endswith(IMG_EXT): 73 | continue 74 | 75 | img_bytes_compressed = zf.read(fp) 76 | 77 | try: 78 | Image.open(BytesIO(img_bytes_compressed)) 79 | except Image.DecompressionBombError as e: 80 | raise ImageDecompressionBombError( 81 | f"{doc_fn} -- Image decompression bomb detected: " 82 | "image pixels exceed max image pixels; " 83 | f"error:\n\t{e}" 84 | ) 85 | except Exception as e: 86 | 
print(f"[WARNING] reading image {fp} " 87 | f"failed with {e.__class__.__name__}: {e}") 88 | continue 89 | 90 | 91 | def zip_bomb_check( 92 | doc_bytes: bytes, threshold: float = 100, 93 | max_image_pixels=MAX_IMAGE_PIXELS 94 | ): 95 | Image.MAX_IMAGE_PIXELS = max_image_pixels 96 | 97 | with BytesIO(doc_bytes) as f: 98 | if not zipfile.is_zipfile(f): 99 | raise NoZipFileException(f"document is not a valid zip file") 100 | 101 | with zipfile.ZipFile(f, "r") as zip_file: 102 | cr = _compression_ratio(zip_file) 103 | 104 | if cr > threshold: 105 | raise ZipBombException(f"zip bomb detected: compression ratio" 106 | f" {cr} exceeds threshold {threshold}") 107 | 108 | # check images in zip file 109 | for fp in zip_file.namelist(): 110 | if ( 111 | not fp.startswith("word/media") or 112 | not fp.lower().endswith(IMG_EXT) 113 | ): 114 | continue 115 | 116 | img_bytes_compressed = zip_file.read(fp) 117 | 118 | try: 119 | Image.open(BytesIO(img_bytes_compressed)) 120 | except Image.DecompressionBombError as e: 121 | raise ImageDecompressionBombError( 122 | "Image decompression bomb detected: " 123 | "image pixels exceed max image pixels; " 124 | f"error:\n\t{e}" 125 | ) 126 | except UnidentifiedImageError as e: 127 | raise UnidentifiedImageError(e) 128 | except Exception as e: 129 | print(f"[WARNING] reading image {fp} " 130 | f"failed with {e.__class__.__name__}: {e}") 131 | continue 132 | -------------------------------------------------------------------------------- /app/src/cc_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/cc_processing/__init__.py -------------------------------------------------------------------------------- /app/src/cc_processing/deduplicate.py: -------------------------------------------------------------------------------- 1 | r"""Utility module, which globally deduplicates URLs obtained from CC dumps 2 | (i.e after running this script, a URL will show up only once globally accross all dump parquets) 3 | before passing them to download_docs_ray. Input are parquet files in CLEAN_URLS_DIR, 4 | output is globally deduplicated df.""" 5 | 6 | import pandas as pd 7 | import os 8 | from pathlib import Path 9 | 10 | def dedupe_urls(src_dir: str, input_df: pd.DataFrame) -> pd.DataFrame: 11 | r""" 12 | Deduplicate URLs globally: While processing a new URL dump, deduplicate against already processed dumps. 13 | @param src_dir: dir with parquets to deduplicate against. 14 | @param input_df: df of URLs currently being processed. 15 | @raises ValueError: No files in src_dir, or unexpected parquet format. 16 | 17 | return: Globally deduplicated df 18 | """ 19 | 20 | # list parquet files in src_dir 21 | unprocessed = list(filter(lambda x: x.endswith('.parquet'), os.listdir(src_dir))) 22 | initial_len = len(input_df) 23 | 24 | if (len(unprocessed) <= 0): 25 | raise ValueError("No parquet files found in " + src_dir) 26 | 27 | # build initial set (note: set lookup for contains is O(1)) 28 | # ! 
note individual parquets are already deduped on per-dump basis 29 | try: 30 | pqname = unprocessed.pop() 31 | curr_df = pd.read_parquet(Path(src_dir, pqname)) 32 | except Exception: 33 | raise ValueError("Cannot read initial parquet file from " + src_dir) 34 | try: 35 | url_hash_tracker = set(curr_df['url_hash']) 36 | except Exception: 37 | raise ValueError("Unexpected parquet format, url_hash required (in file) " + pqname) 38 | 39 | # go through each parquet file, and get the hashes 40 | while (len(unprocessed) > 0): 41 | pqname = unprocessed.pop() 42 | curr_df = pd.read_parquet(Path(src_dir, pqname)) 43 | try: 44 | url_hashes = set(curr_df['url_hash']) 45 | except Exception: 46 | raise ValueError("Unexpected parquet format, url_hash required (in file) " + pqname) 47 | # add to hashes we compare against 48 | url_hash_tracker = url_hash_tracker.union(url_hashes) 49 | 50 | # remove duplicates 51 | hash_series = pd.Series(list(url_hash_tracker)) 52 | out_df = input_df[~input_df['url_hash'].isin(hash_series)] 53 | end_len = len(out_df) 54 | 55 | print('Removed ' + str(initial_len - end_len) + ' duplicates through comparison with already processed dumps') 56 | return out_df -------------------------------------------------------------------------------- /app/src/data_sources/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /app/src/data_sources/download_exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes for custom exceptions that may be thrown whilst downloading documents. 3 | """ 4 | import settings 5 | from typing import Tuple, Union 6 | 7 | 8 | class InvalidContentType(Exception): 9 | def __init__(self, content_type): 10 | """ 11 | The HTTP content type header is not acceptable. 12 | """ 13 | self.content_type = content_type 14 | 15 | def __repr__(self): 16 | return "InvalidContentType({})".format(self.content_type) 17 | 18 | 19 | class FileSizeExceeded(Exception): 20 | def __init__(self, filesize): 21 | """ 22 | Attempted to download a document whose file size is too large. 23 | """ 24 | self.filesize = filesize 25 | 26 | def __repr__(self): 27 | return "FileSizeExceeded({})".format(self.filesize) 28 | 29 | 30 | class OleCheckFailed(Exception): 31 | def __init__(self, error): 32 | """ 33 | A safety check on a .docx or .doc file's OLE properties failed, so the document is not safe to download. 34 | (See Microsoft OLE documentation for safety of OLE properties, and maldoc_check.py for implementation) 35 | """ 36 | self.error = error 37 | 38 | def __repr__(self): 39 | return "OleCheckFailed({})".format(self.error) 40 | 41 | class HTTPError(Exception): 42 | def __init__(self, status_code=None): 43 | """ 44 | A benign HTTP error, signified by a status code. 45 | """ 46 | self.status_code = status_code 47 | 48 | def __repr__(self): 49 | return "HTTPError={}".format(self.status_code) 50 | 51 | """ 52 | Functions to check validity of downloads and requests 53 | """ 54 | 55 | def valid_content_type( 56 | content_type: str 57 | ) -> Tuple[ 58 | Union[str, None], Union[InvalidContentType, None] 59 | ]: 60 | """check if content type is valid; the check passes if either 61 | the content type is unknown or if the content type is known and is found to 62 | be valid.
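Illustrative usage sketch (which types are accepted is governed by settings.download.VALID_CT_REGEX; "application/msword" below is just an example input):

>>> valid_content_type(None)  # unknown content type is let through
(None, None)
>>> ct, err = valid_content_type("application/msword")
>>> err is None or isinstance(err, InvalidContentType)
True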
63 | 64 | @content_type: str content type 65 | return: bool, InvalidContentType exception or None 66 | """ 67 | if content_type is None: 68 | # unknown content type 69 | return content_type, None 70 | 71 | # sanitize content type string 72 | content_type = content_type.lower().replace('-', '') 73 | 74 | if settings.download.VALID_CT_REGEX.match(content_type) is None: 75 | return content_type, InvalidContentType(content_type=content_type) 76 | 77 | return content_type, None 78 | 79 | 80 | def valid_content_length( 81 | content_length: Union[str, None] 82 | ) -> Tuple[ 83 | Union[int, None], Union[FileSizeExceeded, None] 84 | ]: 85 | """check if content length is valid; this functions returns True if either 86 | the file size is known and below the maximally allowed file size, or if the 87 | file size is unknown. 88 | 89 | @content_length: str content length 90 | 91 | return: bool, FileSizeExceeded exception or None 92 | """ 93 | try: 94 | content_length = int(content_length) 95 | except (TypeError, ValueError): 96 | return content_length, None 97 | 98 | # check size of content 99 | if content_length > settings.download.MAX_FILESIZE: 100 | return content_length, FileSizeExceeded(filesize=content_length) 101 | 102 | return content_length, None -------------------------------------------------------------------------------- /app/src/data_sources/http_handlers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import settings 3 | from typing import Tuple, Union 4 | import requests 5 | from src.data_sources.download_exceptions import ( 6 | FileSizeExceeded, 7 | HTTPError, 8 | InvalidContentType, 9 | valid_content_length, 10 | valid_content_type 11 | ) 12 | 13 | 14 | def run_sess( 15 | sess_method: Union[requests.get, requests.head], 16 | timeout: int, 17 | allow_redirects: bool, url: str 18 | ) -> Tuple[requests.Response, Exception, int]: 19 | """run session 20 | @param sess_method: requests.get or requests.head 21 | @param timeout: int timeout 22 | @param allow_redirects: bool allow redirects 23 | @param url: str url 24 | 25 | return: requests.Response, Exception str, Timestamp int 26 | """ 27 | timestamp = int(time.time()) 28 | exception = None 29 | 30 | try: 31 | response = sess_method( 32 | url, timeout=timeout, allow_redirects=allow_redirects, stream=True 33 | ) 34 | except Exception as e: 35 | response = None 36 | exception = e 37 | 38 | return response, exception, timestamp 39 | 40 | 41 | def header_handler( 42 | response: requests.Response, 43 | exception: Exception 44 | ) -> Tuple[ 45 | Union[requests.Response, None], 46 | dict, 47 | Union[Exception, FileSizeExceeded, InvalidContentType, HTTPError, None] 48 | ]: 49 | """ handle header: check for valid content type and content length, and 50 | return header metadata 51 | 52 | @param response: requests.Response 53 | @param exception: Exception raised during call to requests.head 54 | 55 | return: requests.Response, dict, Exception 56 | """ 57 | header_metadata = {} 58 | 59 | # in this case, the download failed during run_sess, so we return the 60 | # original exception raise by the call to sess.head 61 | if response is None: 62 | return response, header_metadata, exception 63 | 64 | # in this case, the server sent a response, but the response code is not 65 | # 200, so we return the HTTPError exception 66 | if response.status_code != 200: 67 | return ( 68 | response, 69 | header_metadata, 70 | HTTPError(status_code=response.status_code) 71 | ) 72 | 73 | header_metadata = {k: 
response.headers.get(k, None) for k in 74 | settings.download.HEADER_FIELDS} 75 | 76 | # check for valid content length 77 | content_length, exception = valid_content_length( 78 | header_metadata['content-length'] 79 | ) 80 | header_metadata['content-length'] = content_length 81 | 82 | if exception is not None: 83 | return response, header_metadata, exception 84 | 85 | # check for valid content type 86 | content_type, exception = valid_content_type( 87 | header_metadata['content-type'] 88 | ) 89 | header_metadata['content-type'] = content_type 90 | 91 | return response, header_metadata, exception 92 | 93 | 94 | def body_handler( 95 | response: requests.Response, 96 | exception: Exception 97 | ) -> Tuple[ 98 | Union[requests.Response, None], 99 | dict, 100 | Union[Exception, HTTPError, FileSizeExceeded, None] 101 | ]: 102 | """ handle body: check if response is valid, fetch ip-address and content 103 | length, and return body metadata 104 | 105 | @param response: requests.Response 106 | @param exception: Exception raised during call to sess.get 107 | 108 | return: requests.Response, dict, Exception 109 | """ 110 | body_metadata = {} 111 | 112 | # in this case, the download failed during run_sess, so we return the 113 | # original exception raise by the call to sess.get 114 | if response is None: 115 | return response, body_metadata, exception 116 | 117 | if response.status_code != 200: 118 | return ( 119 | response, 120 | body_metadata, 121 | HTTPError(status_code=response.status_code) 122 | ) 123 | 124 | # get content length 125 | try: 126 | content_length = len(response.content) 127 | except TypeError: 128 | content_length = None 129 | 130 | content_length, exception = valid_content_length(content_length) 131 | body_metadata = { 132 | # dummy value for ip --> not collected 133 | 'ip_address': 0, 'content_length': content_length 134 | } 135 | 136 | return response, body_metadata, exception 137 | -------------------------------------------------------------------------------- /app/src/exceptions.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | __all__ = [ 4 | "InconsistentPageCountError", 5 | "InconsistentAspectRatiosError", 6 | "UnsupportedDocumentLayoutError", 7 | "ConversionFailedException", 8 | "SofficeStartFailed", 9 | "UnknownPageCountException", 10 | "PageCountExceededException", 11 | "ZipBombException", 12 | "NoZipFileException", 13 | "CompressedFileSizeExceededException", 14 | "UncompressedFileSizeExceededException", 15 | "ImageDecompressionBombError", 16 | "TextTooShortException" 17 | ] 18 | 19 | 20 | class InconsistentPageCountError(Exception): 21 | r"""Raised when the number of pages in the PDF file is not consistent with 22 | the number of pages in the annotated pdf. 23 | 24 | Note: If this error is raised, then it might indicate that the colorization 25 | step has interfered with the layout of the document! 26 | """ 27 | 28 | def __init__(self, expected: Set, actual: Set): 29 | self.expected = expected 30 | self.actual = actual 31 | super().__init__(f"Expected {expected} pages, but got {actual} pages.") 32 | 33 | def __repr__(self): 34 | return f"InconsistentPageCountError(" \ 35 | f"expected={self.expected}, actual={self.actual}" \ 36 | f")" 37 | 38 | 39 | class InconsistentAspectRatiosError(Exception): 40 | r"""Raised when the aspect ratios of the pages in the PDF file are not 41 | consistent with the aspect ratios of the pages in the rendered page images. 
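As an illustrative example: an A4 page rendered at 150 dpi is roughly 1240 x 1754 px, i.e. an aspect ratio of about 1.41; this error signals that the ratio taken from the PDF page and the ratio of the rendered image disagree (the numbers here are purely illustrative).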
42 | """ 43 | 44 | def __init__(self, ar_pdf, ar_img): 45 | self.ar_pdf = ar_pdf 46 | self.ar_img = ar_img 47 | super().__init__(f"Expected inconsistent aspect ratios:" 48 | f"got {ar_pdf} from pdf" 49 | f"and {ar_img} from renderings.") 50 | 51 | def __repr__(self): 52 | return f"InconsistentAspectRatiosError(" \ 53 | f"ar_pdf={self.ar_pdf}, ar_img={self.ar_img}" \ 54 | f")" 55 | 56 | 57 | class SofficeStartFailed(Exception): 58 | r"""Raised when the soffice process fails to start.""" 59 | pass 60 | 61 | 62 | class UnsupportedDocumentLayoutError(Exception): 63 | r"""Raised when the layout of the document is not supported, such as 64 | too many document columns, i.e. more than 3""" 65 | 66 | def __init__(self, msg: str): 67 | self.msg = msg 68 | 69 | def __repr__(self): 70 | return f"UnsupportedDocumentLayoutError(msg={self.msg})" 71 | 72 | 73 | class ConversionFailedException(Exception): 74 | r"""Raised when the conversion of a doc/docx file to a pdf file fails.""" 75 | pass 76 | 77 | 78 | class UnknownPageCountException(Exception): 79 | r"""Raised when the page count of a document cannot be determined.""" 80 | pass 81 | 82 | 83 | class PageCountExceededException(Exception): 84 | r"""Raised when the page count of a document exceeds the maximum allowed 85 | number of pages.""" 86 | pass 87 | 88 | 89 | class ZipBombException(Exception): 90 | r"""Raised when a zip bomb is detected.""" 91 | pass 92 | 93 | 94 | class NoZipFileException(Exception): 95 | r"""Raised when a file is not a zip file.""" 96 | pass 97 | 98 | 99 | class CompressedFileSizeExceededException(Exception): 100 | r"""Raised when a file size exceeds the maximum allowed file size.""" 101 | pass 102 | 103 | 104 | class UncompressedFileSizeExceededException(Exception): 105 | r"""Raised when an uncompressed file size exceeds the maximum allowed 106 | file size.""" 107 | pass 108 | 109 | 110 | class ImageDecompressionBombError(Exception): 111 | r"""Raised when an image decompression bomb is detected.""" 112 | pass 113 | 114 | 115 | class TextTooShortException(Exception): 116 | r"""Raised when the text of a document is too short.""" 117 | pass 118 | -------------------------------------------------------------------------------- /app/src/extensions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/__init__.py -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/obj_detection/__init__.py -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/data_prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/obj_detection/data_prep/__init__.py -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/data_prep/wordscape_yolo_config_handler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import List, Tuple 3 | from src.extensions.obj_detection.data_prep.wordscape_yolo_formatter import 
YOLOSettings 4 | from pathlib import Path 5 | import settings 6 | 7 | 8 | def json_to_config(json_obj, first_key: str) -> YOLOSettings: 9 | # make elem_mergings 10 | elem_mergings_formatted = {} 11 | elem_mergings_json = json_obj[first_key]["elem_mergings"] 12 | if len(elem_mergings_json) == 0: 13 | elem_mergings_formatted = {"masters": {}, "mapping": {}} 14 | else: 15 | masters = {} 16 | for key in elem_mergings_json["masters"]: 17 | masters[int(key)] = elem_mergings_json["masters"][key] 18 | mapping = {} 19 | for key in elem_mergings_json["mapping"]: 20 | mapping[int(key)] = int(elem_mergings_json["mapping"][key]) 21 | 22 | elem_mergings_formatted = {"masters": masters, "mapping": mapping} 23 | 24 | # check if elem_accepts are defined by the provided JSON 25 | elem_accepts_base = settings.entities.LABEL_NUMS 26 | if ( 27 | ("elem_accepts" in json_obj[first_key].keys()) 28 | and (json_obj[first_key]["elem_accepts"] != None) 29 | and (len(json_obj[first_key]["elem_accepts"]) > 0) 30 | ): 31 | elem_accepts_base = json_obj[first_key]["elem_accepts"] 32 | 33 | base_settings = YOLOSettings( 34 | raw_path=Path(json_obj[first_key]["raw_path"]), 35 | is_validation=json_obj[first_key]["is_validation"], 36 | max_img=json_obj[first_key]["max_img"], 37 | elem_drops=json_obj[first_key]["elem_drops"], 38 | elem_mergings=elem_mergings_formatted, 39 | elem_accepts=elem_accepts_base, 40 | scanify=json_obj[first_key]["scanify"], 41 | quality_threshold=json_obj[first_key]["quality_threshold"], 42 | language_codes=json_obj[first_key]["language_codes"], 43 | language_code_threshold=json_obj[first_key]["language_code_threshold"], 44 | ) 45 | 46 | # check if there are element minimums defined. If not, return the base settings 47 | if ( 48 | ("elem_mins" in json_obj[first_key].keys()) 49 | and (json_obj[first_key]["elem_mins"] != None) 50 | and (len(json_obj[first_key]["elem_mins"]) > 0) 51 | ): 52 | settings_list = [] 53 | for elem_type in json_obj[first_key]["elem_mins"].keys(): 54 | setting_modified = copy.deepcopy(base_settings) 55 | # distinction: elem_drops (dont include in dataset) vs. elems that we require to be in a doc (for filter purposes) 56 | setting_modified.elem_accepts = [int(elem_type)] 57 | setting_modified.max_img = json_obj[first_key]["elem_mins"][elem_type] 58 | settings_list.append(setting_modified) 59 | return settings_list 60 | else: 61 | return [base_settings] 62 | 63 | 64 | def parse_config(json_obj) -> Tuple[List[YOLOSettings], List[YOLOSettings]]: 65 | r""" 66 | Parse a json config to two YOLOSettings objects for train and validation dataset. 
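Illustrative config shape (a sketch; all values below are placeholders, and json_to_config above defines which keys are required; "elem_accepts" and "elem_mins" are optional):

{
    "train_settings": {
        "raw_path": "/path/to/wordscape_yolo_raw",
        "is_validation": false,
        "max_img": 10000,
        "elem_drops": [],
        "elem_mergings": {},
        "scanify": false,
        "quality_threshold": 0.0,
        "language_codes": ["en"],
        "language_code_threshold": 0.0
    },
    "val_settings": { ... }
}

"val_settings" takes the same keys as "train_settings".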
67 | """ 68 | 69 | train_settings = json_to_config(json_obj, "train_settings") 70 | val_settings = json_to_config(json_obj, "val_settings") 71 | 72 | return train_settings, val_settings 73 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/data_prep/yolo_dataset_report.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | from pathlib import Path 4 | import settings 5 | import json 6 | 7 | 8 | def parse_label_folder(folder_path: Path) -> dict: 9 | # track total results 10 | entity_counts = {key: 0 for key in settings.entities.LABEL_NUMS} 11 | # also track empty label examples 12 | empty_labels = 0 13 | 14 | for txt_file in folder_path.glob("*"): 15 | # read the txt file 16 | txt_lines = [] 17 | with open(txt_file, "r") as txt_file_open: 18 | for line in txt_file_open.readlines(): 19 | line_list = line.split() 20 | txt_lines.append(line_list) 21 | 22 | # only count unique bboxes per img 23 | txt_lines = [list(x) for x in set(tuple(x) for x in txt_lines)] 24 | # count entity appearances 25 | for entry in txt_lines: 26 | entity_counts[int(entry[0])] = entity_counts[int(entry[0])] + 1 27 | 28 | if len(txt_lines) == 0: 29 | empty_labels += 1 30 | 31 | entity_counts[-1] = empty_labels 32 | return entity_counts 33 | 34 | 35 | def main(): 36 | arg_parser = argparse.ArgumentParser() 37 | arg_parser.add_argument( 38 | "--dataset_path", 39 | "-dp", 40 | type=str, 41 | default="/mnt/DATA/msc-data/yolo_wordscape_experiments/3headers_balanced_quality", 42 | help="path to dataset to analyze", 43 | ) 44 | args = arg_parser.parse_args() 45 | 46 | # read labels for yolo classes from dataset.yaml 47 | ds_path = Path(args.dataset_path) 48 | with open(ds_path / "dataset.yaml", "r") as stream: 49 | yaml_ds = yaml.safe_load(stream) 50 | labels = yaml_ds["names"] 51 | 52 | # check train and val data 53 | train_counts = parse_label_folder(ds_path / "train" / "labels") 54 | val_counts = parse_label_folder(ds_path / "val" / "labels") 55 | 56 | # apply labels for report 57 | train_counts_formatted = {} 58 | val_counts_formatted = {} 59 | for i in range(len(labels)): 60 | train_counts_formatted[labels[i]] = train_counts[i] 61 | val_counts_formatted[labels[i]] = val_counts[i] 62 | train_counts_formatted["empty_labels"] = train_counts[-1] 63 | val_counts_formatted["empty_labels"] = val_counts[-1] 64 | 65 | report_dict = { 66 | "train_counts": train_counts_formatted, 67 | "val_counts": val_counts_formatted, 68 | } 69 | with open(ds_path / "report.json", "w") as report_w: 70 | json.dump(report_dict, report_w) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/spaceml/move_train_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import shutil 4 | 5 | 6 | def main(): 7 | r""" 8 | Utility script to move some train data into a validation folder. 
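Illustrative invocation (paths are placeholders; both directories are expected to contain 'multimodal' and 'meta' subfolders, as the code below shows):

python move_train_data.py -td /data/ws_yolo/train -vd /data/ws_yolo/val -n 5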
9 | """ 10 | 11 | arg_parser = argparse.ArgumentParser() 12 | arg_parser.add_argument( 13 | "--train_dir", 14 | "-td", 15 | type=str, 16 | default=None, 17 | help="source train dir to move files from", 18 | ) 19 | arg_parser.add_argument( 20 | "--val_dir", 21 | "-vd", 22 | type=str, 23 | default=None, 24 | help="destination val dir to move files to", 25 | ) 26 | arg_parser.add_argument( 27 | "--num", "-n", type=int, default=None, help="number of files to move" 28 | ) 29 | args = arg_parser.parse_args() 30 | 31 | meta_paths = [] 32 | 33 | tar_paths = sorted( 34 | filter(lambda x: x.endswith(".tar"), os.listdir(args.train_dir + "/multimodal")) 35 | )[0 : args.num] 36 | 37 | for tar_name in tar_paths: 38 | tar_path = os.path.join(args.train_dir + "/multimodal", tar_name) 39 | meta_path = ( 40 | "doc_meta_" + tar_name.replace("docs_", "").replace(".tar", "") + ".jsonl" 41 | ) 42 | meta_paths.append(meta_path) 43 | 44 | shutil.move(tar_path, args.val_dir + "/multimodal") 45 | 46 | for meta_name in meta_paths: 47 | meta_path_inner = os.path.join(args.train_dir + "/meta", meta_name) 48 | shutil.move(meta_path_inner, args.val_dir + "/meta") 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/spaceml/move_train_data_singlefiles.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from pathlib import Path 4 | import shutil 5 | import random 6 | 7 | def main(): 8 | r""" 9 | Utility script to move some train data (images and labels) into a different folder. 10 | """ 11 | 12 | arg_parser = argparse.ArgumentParser() 13 | arg_parser.add_argument( 14 | "--source_dir", 15 | "-sd", 16 | type=str, 17 | default=None, 18 | help="source dir to move files from", 19 | ) 20 | arg_parser.add_argument( 21 | "--dest_dir", 22 | "-dd", 23 | type=str, 24 | default=None, 25 | help="destination dir to move files to", 26 | ) 27 | arg_parser.add_argument( 28 | "--num", "-n", type=int, default=None, help="number of files to move" 29 | ) 30 | args = arg_parser.parse_args() 31 | 32 | img_paths = sorted( 33 | filter(lambda x: x.endswith(".png") or x.endswith(".jpg"), os.listdir(args.source_dir + "/images")) 34 | )[0 : args.num] 35 | 36 | print(img_paths) 37 | 38 | # randomly sample 39 | img_paths_shuffled = random.sample(img_paths, len(img_paths)) 40 | 41 | label_paths = [] 42 | 43 | for img_name in img_paths_shuffled: 44 | img_path = Path(args.source_dir + "/images") / img_name 45 | dest_path = Path(args.dest_dir + "/images") / img_name 46 | shutil.move(img_path, dest_path) 47 | # print(img_path) 48 | # print(dest_path) 49 | 50 | label_path = img_path.parents[1] / "labels" / img_name.replace('.png', '.txt').replace('.jpg', '.txt') 51 | label_paths.append(label_path) 52 | 53 | for label_path in label_paths: 54 | dest_path = Path(args.dest_dir + "/labels") / label_path.name 55 | shutil.move(label_path, dest_path) 56 | # print(label_path) 57 | # print(dest_path) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /app/src/extensions/obj_detection/spaceml/ws_yolo_experimentrun.py: -------------------------------------------------------------------------------- 1 | from ultralytics import YOLO 2 | import argparse 3 | import os 4 | 5 | 6 | def main(): 7 | r""" 8 | A script to run a YOLO-Wordscript experiment. 
Arguments are the path to the YOLO dataset yaml, and the GPUs to use. 9 | """ 10 | arg_parser = argparse.ArgumentParser() 11 | arg_parser.add_argument( 12 | "--config_path", 13 | "-cp", 14 | type=str, 15 | default="/home/valde/GitHub/msc-thesis/data/experiments/baseline/dataset.yaml", 16 | help="path to config", 17 | ) 18 | arg_parser.add_argument( 19 | "--gpu_usage", 20 | "-gu", 21 | type=str, 22 | default="0,1,2,3", 23 | help="Comma separated list of CUDA GPU IDs", 24 | ) 25 | arg_parser.add_argument( 26 | "--epochs", "-ep", type=int, default=10, help="number of epochs" 27 | ) 28 | arg_parser.add_argument( 29 | "--gpu_batch", "-gb", type=int, default=24, help="batch size per gpu" 30 | ) 31 | arg_parser.add_argument( 32 | "--resume_path", 33 | "-rp", 34 | type=str, 35 | default=None, 36 | help="Path to weights for resume", 37 | ) 38 | arg_parser.add_argument( 39 | "--use_pretrained", 40 | "-up", 41 | type=bool, 42 | default=False, 43 | help="Flag to use resume_path not to resume from, but as pretrained weights for a new experiment", 44 | ) 45 | arg_parser.add_argument( 46 | "--override_name", 47 | "-on", 48 | type=str, 49 | default=None, 50 | help="Optionally override experiment name", 51 | ) 52 | arg_parser.add_argument( 53 | "--random_weights", 54 | "-rw", 55 | type=bool, 56 | default=False, 57 | help="If set, the model will be initialized with random weights (i.e train fully from scratch)" 58 | ) 59 | arg_parser.add_argument( 60 | "--learning_rate", 61 | "-lr", 62 | type=bool, 63 | default=False, 64 | help="If set, the model will be trained using learning rate decay" 65 | ) 66 | args = arg_parser.parse_args() 67 | 68 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 69 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_usage 70 | 71 | experiment_name = args.config_path.split("/")[-2] 72 | if (args.override_name != None): 73 | experiment_name = args.override_name 74 | 75 | # check wether to resume with these weights, or to use as pretrained 76 | res_decision = False 77 | if (args.resume_path != None) and (args.use_pretrained == False): 78 | res_decision = True 79 | 80 | model = YOLO("yolov5lu.pt") 81 | if args.resume_path != None: 82 | model = YOLO(args.resume_path) 83 | if args.random_weights == True: 84 | # ! 
important: .yaml means this is just a config, not preloaded weights 85 | model = YOLO("yolov5l.yaml") 86 | if args.learning_rate == True: 87 | model.train( 88 | data=args.config_path, 89 | lr0 = 1e-3, 90 | lrf = 1e-4, 91 | epochs=args.epochs, 92 | name=experiment_name, 93 | device=[int(x) for x in args.gpu_usage.split(",")], 94 | batch=len(args.gpu_usage.split(",")) * args.gpu_batch, 95 | resume=res_decision, 96 | ) 97 | else: 98 | model.train( 99 | data=args.config_path, 100 | epochs=args.epochs, 101 | name=experiment_name, 102 | device=[int(x) for x in args.gpu_usage.split(",")], 103 | batch=len(args.gpu_usage.split(",")) * args.gpu_batch, 104 | resume=res_decision, 105 | ) 106 | 107 | 108 | if __name__ == "__main__": 109 | main() 110 | -------------------------------------------------------------------------------- /app/src/extensions/pretrain/layoutlmv3/data_prep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/src/extensions/pretrain/layoutlmv3/data_prep/__init__.py -------------------------------------------------------------------------------- /app/src/quality/__init__.py: -------------------------------------------------------------------------------- 1 | from . import text_normalizer 2 | -------------------------------------------------------------------------------- /app/src/quality/perplexity.py: -------------------------------------------------------------------------------- 1 | """ 2 | code based on 3 | https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py 4 | """ 5 | 6 | from pathlib import Path 7 | from sentencepiece import SentencePieceProcessor 8 | import kenlm 9 | 10 | from src.quality import text_normalizer 11 | 12 | 13 | def perplexity(log_score, length): 14 | return 10.0 ** (-log_score / length) 15 | 16 | 17 | class SentencePiece: 18 | 19 | def __init__(self, model: Path, normalize=True): 20 | self._normalize = normalize 21 | 22 | self._sp = SentencePieceProcessor() 23 | self._sp.load(str(model)) 24 | 25 | def tokenize(self, text: str): 26 | if self._normalize: 27 | text = text_normalizer.normalize(text) 28 | 29 | tokenized = self._sp.encode_as_pieces(text) 30 | return " ".join(tokenized) 31 | 32 | 33 | class LanguageModel: 34 | def __init__(self, sp_model: Path, lm_model: Path): 35 | # init models 36 | self._sp = SentencePiece(sp_model, normalize=True) 37 | lm_config = kenlm.Config() 38 | self._lm = kenlm.Model(str(lm_model), lm_config) 39 | 40 | def compute_perplexity(self, content: str) -> float: 41 | # tokenize 42 | content = self._sp.tokenize(content) 43 | 44 | # get lines 45 | lines = content.split("\n") 46 | 47 | doc_log_score, doc_length = 0, 0 48 | 49 | for line in lines: 50 | log_score = self._lm.score(line) 51 | length = len(line.split()) + 1 52 | doc_log_score += log_score 53 | doc_length += length 54 | 55 | return perplexity(doc_log_score, doc_length) 56 | -------------------------------------------------------------------------------- /app/src/quality/text_normalizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | code adapted from 3 | https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py 4 | """ 5 | import re 6 | import unicodedata 7 | 8 | DIGIT_RE = re.compile(r"\d") 9 | UNICODE_PUNCT = { 10 | ",": ",", 11 | "。": ".", 12 | "、": ",", 13 | "„": '"', 14 | "”": '"', 15 | "“": '"', 16 | "«": '"', 17 | "»": '"', 18 | "1": 
'"', 19 | "」": '"', 20 | "「": '"', 21 | "《": '"', 22 | "》": '"', 23 | "´": "'", 24 | "∶": ":", 25 | ":": ":", 26 | "?": "?", 27 | "!": "!", 28 | "(": "(", 29 | ")": ")", 30 | ";": ";", 31 | "–": "-", 32 | "—": " - ", 33 | ".": ". ", 34 | "~": "~", 35 | "’": "'", 36 | "…": "...", 37 | "━": "-", 38 | "〈": "<", 39 | "〉": ">", 40 | "【": "[", 41 | "】": "]", 42 | "%": "%", 43 | "►": "-", 44 | } 45 | UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]") 46 | 47 | NON_PRINTING_CHARS_RE = re.compile( 48 | f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]" 49 | ) 50 | 51 | 52 | def strip_accents(line: str) -> str: 53 | """Strips accents from a piece of text.""" 54 | nfd = unicodedata.normalize("NFD", line) 55 | output = [c for c in nfd if unicodedata.category(c) != "Mn"] 56 | if len(output) == line: 57 | return line 58 | return "".join(output) 59 | 60 | 61 | def replace_unicode_punct(text: str) -> str: 62 | return "".join((UNICODE_PUNCT.get(c, c) for c in text)) 63 | 64 | 65 | def remove_non_printing_char(text: str) -> str: 66 | return NON_PRINTING_CHARS_RE.sub("", text) 67 | 68 | 69 | def normalize(line: str) -> str: 70 | line = line.strip() 71 | 72 | if not line: 73 | return line 74 | 75 | line = line.lower() 76 | line = strip_accents(line) 77 | line = DIGIT_RE.sub("0", line) 78 | line = replace_unicode_punct(line) 79 | line = remove_non_printing_char(line) 80 | 81 | return line 82 | -------------------------------------------------------------------------------- /app/utilities/checksums.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/app/utilities/checksums.parquet -------------------------------------------------------------------------------- /app/utilities/compute_checksums.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from concurrent.futures import ProcessPoolExecutor, as_completed 3 | import functools 4 | import multiprocessing as mp 5 | from pathlib import Path 6 | import tarfile 7 | from typing import Set 8 | import hashlib 9 | import polars as pl 10 | from typing import Dict, List 11 | from tqdm import tqdm 12 | 13 | # ------ debug 14 | _sources = "/Users/maurice/phd/code/openDoc/WordScape-Data/annotated/cc_main_2022_49/20230601_163415/doc_sources" 15 | _doc_meta = "/Users/maurice/phd/code/openDoc/WordScape-Data/annotated/cc_main_2022_49/20230601_163415/meta_copy/doc.meta.parquet" 16 | # ------ debug 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--sources", type=str, default=_sources) 20 | parser.add_argument("--doc_meta", type=str, default=_doc_meta) 21 | parser.add_argument("--out_dir", type=str, default=".") 22 | args = parser.parse_args() 23 | 24 | 25 | def load_document_ids(meta_fp) -> Set[str]: 26 | return set(( 27 | pl.scan_parquet(meta_fp) 28 | .select(pl.col("url_hash")) 29 | ).collect().to_dict()["url_hash"]) 30 | 31 | 32 | def name_to_id(name: str) -> str: 33 | return name.replace("doc_", "").split(".")[0] 34 | 35 | 36 | def process_single_file(tar_fp: Path, meta_fp: Path) -> Dict[str, List[str]]: 37 | document_ids = load_document_ids(meta_fp) 38 | tar = tarfile.open(tar_fp, 'r:gz') 39 | 40 | data = { 41 | "url_hash": [], "bytehash": [] 42 | } 43 | 44 | for mem in tar.getmembers(): 45 | url_hash = name_to_id(mem.name) 46 | if url_hash in document_ids: 47 | with tar.extractfile(mem) as fobj: 48 | checksum = 
hashlib.sha256(fobj.read()).hexdigest() 49 | data["url_hash"].append(url_hash) 50 | data["bytehash"].append(checksum) 51 | 52 | return data 53 | 54 | 55 | def process_all(): 56 | meta_fp = Path(args.doc_meta) 57 | source_tars = list(Path(args.sources).glob("*.tar.gz")) 58 | 59 | process_fn = functools.partial(process_single_file, meta_fp=meta_fp) 60 | 61 | with ProcessPoolExecutor(max_workers=mp.cpu_count() - 4) as executor: 62 | futures = list( 63 | executor.submit(process_fn, tar_fp) for tar_fp in source_tars 64 | ) 65 | 66 | count = 0 67 | for future in tqdm(as_completed(futures), total=len(futures)): 68 | single_data = future.result() 69 | pl.DataFrame(single_data).write_parquet( 70 | f"{args.out_dir}/checksums-{count}.parquet" 71 | ) 72 | count += 1 73 | futures.remove(future) 74 | 75 | 76 | if __name__ == '__main__': 77 | process_all() 78 | -------------------------------------------------------------------------------- /app/utilities/merge_annotations_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | 4 | import jsonlines 5 | import joblib 6 | import pandas as pd 7 | import pathlib 8 | from typing import Dict 9 | from tqdm import tqdm 10 | 11 | FP_PATTERNS = { 12 | "page": "*page_*.jsonl", 13 | "doc": "*doc_*.jsonl" 14 | } 15 | 16 | FLATTEN_KWS = [ 17 | "annotation_sources", 18 | "builtin_proportion_per_entity" 19 | ] 20 | 21 | MAX_ROWS_IN_MEM = 100_000 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--meta_dir", type=str, default=None) 25 | args = parser.parse_args() 26 | 27 | 28 | def _flatten_obj(obj: Dict[str, int], key: str): 29 | if not isinstance(obj, dict): 30 | raise ValueError 31 | 32 | return {f"{key}_{k}": v for k, v in obj.items()} 33 | 34 | 35 | def _serialize(obj): 36 | if isinstance(obj, list): 37 | return str(obj) 38 | return obj 39 | 40 | 41 | def _to_dataframe(jsonl_fp: pathlib.Path) -> pd.DataFrame: 42 | df = pd.DataFrame() 43 | 44 | data = {} 45 | 46 | try: 47 | with jsonlines.open(jsonl_fp) as reader: 48 | for obj in reader: 49 | obj_procsd = { 50 | k: _serialize(v) 51 | for k, v in obj.items() if k not in FLATTEN_KWS 52 | } 53 | for k in FLATTEN_KWS: 54 | 55 | if k not in obj: 56 | continue 57 | 58 | obj_procsd.update(_flatten_obj(obj[k], k)) 59 | 60 | if len(data) == 0: 61 | data = {k: [v] for k, v in obj_procsd.items()} 62 | continue 63 | 64 | for k in data.keys(): 65 | data[k].append(obj_procsd[k]) 66 | except Exception as e: 67 | print(f"Failed loading {jsonl_fp} with {e.__class__.__name__}:\n{e}") 68 | 69 | return df.from_dict(data) 70 | 71 | 72 | def do_merge(level: str, meta_dir: pathlib.Path): 73 | print(f"start generating {level}-level metadata file") 74 | fp_pattern = FP_PATTERNS[level] 75 | 76 | meta_files = list(meta_dir.glob(fp_pattern)) 77 | 78 | full_df = pd.DataFrame() 79 | 80 | full_df_fp = meta_dir / f"{level}.meta.parquet" 81 | append = False 82 | 83 | print(f"start generating {level}-level metadata file; " 84 | f"saving to {full_df_fp}") 85 | 86 | with concurrent.futures.ProcessPoolExecutor( 87 | max_workers=joblib.cpu_count() - 1 88 | ) as executor: 89 | for part_df in (pbar := tqdm( 90 | executor.map(_to_dataframe, meta_files), 91 | total=len(meta_files) 92 | )): 93 | full_df = pd.concat([full_df, part_df], ignore_index=True) 94 | rows_in_mem = len(full_df) 95 | 96 | if rows_in_mem > MAX_ROWS_IN_MEM: 97 | full_df.to_parquet( 98 | path=full_df_fp, append=append, engine="fastparquet" 99 | ) 100 | pbar.set_postfix_str( 101 |
f"wrote to {full_df_fp} with append={append}" 102 | ) 103 | append = True 104 | full_df = pd.DataFrame(columns=full_df.columns) 105 | 106 | if len(full_df) > 0: 107 | full_df.to_parquet( 108 | path=full_df_fp, append=append, engine="fastparquet" 109 | ) 110 | 111 | del full_df 112 | 113 | 114 | if __name__ == '__main__': 115 | do_merge(level="doc", meta_dir=pathlib.Path(args.meta_dir)) 116 | do_merge(level="page", meta_dir=pathlib.Path(args.meta_dir)) 117 | -------------------------------------------------------------------------------- /app/utilities/merge_sources_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | import joblib 4 | import pandas as pd 5 | import pathlib 6 | from tqdm import tqdm 7 | 8 | MAX_ROWS_IN_MEM = 100_000 9 | 10 | 11 | def get_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--meta_dir", type=str, default=None) 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def _load_parquet(meta_fp: pathlib.Path): 19 | return pd.read_parquet(meta_fp) 20 | 21 | 22 | def main(): 23 | args = get_args() 24 | data_dir = pathlib.Path(args.meta_dir) 25 | dump_id = data_dir.name 26 | 27 | meta_files = list(data_dir.glob("*.parquet")) 28 | 29 | print("Found", len(meta_files), "source meta files") 30 | 31 | full_df = pd.DataFrame() 32 | full_df_fp = data_dir.parent / f"sources_{dump_id}.meta.parquet" 33 | 34 | append = False 35 | 36 | with concurrent.futures.ProcessPoolExecutor( 37 | max_workers=joblib.cpu_count() - 2 38 | ) as executor: 39 | for part_df in (pbar := tqdm( 40 | executor.map(_load_parquet, meta_files), 41 | total=len(meta_files) 42 | )): 43 | full_df = pd.concat([full_df, part_df], ignore_index=True) 44 | rows_in_mem = len(full_df) 45 | 46 | if rows_in_mem > MAX_ROWS_IN_MEM: 47 | full_df.to_parquet( 48 | path=full_df_fp, append=append, engine="fastparquet" 49 | ) 50 | pbar.set_postfix_str( 51 | f"wrote to {full_df_fp} with append={append}" 52 | ) 53 | append = True 54 | full_df = pd.DataFrame(columns=full_df.columns) 55 | 56 | if len(full_df) > 0: 57 | full_df.to_parquet( 58 | path=full_df_fp, append=append, engine="fastparquet" 59 | ) 60 | 61 | del full_df 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | -------------------------------------------------------------------------------- /app/utilities/run_filter_tars.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import tarfile 4 | import multiprocessing as mp 5 | import json 6 | from typing import List, Tuple, Union 7 | from pathlib import Path 8 | 9 | import joblib 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--data_root", type=str, default=None) 13 | args = parser.parse_args() 14 | 15 | 16 | def get_page_id(fn: str) -> str: 17 | return fn[fn.find("doc_"):].replace("doc_", "") 18 | 19 | 20 | def filter_tar_file( 21 | inputs: Tuple[pathlib.Path, List[str]] 22 | ) -> Union[int, None]: 23 | src_tar_fp, whitelist_pages = inputs 24 | 25 | whitelist_pages = set(get_page_id(p) for p in whitelist_pages) 26 | 27 | filtered_tar_fn = src_tar_fp.name.replace(".tar.gz", ".filtered.tar.gz") 28 | filtered_tar_fp = src_tar_fp.parent / filtered_tar_fn 29 | 30 | src_tar = tarfile.open(src_tar_fp, 'r:gz') 31 | tgt_tar = tarfile.open(filtered_tar_fp, 'w:gz') 32 | 33 | try: 34 | all_jpg_members = set( 35 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 36 | if 
mem.name.endswith(".jpg") 37 | ) 38 | all_txt_members = set( 39 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 40 | if mem.name.startswith("text_doc_") 41 | ) 42 | all_ent_members = set( 43 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 44 | if mem.name.startswith("entities_doc_") 45 | ) 46 | all_wrd_members = set( 47 | get_page_id(Path(mem.name).stem) for mem in src_tar.getmembers() 48 | if mem.name.startswith("words_doc_") 49 | ) 50 | 51 | all_page_ids = all_jpg_members & all_txt_members \ 52 | & all_ent_members & all_wrd_members 53 | 54 | filtered_pages = all_page_ids & whitelist_pages 55 | 56 | # write all matching members to target tar 57 | num_files = 0 58 | for mem in src_tar.getmembers(): 59 | num_files += 1 60 | page_id = get_page_id(Path(mem.name).stem) 61 | if page_id not in filtered_pages: 62 | continue 63 | 64 | fobj = src_tar.extractfile(mem) 65 | fobj.seek(0) 66 | 67 | # write to target tar 68 | tgt_tar.addfile(mem, fobj) 69 | 70 | except Exception as e: 71 | print("Error processing: ", src_tar_fp) 72 | tgt_tar.close() 73 | src_tar.close() 74 | filtered_tar_fp.unlink(missing_ok=True) 75 | print(e) 76 | return 0 77 | 78 | num_filtered_files = len(filtered_pages) * 4 79 | print("Processed: ", src_tar_fp) 80 | print(f"Total files: {num_files}, Filtered files: {num_filtered_files}") 81 | 82 | tgt_tar.close() 83 | src_tar.close() 84 | 85 | return 1 86 | 87 | 88 | def main(): 89 | data_root = pathlib.Path(args.data_root) 90 | annotations_dir = data_root / "multimodal" 91 | paths = list(annotations_dir.glob("*.tar.gz")) 92 | total_paths = len(paths) 93 | 94 | if total_paths == 0: 95 | print("No files found in: ", args.data_root) 96 | return 97 | 98 | # load whitelisted urls 99 | with open(data_root / "whitelist_pages.json", 'r') as f: 100 | whitelist_pages = json.load(f) 101 | 102 | # construct inputs 103 | inputs = list() 104 | for path in paths: 105 | shard_id = path.name.replace("docs_", "").replace(".tar.gz", "") 106 | try: 107 | inputs.append((path, whitelist_pages[shard_id])) 108 | except KeyError: 109 | print("No whitelist for: ", shard_id) 110 | continue 111 | 112 | with mp.Pool(processes=joblib.cpu_count() - 1) as pool: 113 | res_codes = pool.map(filter_tar_file, inputs) 114 | 115 | print("Total files: ", total_paths) 116 | print("Total filtered files: ", sum(res_codes)) 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/data/.gitkeep -------------------------------------------------------------------------------- /docs/wordscape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DS3Lab/WordScape/84cb7d552e5e0bfd47fee2d7e23de909287e6e6e/docs/wordscape.png --------------------------------------------------------------------------------