├── imgs
│   ├── e2e_results.png
│   ├── methodology.png
│   ├── ctp_comparison.png
│   ├── sample_textbox.jpg
│   ├── ctp_model_results.png
│   ├── detection_benchmark.png
│   ├── replication_results.png
│   └── recognition_benchmark.png
├── text_det_configs
│   ├── schedule_adam_step_6e_custom.py
│   ├── comics_speech_bubble_dataset.py
│   └── fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py
├── text_recog_configs
│   ├── schedule_adam_step_6e_custom.py
│   ├── master_custom_dataset.py
│   └── comic_speech_bubble_dataset.py
├── text_extractor.py
└── README.md

--------------------------------------------------------------------------------
/imgs/e2e_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/e2e_results.png

--------------------------------------------------------------------------------
/imgs/methodology.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/methodology.png

--------------------------------------------------------------------------------
/imgs/ctp_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/ctp_comparison.png

--------------------------------------------------------------------------------
/imgs/sample_textbox.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/sample_textbox.jpg

--------------------------------------------------------------------------------
/imgs/ctp_model_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/ctp_model_results.png

--------------------------------------------------------------------------------
/imgs/detection_benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/detection_benchmark.png

--------------------------------------------------------------------------------
/imgs/replication_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/replication_results.png

--------------------------------------------------------------------------------
/imgs/recognition_benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gsoykan/comics_text_plus/HEAD/imgs/recognition_benchmark.png

--------------------------------------------------------------------------------
/text_det_configs/schedule_adam_step_6e_custom.py:
--------------------------------------------------------------------------------
# should be placed under mmocr/configs/_base_/schedules/schedule_adam_step_6e_custom.py

# optimizer
optimizer = dict(type='Adam', lr=1e-4)
optimizer_config = dict(grad_clip=dict(max_norm=0.5))
# learning policy: step-decay the learning rate at the start of epochs 3 and 4
lr_config = dict(policy='step', step=[3, 4])
# alternative: the same step schedule with a linear warm-up phase
# lr_config = dict(
#     policy='step',
#     step=[3, 4],
#     warmup='linear',
#     warmup_iters=1,
#     warmup_ratio=0.001,
#     warmup_by_epoch=True)

total_epochs = 6
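Note: under MMCV's 'step' policy, the learning rate is multiplied by the decay factor (0.1 by default) at every listed milestone epoch. A minimal sketch of the per-epoch rates this schedule produces; the loop below is illustrative and not part of the config:

# Effective learning rate per epoch for lr_config = dict(policy='step', step=[3, 4]),
# assuming MMCV's default step-decay factor gamma = 0.1.
base_lr = 1e-4
for epoch in range(6):
    decays = sum(epoch >= milestone for milestone in [3, 4])
    print(f'epoch {epoch}: lr = {base_lr * 0.1 ** decays:.0e}')
# epochs 0-2: 1e-04, epoch 3: 1e-05, epochs 4-5: 1e-06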
--------------------------------------------------------------------------------
/text_recog_configs/schedule_adam_step_6e_custom.py:
--------------------------------------------------------------------------------
# should be placed under mmocr/configs/_base_/schedules/schedule_adam_step_6e_custom.py

# optimizer
optimizer = dict(type='Adam', lr=1e-4)
optimizer_config = dict(grad_clip=dict(max_norm=0.5))
# learning policy: step-decay the learning rate at the start of epochs 3 and 4
lr_config = dict(policy='step', step=[3, 4])
# alternative: the same step schedule with a linear warm-up phase
# lr_config = dict(
#     policy='step',
#     step=[3, 4],
#     warmup='linear',
#     warmup_iters=1,
#     warmup_ratio=0.001,
#     warmup_by_epoch=True)

total_epochs = 6

--------------------------------------------------------------------------------
/text_det_configs/comics_speech_bubble_dataset.py:
--------------------------------------------------------------------------------
# should be placed under mmocr/configs/_base_/det_datasets/comics_speech_bubble_dataset.py

root = 'tests/data/comics_speech_bubble_dataset'

# COCO-style detection splits with type='IcdarDataset'
train = dict(
    type='IcdarDataset',
    ann_file=f'{root}/train/instances_train.json',
    img_prefix=f'{root}/train/imgs',
    pipeline=None)

val = dict(
    type='IcdarDataset',
    ann_file=f'{root}/val/instances_test.json',
    img_prefix=f'{root}/val/imgs',
    pipeline=None,
    test_mode=True)

test = dict(
    type='IcdarDataset',
    ann_file=f'{root}/test/instances_test.json',
    img_prefix=f'{root}/test/imgs',
    pipeline=None,
    test_mode=True)

train_list = [train]

test_list = [test]

val_list = [val]

--------------------------------------------------------------------------------
/text_recog_configs/master_custom_dataset.py:
--------------------------------------------------------------------------------
# should be placed under mmocr/configs/textrecog/master/master_custom_dataset.py

_base_ = [
    '../../_base_/runtime_10e.py',
    '../../_base_/schedules/schedule_adam_step_6e_custom.py',
    '../../_base_/recog_datasets/comic_speech_bubble_dataset.py',
    '../../_base_/recog_models/master.py',
    '../../_base_/recog_pipelines/master_pipeline.py',
]

train_list = {{_base_.train_list}}
test_list = {{_base_.test_list}}
val_list = {{_base_.val_list}}

train_pipeline = {{_base_.train_pipeline}}
test_pipeline = {{_base_.test_pipeline}}

load_from = '/scratch/users/gsoykan20/projects/mmocr/work_dirs/base_models/master_r31_12e_ST_MJ_SA-787edd36.pth'

data = dict(
    samples_per_gpu=128,
    workers_per_gpu=5,
    train=dict(
        type='UniformConcatDataset',
        datasets=train_list,
        pipeline=train_pipeline),
    val=dict(
        type='UniformConcatDataset',
        datasets=val_list,
        pipeline=test_pipeline),
    test=dict(
        type='UniformConcatDataset',
        datasets=test_list,
        pipeline=test_pipeline))

# save the checkpoint with the best 1-N.E.D score on the validation set
evaluation = dict(
    interval=1,
    metric='acc',
    save_best='0_1-N.E.D',  # alternative: '0_char_precision'
    rule='greater')
checkpoint_config = dict(interval=100)
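Note: the double-brace entries such as {{_base_.train_list}} are MMCV base-config variable references that get substituted when the config is loaded. A minimal sketch for checking that the merged config resolves as expected, assuming an MMOCR 0.x checkout with the files placed as noted in their headers:

# Sketch: load the merged config and confirm the base variables resolved
# (run from the mmocr repository root).
from mmcv import Config

cfg = Config.fromfile('configs/textrecog/master/master_custom_dataset.py')
print(cfg.data.train.type)              # 'UniformConcatDataset'
print(cfg.data.train.datasets[0].type)  # 'OCRDataset'
print(cfg.total_epochs)                 # 6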
--------------------------------------------------------------------------------
/text_det_configs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py:
--------------------------------------------------------------------------------
# should be placed under mmocr/configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py
_base_ = [
    '../../_base_/default_runtime.py',
    '../../_base_/schedules/schedule_adam_step_6e_custom.py',
    '../../_base_/det_models/fcenet_r50dcnv2_fpn.py',
    '../../_base_/det_datasets/comics_speech_bubble_dataset.py',
    '../../_base_/det_pipelines/fcenet_pipeline.py'
]

train_list = {{_base_.train_list}}
val_list = {{_base_.val_list}}
test_list = {{_base_.test_list}}

train_pipeline_ctw1500 = {{_base_.train_pipeline_ctw1500}}
test_pipeline_ctw1500 = {{_base_.test_pipeline_ctw1500}}

load_from = '/scratch/users/gsoykan20/projects/mmocr/work_dirs/base_models/fcenet_r50dcnv2_fpn_1500e_ctw1500_20211022-e326d7ec.pth'

data = dict(
    samples_per_gpu=24,
    workers_per_gpu=5,
    val_dataloader=dict(samples_per_gpu=1),
    test_dataloader=dict(samples_per_gpu=1),
    train=dict(
        type='UniformConcatDataset',
        datasets=train_list,
        pipeline=train_pipeline_ctw1500),
    val=dict(
        type='UniformConcatDataset',
        datasets=val_list,
        pipeline=test_pipeline_ctw1500),
    test=dict(
        type='UniformConcatDataset',
        datasets=test_list,
        pipeline=test_pipeline_ctw1500))

# save the checkpoint with the best hmean-iou score on the validation set
evaluation = dict(
    interval=1,
    metric='hmean-iou',
    save_best='0_hmean-iou:hmean',
    rule='greater')
checkpoint_config = dict(interval=100)  # for saving regardless of the metric

--------------------------------------------------------------------------------
/text_recog_configs/comic_speech_bubble_dataset.py:
--------------------------------------------------------------------------------
# should be placed under mmocr/configs/_base_/recog_datasets/comic_speech_bubble_dataset.py

dataset_type = 'OCRDataset'

root = 'tests/data/ocr_comics_speech_bubble_dataset'

train_img_prefix = f'{root}/train/imgs'
train_anno_file1 = f'{root}/train/label.txt'

test_img_prefix = f'{root}/test/imgs'
test_anno_file1 = f'{root}/test/label.txt'

val_img_prefix = f'{root}/val/imgs'
val_anno_file1 = f'{root}/val/label.txt'

train = dict(
    type=dataset_type,
    img_prefix=train_img_prefix,
    ann_file=train_anno_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=None,
    test_mode=False)

val = dict(
    type=dataset_type,
    img_prefix=val_img_prefix,
    ann_file=val_anno_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=None,
    test_mode=True)

test = dict(
    type=dataset_type,
    img_prefix=test_img_prefix,
    ann_file=test_anno_file1,
    loader=dict(
        type='HardDiskLoader',
        repeat=1,
        parser=dict(
            type='LineStrParser',
            keys=['filename', 'text'],
            keys_idx=[0, 1],
            separator=' ')),
    pipeline=None,
    test_mode=True)

train_list = [train]

test_list = [test]

val_list = [val]
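Note: each label.txt above is read line-by-line by HardDiskLoader and split on a single space by LineStrParser, pairing an image filename with its transcription. An illustrative sketch of the expected layout (the filenames and texts below are made up):

# hypothetical contents of tests/data/ocr_comics_speech_bubble_dataset/train/label.txt
crop_000001.jpg HELLO!
crop_000002.jpg WAIT

With separator=' ' and keys_idx=[0, 1], only the first space-delimited token after the filename is kept, so transcriptions are presumably stored without internal spaces.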
--------------------------------------------------------------------------------
/text_extractor.py:
--------------------------------------------------------------------------------
import gc
import os
from pathlib import Path
from typing import Optional

import torch
from mmocr.utils import stitch_boxes_into_lines
from mmocr.utils.ocr import MMOCR


class TextExtractor:
    def __init__(self,
                 model_config_dir: str = os.path.join(str(Path.cwd()), 'configs/'),
                 batch_mode: bool = True,
                 det: str = 'FCE_CTW_DCNv2',
                 recog: str = 'MASTER',
                 det_config: Optional[str] = None,
                 det_ckpt: Optional[str] = None,
                 recog_config: Optional[str] = None,
                 recog_ckpt: Optional[str] = None):
        """
        :param model_config_dir: path to MMOCR's 'configs' directory, required to construct the MMOCR object.
        """
        self.batch_mode = batch_mode
        self.ocr = MMOCR(det=det,
                         recog=recog,
                         config_dir=model_config_dir,
                         batch_mode=batch_mode,
                         det_config=det_config,
                         det_ckpt=det_ckpt,
                         recog_ckpt=recog_ckpt,
                         recog_config=recog_config)

    @torch.no_grad()
    def extract_text(self,
                     image):
        """
        Extract text from the given image.
        :param image: image path
        :return: stitched, lower-cased OCR text
        """
        torch.cuda.empty_cache()
        gc.collect()
        results = self.ocr.readtext(image,
                                    print_result=False,
                                    imshow=False,
                                    details=True,
                                    merge=False,
                                    batch_mode=self.batch_mode)
        gc.collect()
        torch.cuda.empty_cache()
        stitched_text = TextExtractor.sort_ocr_results_and_extract_text(results).lower()
        return stitched_text

    @staticmethod
    def sort_ocr_results_and_extract_text(results,
                                          box_stitch_max_x_dist=50,
                                          min_y_overlap_ratio=0.8,
                                          text_score_threshold=0.1):
        # drop low-confidence recognition results
        boxes = results[0]['result']
        boxes = list(filter(lambda box: box['text_score'] > text_score_threshold, boxes))

        def get_centroid_from_box(box):
            # 'box' is an 8-value quadrilateral [x1, y1, x2, y2, x3, y3, x4, y4];
            # approximate the centroid from opposite corner coordinates
            x1 = box[0]
            y1 = box[1]
            x2 = box[2]
            y2 = box[5]
            return (x1 + x2) / 2, (y1 + y2) / 2

        # merge horizontally adjacent boxes into text lines, then order the
        # lines top-to-bottom by their vertical centroid before joining
        boxes_in_lines = stitch_boxes_into_lines(boxes,
                                                 max_x_dist=box_stitch_max_x_dist,
                                                 min_y_overlap_ratio=min_y_overlap_ratio)
        sorted_results = sorted(boxes_in_lines, key=lambda x: get_centroid_from_box(x['box'])[1])
        stitched_text = " ".join(list(map(lambda box_line: box_line['text'], sorted_results)))
        return stitched_text
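Note: stitch_boxes_into_lines consumes the per-box dictionaries produced by MMOCR's end-to-end readtext call. A small sketch of that input format and of what the stitching does (coordinates, texts, and scores below are illustrative):

# Illustrative per-box dicts, in the shape consumed by
# sort_ocr_results_and_extract_text above (values are made up):
boxes = [
    {'box': [10, 5, 60, 5, 60, 25, 10, 25], 'text': 'hello', 'text_score': 0.95},
    {'box': [70, 6, 120, 6, 120, 26, 70, 26], 'text': 'there', 'text_score': 0.92},
]
# With max_x_dist=50 and min_y_overlap_ratio=0.8, the two boxes overlap enough
# vertically and sit close enough horizontally to be stitched into a single
# line whose 'text' is 'hello there'.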
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## You can easily run the models with the [comics-ocr](https://github.com/gsoykan/comics_ocr) package!

# A Comprehensive Gold Standard and Benchmark for Comics Text Detection and Recognition

The purpose of this work is to enable research on comics by improving the text quality of the largest comics dataset, shared in [COMICS](https://arxiv.org/abs/1611.05118). In the process of generating high-quality text data, text detection and recognition models were trained and selected to create an end-to-end SOTA OCR pipeline for comics. The models are trained with custom-labeled data that we also share for the text detection and recognition tasks.

![Methodology Overview](imgs/methodology.png)

## Description

This repository includes pointers to the code and data described in [A Comprehensive Gold Standard and Benchmark for Comics Text Detection and Recognition](http://arxiv.org/abs/2212.14674).

## Getting Started

- **'COMICS TEXT+' OCR data** can be accessed [here](https://drive.google.com/drive/folders/1Vq8RJQITh9NzjR0jpwvOP2cqbQoyq7qS?usp=sharing). The main version includes both raw text data and post-processed text in two columns, whereas the simplified version includes a single column of post-processed text. Check [Dependencies](#dependencies) to find out how you can get the panel images for the textboxes.

- **'COMICS TEXT+' Text Detection Dataset** can be accessed [here](https://drive.google.com/drive/folders/1a--6MNhPSqEZN3bMhlooLJ1ZIcj5CCBg?usp=sharing). Check [Execution Information](#execution-information) to train your models with it.

- **'COMICS TEXT+' Text Recognition Dataset** can be accessed [here](https://drive.google.com/drive/folders/1BRQS6UJPzo9zaUAJiLDUr2xCAIc7-2rx?usp=sharing). Check [Execution Information](#execution-information) to train your models with it.

- **Finetuned text detection model, FCENet**, can be accessed [here](https://drive.google.com/drive/folders/1fZTbT-VsWJ9KiuvVYzV5q5qggoeoYzxr?usp=sharing). It is fine-tuned on the 'COMICS Text+: Text Detection' dataset and is our most performant detection model.

- **Finetuned text recognition model, MASTER**, can be accessed [here](https://drive.google.com/drive/folders/1zczO8XOpNBkepDreTPaqaj31pk_wHRCj?usp=sharing). It is fine-tuned on the 'COMICS Text+: Text Recognition' dataset and is our most performant recognition model.

- **Ground truth data for evaluation**: the transcriptions of 500 randomly selected textboxes were prepared. The ground truth is used for evaluation and for the comparison between COMICS and COMICS TEXT+. It can be accessed [here](https://drive.google.com/drive/folders/1MjlXQF9GhNS3ZgSFqZw2j_X3uUjDpSKb?usp=sharing).

### Dependencies

- **[MMOCR](https://github.com/open-mmlab/mmocr)**: Version 0.6.0 is used for this work; check the original repository for instructions on setting up the toolkit. MMOCR's models and evaluation kits are used throughout this work.
- **[labelme](https://github.com/wkentaro/labelme)**: We modified 'labelme' to support annotation for text detection and text recognition by enabling it to fetch predictions from our detection and recognition models. This makes the annotation process faster, since all you need to do is adjust the predicted annotations. **The modified version of 'labelme' can be found [here](https://github.com/gsoykan/labelme).**
- **[The Amazing Mysteries of the Gutter: Drawing Inferences Between Panels in Comic Book Narratives](https://github.com/miyyer/comics)** is the paper in which the COMICS dataset was introduced. If you want to access the panel images, download the 'extracted panel images' from [here](https://obj.umiacs.umd.edu/comics/index.html).

### Execution Information

- **Text detection training & testing:** Use the configs shared in ./text_det_configs and place them under their indicated locations. Do not forget to download the data.

```
# in the appropriate environment with the MMOCR toolkit, run the commands below
# Training
python tools/train.py {config_path e.g. fcenet_r50dcnv2_fpn_1500e_ctw1500_custom} --load-from {pretrained_model_path}
# Testing
python tools/test.py {config_path} {fine_tuned_model_path} --eval hmean-iou
```

- **Text recognition training & testing:** Use the configs shared in ./text_recog_configs and place them under their indicated locations. Do not forget to download the data.

```
# in the appropriate environment with the MMOCR toolkit, run the commands below
# Training
python tools/train.py {config_path e.g. master_custom_dataset} --load-from {pretrained_model_path}
# Testing
python tools/test.py {config_path} {fine_tuned_model_path} --eval acc
```
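For instance, with the configs placed as above and the fine-tuned checkpoints downloaded, concrete invocations might look like the following (the work_dirs paths are illustrative; they follow the checkpoint names used elsewhere in this repository):

```
# illustrative paths; adjust them to your own checkout and downloads
# Detection
python tools/train.py configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py --load-from work_dirs/base_models/fcenet_r50dcnv2_fpn_1500e_ctw1500_20211022-e326d7ec.pth
python tools/test.py configs/textdet/fcenet/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/best_0_hmean-iou:hmean_epoch_5.pth --eval hmean-iou
# Recognition
python tools/train.py configs/textrecog/master/master_custom_dataset.py --load-from work_dirs/base_models/master_r31_12e_ST_MJ_SA-787edd36.pth
python tools/test.py configs/textrecog/master/master_custom_dataset.py work_dirs/master_custom_dataset/best_0_1-N.E.D_epoch_4.pth --eval acc
```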
- **Using end-to-end models:** text_extractor.py can be used to extract text from speech bubble or narrative box images.

```python
from text_extractor import TextExtractor

ocr_detector_config = './mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom.py'
ocr_detector_checkpoint = './mmocr/work_dirs/fcenet_r50dcnv2_fpn_1500e_ctw1500_custom/best_0_hmean-iou:hmean_epoch_5.pth'
recog_config = './mmocr/work_dirs/master_custom_dataset/master_custom_dataset.py'
ocr_recognition_checkpoint = './mmocr/work_dirs/master_custom_dataset/best_0_1-N.E.D_epoch_4.pth'
det = 'FCE_CTW_DCNv2'
recog = 'MASTER'

text_extractor = TextExtractor(batch_mode=True,
                               det=det,
                               det_ckpt=ocr_detector_checkpoint,
                               det_config=ocr_detector_config,
                               recog=recog,
                               recog_ckpt=ocr_recognition_checkpoint,
                               recog_config=recog_config)
textbox_img_path = './imgs/sample_textbox.jpg'
ocr_text = text_extractor.extract_text(textbox_img_path)
print(ocr_text)
```

### Results

![Text Detection Benchmarking Results](imgs/detection_benchmark.png)

![Text Recognition Benchmarking Results](imgs/recognition_benchmark.png)

![e2e Benchmarking Results](imgs/e2e_results.png)

![COMICS vs COMICS TEXT+ Comparison](imgs/ctp_comparison.png)

We replicated the model presented in [The Amazing Mysteries of the Gutter: Drawing Inferences Between Panels in Comic Book Narratives](https://github.com/miyyer/comics) to see whether the improvement in text quality would affect the results on cloze-style tasks. With **COMICS Text+**, we achieve SOTA results and see improvements over our replication results in almost all of the cases that rely heavily on text.

![Replication results of Cloze Tasks](imgs/replication_results.png)

![Results of cloze tasks with COMICS Text+](imgs/ctp_model_results.png)

## Authors

Gürkan Soykan
[twitter](https://twitter.com/grknsoykan)
[LinkedIn](https://www.linkedin.com/in/gurkan-soykan/)

## License

This project is licensed under the [NAME HERE] License - see the LICENSE.md file for details

## Acknowledgments

- [COMICS](https://github.com/miyyer/comics)
- [labelme](https://github.com/wkentaro/labelme)
- [MMOCR](https://github.com/open-mmlab/mmocr)

--------------------------------------------------------------------------------