├── .github
    └── example.png
├── .gitignore
├── .gitmodules
├── README.md
├── data
    └── example_1_ocr_scan.jpg
├── docker-compose.yml
├── labeled
    └── .gitkeep
├── labeling
    ├── Dockerfile
    ├── configs
    │   ├── horizontal-layout.xml
    │   ├── labeling-config.xml
    │   └── vertical-layout.xml
    └── requirements.txt
└── modeling
    ├── Dockerfile
    ├── requirements.txt
    └── tools
        └── model.py


/.github/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Layout-Parser/annotation-service/50dbde27bb916b51ec1cd40bbe72fc9397042b15/.github/example.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # folder
  2 | # data
  3 | # data/
  4 | # data*/
  5 | generated/
  6 | generated*/
  7 | credential
  8 | credential/
  9 | model
 10 | model/
 11 | result
 12 | result*/
 13 | 
 14 | # Mac Finder Configurations
 15 | .DS_Store
 16 | 
 17 | # IDEA configurations
 18 | .idea/
 19 | 
 20 | # IPython checkpoints
 21 | .ipynb_checkpoints/
 22 | log
 23 | 
 24 | # Visual Studio Code
 25 | .vscode/
 26 | 
 27 | # Byte-compiled / optimized / DLL files
 28 | __pycache__/
 29 | *.py[cod]
 30 | *$py.class
 31 | 
 32 | # C extensions
 33 | *.so
 34 | 
 35 | # Distribution / packaging
 36 | .Python
 37 | build/
 38 | develop-eggs/
 39 | dist/
 40 | downloads/
 41 | eggs/
 42 | .eggs/
 43 | lib64/
 44 | parts/
 45 | sdist/
 46 | var/
 47 | wheels/
 48 | *.egg-info/
 49 | .installed.cfg
 50 | *.egg
 51 | MANIFEST
 52 | 
 53 | # PyInstaller
 54 | #  Usually these files are written by a python script from a template
 55 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 56 | *.manifest
 57 | *.spec
 58 | 
 59 | # Installer logs
 60 | pip-log.txt
 61 | pip-delete-this-directory.txt
 62 | 
 63 | # Unit test / coverage reports
 64 | htmlcov/
 65 | .tox/
 66 | .coverage
 67 | .coverage.*
 68 | .cache
 69 | nosetests.xml
 70 | coverage.xml
 71 | *.cover
 72 | .hypothesis/
 73 | .pytest_cache/
 74 | 
 75 | # Translations
 76 | *.mo
 77 | *.pot
 78 | 
 79 | # Django stuff:
 80 | *.log
 81 | local_settings.py
 82 | db.sqlite3
 83 | 
 84 | # Flask stuff:
 85 | instance/
 86 | .webassets-cache
 87 | 
 88 | # Scrapy stuff:
 89 | .scrapy
 90 | 
 91 | # Sphinx documentation
 92 | docs/_build/
 93 | 
 94 | # PyBuilder
 95 | target/
 96 | 
 97 | # Jupyter Notebook
 98 | .ipynb_checkpoints
 99 | 
100 | # IPython
101 | profile_default/
102 | ipython_config.py
103 | 
104 | # pyenv
105 | .python-version
106 | 
107 | # celery beat schedule file
108 | celerybeat-schedule
109 | 
110 | # SageMath parsed files
111 | *.sage.py
112 | 
113 | # Environments
114 | .env
115 | .venv
116 | env/
117 | venv/
118 | ENV/
119 | env.bak/
120 | venv.bak/
121 | 
122 | # Spyder project settings
123 | .spyderproject
124 | .spyproject
125 | 
126 | # Rope project settings
127 | .ropeproject
128 | 
129 | # mkdocs documentation
130 | /site
131 | 
132 | # mypy
133 | .mypy_cache/
134 | .dmypy.json
135 | dmypy.json
136 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "src/label-studio"]
 2 | 	path = src/label-studio
 3 | 	url = git@github.com:Layout-Parser/label-studio.git
 4 | [submodule "src/label-studio-converter"]
 5 | 	path = src/label-studio-converter
 6 | 	url = git@github.com:dell-research-harvard/label-studio-converter.git
 7 | [submodule "src/Detectron2_AL"]
 8 | 	path = src/Detectron2_AL
 9 | 	url = git@github.com:lolipopshock/Detectron2_AL.git
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Layout Parser Annotation Service
 2 | 
 3 | ![Illustration of the Annotation Service](./.github/example.png)
 4 | 
 5 | ## Usage 
 6 | 
 7 | We package the whole layout annotation service (the annotation interface and the active-learning modeling server) in Docker containers, so installation is straightforward: 
 8 | 
 9 | 1. Install Docker on your computer, following the [official instructions](https://www.docker.com/get-started).
10 | 2. Clone this repository to your computer. 
11 |     ```bash
12 |     git clone git@github.com:Layout-Parser/annotation-service.git
13 |     cd annotation-service
14 |     ```
15 | 3. Configure the annotation folders (see details in the section below) and start the docker container
16 |     ```bash
17 |     DATA=./data CONFIG=labeling-config.xml MODEL=model.py docker-compose up --build -d
18 |     ```
19 | 4. Go to [localhost:8080](http://localhost:8080) and start annotating. 
20 | 5. Export the completed annotations via Label-Studio's [export function](http://localhost:8080/export), or find them directly in the [`labeled`](./labeled) folder. 
21 | 
22 | ## Configuration
23 | 
24 | In step 3, the environment variables `DATA`, `CONFIG`, and `MODEL` set the labeling data directory, the Label Studio configuration file, and the ML backend model file, respectively. 
25 | 
26 | - `DATA` is for the folder containing all the images for labeling. By default, `DATA=./data`. 
27 | - `CONFIG` is the configuration file for initializing the label-studio interface. The default value is `CONFIG=horizontal-layout.xml`, and you could find more examples in [`labeling/configs`](./labeling/configs).
28 | - `MODEL` is the script, located in [`modeling/tools`](./modeling/tools), that generates the model predictions. The default value is `MODEL=model.py`.
29 | 
30 | ## TODO 
31 | 
32 | - [ ] Enable the Active Learning Detectron2 model backend.


--------------------------------------------------------------------------------
/data/example_1_ocr_scan.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Layout-Parser/annotation-service/50dbde27bb916b51ec1cd40bbe72fc9397042b15/data/example_1_ocr_scan.jpg


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3.2"
 2 | 
 3 | services:
 4 |   labeling:
 5 |     container_name: labeling_container
 6 |     build: 
 7 |       context: ./labeling
 8 |     ports: 
 9 |       - 8080:8080
10 |     depends_on:
11 |       - modeling
12 |     volumes: 
13 |       - ${DATA:-./data}:/data
14 |       - ./labeled:/labeled
15 |     command: > 
16 |       bash -c "
17 |       label-studio init /labeled
18 |       --input-path=/data 
19 |       --input-format=image-dir 
20 |       --allow-serving-local-files 
21 |       --force 
22 |       --label-config=configs/${CONFIG:-horizontal-layout.xml} 
23 |       --port 8080 
24 |       --ml-backends http://modeling_container:9090 &&
25 |       label-studio start /labeled
26 |       --port 8080 
27 |       --log-level DEBUG"
28 |     restart: always
29 |   modeling:
30 |     container_name: modeling_container
31 |     build: 
32 |       context: ./modeling
33 |     ports: 
34 |       - 9090:9090
35 |     command: >
36 |       bash -c "
37 |       label-studio-ml init modeling_backend 
38 |       --script tools/${MODEL:-model.py}
39 |       --force &&
40 |       label-studio-ml start modeling_backend 
41 |       --port 9090"
42 |     restart: always


--------------------------------------------------------------------------------
/labeled/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Layout-Parser/annotation-service/50dbde27bb916b51ec1cd40bbe72fc9397042b15/labeled/.gitkeep


--------------------------------------------------------------------------------
/labeling/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.8-slim
 2 | 
 3 | WORKDIR /annotation_service_labeling
 4 | 
 5 | RUN apt-get update && apt-get install -y build-essential
 6 | RUN apt-get install ffmpeg libsm6 libxext6 -y
 7 | RUN apt-get install -y git
 8 | 
 9 | COPY requirements.txt /annotation_service_labeling
10 | RUN pip install -r requirements.txt
11 | RUN pip install git+https://github.com/Layout-Parser/label-studio.git
12 | 
13 | COPY configs /annotation_service_labeling/configs
14 | 
15 | EXPOSE 8080


--------------------------------------------------------------------------------
/labeling/configs/horizontal-layout.xml:
--------------------------------------------------------------------------------
 1 |  <View>
 2 |   <Image name="image" value="$image"
 3 |       zoom="true" zoomControl="true" 
 4 |       negativeZoom="true"></Image>
 5 |   <RectangleLabels name="label" toName="image">
 6 | 		<Label value="title" background="green"></Label>
 7 |     <Label value="address" background="yellow"></Label>
 8 |     <Label value="text" background="red"></Label>
 9 |     <Label value="number" background="blue"></Label>
10 |   </RectangleLabels>
11 | </View>


--------------------------------------------------------------------------------
/labeling/configs/labeling-config.xml:
--------------------------------------------------------------------------------
 1 | <View style="display: flex;">
 2 | 
 3 |     <View style="width: 150px; padding: 0 1em; margin-right: 0.5em; background: #f1f1f1; border-radius: 3px">
 4 |     <RectangleLabels name="label" toName="image" canRotate="true">
 5 |         <Label value="newspaper_header" background="blue"></Label>
 6 |         <Label value="masthead" background="blue"></Label>
 7 |         <Label value="article" background="green"></Label>
 8 |         <Label value="headline" background="yellow"></Label>
 9 | 	    <Label value="author" background="cyan"></Label>        
10 |         <Label value="photograph" background="purple"></Label>
11 |         <Label value="cartoon_or_advertisement" background="red"></Label>      
12 |         <Label value="image_caption" background="pink"></Label>
13 |         <Label value="page_number" background="grey"></Label>
14 |         <Label value="table" background="grey"></Label>
15 |     </RectangleLabels>
16 |     </View>
17 | 
18 |     <Image name="image" value="$image"
19 |         zoom="true" zoomControl="true" 
20 |         negativeZoom="true" maxWidth="1440px">
21 |     </Image>
22 | </View>
23 | 


--------------------------------------------------------------------------------
/labeling/configs/vertical-layout.xml:
--------------------------------------------------------------------------------
 1 | <View>
 2 |   <View style="width: 150px; padding: 0 1em; margin-right: 0.5em; background: #f1f1f1; border-radius: 3px">
 3 |   <RectangleLabels name="label" toName="image">
 4 |     <Label value="header" background="blue"></Label>
 5 |     <Label value="text" background="green"></Label>
 6 |     <Label value="variable" background="red"></Label>
 7 |   </RectangleLabels>
 8 |   </View>
 9 | 
10 |   <View style="width: 500px">
11 |     <Image name="image" value="$image"
12 |         zoom="true" zoomControl="true" 
13 |         negativeZoom="true" rotateControl="true"
14 |         width="60%">
15 |     </Image>
16 |   </View>
17 | </View>


--------------------------------------------------------------------------------
/labeling/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | label-studio-converter==0.0.17


--------------------------------------------------------------------------------
/modeling/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.8-slim
 2 | 
 3 | WORKDIR /annotation_service_modeling
 4 | 
 5 | RUN apt-get update && apt-get install -y build-essential
 6 | RUN apt-get install ffmpeg libsm6 libxext6 -y
 7 | RUN apt-get install -y git
 8 | 
 9 | COPY requirements.txt /annotation_service_modeling
10 | RUN pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
11 | RUN pip install -r requirements.txt
12 | RUN pip install -U detectron2==0.1.1 -f \
13 |     https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.4/index.html
14 | RUN pip install git+https://github.com/Layout-Parser/label-studio.git
15 | 
16 | COPY tools /annotation_service_modeling/tools
17 | 
18 | EXPOSE 9090


--------------------------------------------------------------------------------
/modeling/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | pycocotools>=2.0.1
4 | layoutparser==0.2.0
5 | google-api-core==1.22.2
6 | google-cloud-core==1.4.1


--------------------------------------------------------------------------------
/modeling/tools/model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.optim as optim
  4 | import time
  5 | import os
  6 | import numpy as np
  7 | import requests
  8 | import io
  9 | import hashlib
 10 | import urllib
 11 | import cv2
 12 | 
 13 | from PIL import Image
 14 | from torch.utils.data import Dataset, DataLoader
 15 | from torchvision import models, transforms
 16 | 
 17 | from label_studio.ml import LabelStudioMLBase
 18 | from label_studio.ml.utils import get_single_tag_keys, get_choice, is_skipped
 19 | 
 20 | 
# Torch device preference: the first CUDA GPU when one is available, else CPU.
# NOTE(review): `device` is not referenced by any code visible in this file -
# confirm whether it is still needed.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


import layoutparser as lp

# Directory next to this script used to cache downloaded images; it is created
# at import time if it does not exist yet.
image_cache_dir = os.path.join(os.path.dirname(__file__), 'image-cache')
os.makedirs(image_cache_dir, exist_ok=True)
 28 | 
 29 | 
 30 | def load_image_from_url(url):
 31 |     # is_local_file = url.startswith('http://localhost:') and '/data/' in url
 32 |     is_local_file = True
 33 |     if is_local_file:
 34 |         filename, dir_path = url.split('/data/')[1].split('?d=')
 35 |         dir_path = str(urllib.parse.unquote_plus(dir_path))
 36 |         filepath = os.path.join(dir_path, filename)
 37 |         return cv2.imread(filepath)
 38 |     else:
 39 |         cached_file = os.path.join(image_cache_dir, hashlib.md5(url.encode()).hexdigest())
 40 |         if os.path.exists(cached_file):
 41 |             with open(cached_file, mode='rb') as f:
 42 |                 image = Image.open(f).convert('RGB')
 43 |         else:
 44 |             r = requests.get(url, stream=True)
 45 |             r.raise_for_status()
 46 |             with io.BytesIO(r.content) as f:
 47 |                 image = Image.open(f).convert('RGB')
 48 |             with io.open(cached_file, mode='wb') as fout:
 49 |                 fout.write(r.content)
 50 |         return image_transforms(image)
 51 | 
 52 | def convert_block_to_value(block, image_height, image_width):
 53 | 
 54 | 
 55 |     return  {
 56 |             "height": block.height / image_height*100,
 57 |             "rectanglelabels": [str(block.type)],
 58 |             "rotation": 0,
 59 |             "width":  block.width / image_width*100,
 60 |             "x":      block.coordinates[0] / image_width*100,
 61 |             "y":      block.coordinates[1] / image_height*100,
 62 |             "score":  block.score
 63 |         }
 64 | 
 65 | 
 66 | class ObjectDetectionAPI(LabelStudioMLBase):
 67 | 
 68 |     def __init__(self, freeze_extractor=False, **kwargs):
 69 | 
 70 |         super(ObjectDetectionAPI, self).__init__(**kwargs)
 71 | 
 72 |         # label_map_list = os.environ['LABEL_MAP'].split()
 73 |         # {int(label_map_list[i]): str(label_map_list[i+1]) for i in range(0, len(label_map_list), 2)}
 74 |         
 75 |         self.from_name, self.to_name, self.value, self.classes =\
 76 |             get_single_tag_keys(self.parsed_label_config, 'RectangleLabels', 'Image')
 77 |         self.freeze_extractor = freeze_extractor
 78 |     
 79 |         self.model = lp.Detectron2LayoutModel(
 80 |             config_path = 'https://www.dropbox.com/s/raubm858djy3u17/config.yaml?dl=1',
 81 |             model_path  = 'https://www.dropbox.com/s/bitxe8occzb865u/model_final.pth?dl=1',
 82 |             ### PLEASE REMEMBER TO CHANGE `dl=0` INTO `dl=1` IN THE END 
 83 |             ### OF DROPBOX LINKS 
 84 |             extra_config=["MODEL.ROI_HEADS.NMS_THRESH_TEST", 0.2,
 85 |                           "MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
 86 |             label_map={0: "headline", 1: "article", 2: "newspaper_header", 3: "masthead", 
 87 |                 4: "author", 5: "photograph", 6: "image_caption", 7: "page_number", 8: "table", 
 88 |                 9: "cartoon_or_advertisement"}
 89 |         )
 90 | 
 91 |     def reset_model(self):
 92 |         ## self.model = ImageClassifier(len(self.classes), self.freeze_extractor)
 93 |         pass
 94 | 
 95 |     def predict(self, tasks, **kwargs):
 96 | 
 97 |         image_urls = [task['data'][self.value] for task in tasks]
 98 |         images = [load_image_from_url(url) for url in image_urls]
 99 |         layouts = [self.model.detect(image) for image in images]  
100 | 
101 |         predictions = []
102 |         for image, layout in zip(images, layouts):
103 |             height, width = image.shape[:2]
104 | 
105 |             result = [
106 |                 {
107 |                 'from_name': self.from_name,
108 |                 'to_name': self.to_name,
109 |                 "original_height": height,
110 |                 "original_width": width,
111 |                 "source": "$image",
112 |                 'type': 'rectanglelabels',
113 |                 "value": convert_block_to_value(block, height, width)
114 |                 } for block in layout
115 |             ]
116 | 
117 |             predictions.append({'result': result})
118 | 
119 |         return predictions
120 | 
121 |     def fit(self, completions, workdir=None, 
122 |             batch_size=32, num_epochs=10, **kwargs):
123 |         image_urls, image_classes = [], []
124 |         print('Collecting completions...')
125 |         # for completion in completions:
126 |         #     if is_skipped(completion):
127 |         #         continue
128 |         #     image_urls.append(completion['data'][self.value])
129 |         #     image_classes.append(get_choice(completion))
130 | 
131 |         print('Creating dataset...')
132 |         # dataset = ImageClassifierDataset(image_urls, image_classes)
133 |         # dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
134 | 
135 |         print('Train model...')
136 |         # self.reset_model()
137 |         # self.model.train(dataloader, num_epochs=num_epochs)
138 | 
139 |         print('Save model...')
140 |         # model_path = os.path.join(workdir, 'model.pt')
141 |         # self.model.save(model_path)
142 | 
143 |         return {'model_path': None, 'classes': None}


--------------------------------------------------------------------------------