├── Dataset ├── test │ ├── .gitattributes │ └── test_coco.json ├── train │ ├── .gitattributes │ └── train_coco.json └── .gitattributes ├── requirements.txt ├── Docker └── Dockerfile ├── utils ├── tools.py ├── config_latex.py └── walker_download.py ├── Experiments ├── Dataset-Comparison │ ├── IBEM-TS10.csv │ ├── IBEM-TS11.csv │ ├── FormulaNet.csv │ └── Marmot.csv ├── Out-of-Sample │ ├── FormulaNet.csv │ └── IBEM.csv └── Labeling-Quality │ ├── IBEM.csv │ ├── FormulaNet.csv │ └── Marmot.csv ├── README.md ├── download.py ├── Baseline ├── FCOS-50.py └── FCOS-101.py └── LICENSE /Dataset/test/.gitattributes: -------------------------------------------------------------------------------- 1 | train_coco_all.json filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /Dataset/train/.gitattributes: -------------------------------------------------------------------------------- 1 | train_coco_all.json filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pathlib 2 | numpy 3 | tqdm 4 | arxiv 5 | pdf2image 6 | opencv-python 7 | regex -------------------------------------------------------------------------------- /Dataset/.gitattributes: -------------------------------------------------------------------------------- 1 | train/train_coco.json filter=lfs diff=lfs merge=lfs -text 2 | test/test_coco.json filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /Dataset/test/test_coco.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:94d7e62eb5da35fa64d972a0654ac02579ef460499fc8d0c6ad10dc1cc564273 3 | size 31194430 4 | -------------------------------------------------------------------------------- /Dataset/train/train_coco.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e8ef2f7b82e3544ecb6deb4a34d4c61bbb6424dfdd6f3462a08519e5b9c04fdd 3 | size 595773884 4 | -------------------------------------------------------------------------------- /Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG Platform="amd64" 2 | 3 | FROM scmxzhaw/texlive-full:$Platform 4 | 5 | # install dependencies 6 | RUN apt update && apt install -y python3.8 pip poppler-utils git 7 | 8 | # clone repository 9 | RUN git clone https://github.com/felix-schmitt/FormulaNet.git 10 | 11 | # prepare python env 12 | WORKDIR FormulaNet/ 13 | RUN pip install -r requirements.txt 14 | 15 | CMD ["python3", "./download.py"] -------------------------------------------------------------------------------- /utils/tools.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import json 3 | 4 | def resize_image(img_path, img_size, new_image): 5 | img = cv2.imread(str(img_path)) 6 | h, w, _ = img.shape 7 | if img_size: 8 | ratio_old = h / w 9 | ratio_new = img_size['h'] / img_size['w'] 10 | if ratio_old < ratio_new: 11 | h = int(ratio_old * img_size['w']) 12 | dh = img_size['h'] - h 13 | w = img_size['w'] 14 | dw = 0 15 | else: 16 | w = int(1 / ratio_old * img_size['h']) 17 | dw = img_size['w'] - w 18 | h = img_size['h'] 19 | dh = 0 20 | img = cv2.resize(img, (w, h), interpolation=cv2.INTER_AREA) 21 | img = cv2.copyMakeBorder(img, int(dh / 2), dh - int(dh / 2), int(dw / 2), dw - int(dw / 2), 22 | cv2.BORDER_CONSTANT, value=[255, 255, 255]) 23 | cv2.imwrite(new_image, img) 24 | 25 | def load_json(json_file): 26 | with open(json_file) as f: 27 | file = json.load(f) 28 | pages = [image['file_name'] for image in file['images']] 29 | f.close() 30 | return pages -------------------------------------------------------------------------------- /Experiments/Dataset-Comparison/IBEM-TS10.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,,Display,,,, 2 | page_id,CL,WP,NP,SGT,NGT,CL,WP,NP,SGT,NGT 3 | 3055-5,5,0,0,0,0,4,0,0,0,0 4 | 3055-18,19,0,0,0,0,7,0,0,0,0 5 | 9084-12,18,0,0,0,0,5,0,0,0,0 6 | 9153-2,13,0,0,0,0,0,0,0,0,0 7 | 9154-17,10,0,1,2,3,5,0,0,0,0 8 | 11042-3,0,4,0,3,0,0,0,0,0,0 9 | 11078-8,11,0,1,0,0,1,1,2,0,2 10 | 12032-1,5,0,8,0,8,0,0,0,0,0 11 | 12032-4,20,0,0,0,0,0,0,0,0,0 12 | 12032-10,8,0,0,0,0,4,0,0,0,0 13 | 12090-6,10,0,0,0,0,6,0,0,0,0 14 | 12090-7,7,0,0,0,0,7,0,0,0,0 15 | 12090-13,12,0,0,0,0,6,0,0,0,0 16 | 12145-6,32,5,1,3,0,6,0,0,0,0 17 | 12145-10,23,0,0,0,0,3,0,0,0,0 18 | 12145-15,33,0,0,0,0,3,0,0,0,0 19 | 12145-17,20,0,1,0,1,3,0,0,0,0 20 | 12145-28,48,2,1,0,0,1,0,0,0,0 21 | 12145-44,12,0,0,0,0,5,1,0,0,0 22 | 102083-5,28,3,0,0,0,4,0,0,0,0 23 | 102083-12,20,13,9,0,0,2,0,0,0,0 24 | 103018-4,24,0,1,0,0,5,3,1,2,1 25 | 103048-5,6,0,0,0,0,2,0,0,0,0 26 | 104191-4,9,0,0,0,0,8,0,0,0,0 27 | 104191-10,8,0,0,0,0,6,0,0,0,0 28 | 106198-8,2,0,0,0,0,0,0,0,0,0 29 | 106229-1,2,0,0,0,0,0,0,0,0,0 30 | 106229-13,38,3,2,0,0,0,0,1,0,0 31 | 10059-8,11,0,0,0,0,4,1,2,1,2 32 | 110101-3,13,0,1,0,0,1,2,1,2,1 33 | 111110-7,15,0,0,0,0,6,0,0,0,0 34 | 111196-13,0,0,0,0,0,0,0,0,0,0 35 | 201099-6,17,0,5,0,0,6,0,0,0,0 36 | 201099-18,19,1,0,0,0,6,0,1,0,0 37 | 201099-21,0,0,0,0,0,0,0,0,0,0 38 | 203047-5,12,2,1,0,0,3,0,0,0,0 39 | 203077-2,18,0,0,0,0,0,0,0,0,0 40 | 203077-8,19,0,0,0,0,2,0,0,0,0 41 | 204209-10,18,0,0,0,0,3,0,0,0,0 42 | 209229-32,11,3,0,0,0,5,0,0,0,0 43 | 210144-13,20,1,1,1,0,5,0,0,0,0 44 | 210144-15,11,0,0,0,0,2,0,0,0,0 45 | 210310-7,30,0,1,0,1,3,0,0,0,0 46 | 210310-18,75,3,4,3,3,1,0,0,0,0 47 | 210310-20,42,2,2,1,0,2,0,0,0,0 48 | 301041-1,12,3,5,3,2,8,4,1,2,0 49 | 301041-4,8,2,1,2,1,3,0,0,0,0 50 | 303137-5,5,0,0,0,0,0,0,0,0,0 51 | 303265-2,40,2,1,0,0,3,0,0,0,0 52 | 303265-3,23,1,1,0,0,3,0,0,0,0 53 | Total,862,50,48,18,19,159,12,9,7,6 -------------------------------------------------------------------------------- /Experiments/Dataset-Comparison/IBEM-TS11.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,,Display,,,, 2 | page id,CL,WP,NP,SGT,NGT,CL,WP,NP,SGT,NGT 3 | 5222-5,25,1,2,0,0,5,0,0,0,0 4 | 6023-13,23,1,0,1,0,1,0,0,0,0 5 | 11145-1,5,1,0,0,0,0,0,0,0,0 6 | 11145-5,3,0,0,0,0,5,5,2,4,2 7 | 11145-16,13,0,0,0,0,9,0,0,0,0 8 | 12005-3,25,0,0,0,0,4,0,0,0,0 9 | 12196-11,16,1,0,0,0,3,0,0,0,0 10 | 12269-3,18,0,0,0,0,4,2,1,2,1 11 | 12269-11,9,0,0,0,0,8,0,0,0,0 12 | 107101-6,7,0,0,0,0,8,7,2,6,2 13 | 107101-8,8,0,0,0,0,5,4,2,4,2 14 | 108094-24,7,0,0,0,0,0,0,0,0,0 15 | 109178-4,17,0,0,0,0,7,0,0,0,0 16 | 109178-6,17,0,2,0,0,5,2,1,2,1 17 | 109178-8,14,0,1,0,0,8,0,0,0,0 18 | 110097-9,4,0,0,0,0,4,0,0,0,0 19 | 110097-14,7,0,0,0,0,0,1,3,1,3 20 | 111093-9,2,0,0,0,0,0,0,0,0,0 21 | 111093-10,0,0,0,0,0,0,0,0,0,0 22 | 112211-3,26,0,0,0,0,2,0,0,0,0 23 | 112211-11,33,0,0,0,0,3,5,2,5,2 24 | 201049-5,7,1,2,0,0,3,0,1,0,0 25 | 202095-3,12,0,0,0,0,1,0,0,0,0 26 | 202095-4,15,0,1,0,0,6,0,0,0,0 27 | 202136-3,15,0,1,0,1,7,0,1,0,0 28 | 203054-3,17,0,2,0,2,4,3,1,3,1 29 | 203054-14,20,1,1,0,0,6,0,0,0,0 30 | 203064-3,6,2,0,2,0,0,0,0,0,0 31 | 203064-10,23,1,0,0,0,4,2,1,2,1 32 | 203064-19,4,0,0,0,0,10,3,1,3,1 33 | 203078-7,10,2,2,2,2,3,0,0,0,0 34 | 205216-4,16,0,0,0,0,2,3,1,3,1 35 | 206149-3,11,0,0,0,0,0,0,0,0,0 36 | 207074-17,23,0,0,0,0,4,0,0,0,0 37 | 207074-20,32,0,0,0,0,5,0,0,0,0 38 | 207074-27,17,0,0,0,0,4,0,0,0,0 39 | 207248-7,19,0,0,0,0,5,0,0,0,0 40 | 208180-2,0,0,0,0,0,2,0,0,0,0 41 | 208186-7,15,1,8,0,5,1,0,0,0,0 42 | 208186-17,0,0,0,0,0,0,0,0,0,0 43 | 208218-6,5,1,0,0,0,0,0,0,0,0 44 | 209061-25,0,0,0,0,0,3,0,0,0,0 45 | 20918-13,0,1,0,0,0,0,0,0,0,0 46 | 211049-2,16,1,0,1,0,1,0,0,0,0 47 | 212100-11,36,0,0,0,0,2,0,0,0,0 48 | 212287-2,20,0,11,0,10,2,0,0,0,0 49 | 212287-9,5,0,1,0,1,8,0,0,0,0 50 | 301004-8,3,0,0,0,0,6,1,2,1,2 51 | 302193-8,42,0,0,0,0,4,0,0,0,0 52 | 304006-9,0,0,0,0,0,0,0,0,0,0 53 | ,668,15,34,6,21,174,38,21,36,19 -------------------------------------------------------------------------------- /Experiments/Dataset-Comparison/FormulaNet.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,,Display,,,, 2 | page id,CL,WP,NP,SGT,NGT,CL,WP,NP,SGT,NGT 3 | 2007-24,18,0,0,0,0,6,0,0,0,0 4 | 2111-13,34,0,0,0,0,3,0,0,0,0 5 | 4066-41,22,2,0,1,0,4,0,0,0,0 6 | 6120-11,11,0,0,0,0,4,0,0,0,0 7 | 6178-28,22,2,2,0,0,4,0,0,0,0 8 | 7181-9,15,0,0,0,0,2,0,0,0,0 9 | 8071-9,19,14,6,10,5,2,2,0,0,0 10 | 8075-9,7,0,0,0,0,7,0,0,0,0 11 | 8239-29,16,0,3,0,1,4,0,0,0,0 12 | 9153-4,21,0,0,0,0,6,0,0,0,0 13 | 10293-7,46,0,3,0,0,4,0,0,0,0 14 | 10293-46,19,1,0,0,0,5,0,0,0,0 15 | 11032-44,9,1,0,0,0,5,0,0,0,0 16 | 11155-0,3,0,0,0,0,0,0,0,0,0 17 | 12032-12,24,0,0,0,0,3,0,0,0,0 18 | 12143-20,15,0,0,0,0,1,0,0,0,0 19 | 201066-6,11,0,0,0,0,4,0,0,0,0 20 | 201168-7,24,3,1,0,0,0,0,0,0,0 21 | 202008-8,41,1,3,0,0,7,0,0,0,0 22 | 202078-13,1,1,0,0,0,3,0,0,0,0 23 | 202113-11,6,0,0,0,0,0,0,0,0,0 24 | 203139-11,4,0,0,0,0,6,0,0,0,0 25 | 204094-2,36,2,2,0,0,2,0,0,0,0 26 | 205054-10,1,1,0,0,0,5,6,2,0,0 27 | 205113-6,39,3,5,0,0,2,0,0,0,0 28 | 205161-8,14,0,0,0,0,2,0,0,0,0 29 | 205190-12,9,1,0,0,0,5,0,0,0,0 30 | 205288-13,14,0,1,0,0,4,1,0,0,0 31 | 206148-5,15,0,0,0,0,2,0,0,0,0 32 | 207186-7,11,0,0,0,0,7,0,0,0,0 33 | 207249-48,6,0,1,0,0,4,0,0,0,0 34 | 208016-22,8,0,0,0,0,5,0,0,0,0 35 | 208016-23,5,1,0,0,0,5,0,0,0,0 36 | 208053-2,18,0,2,0,2,4,0,0,0,0 37 | 209122-18,27,0,1,0,0,2,0,0,0,0 38 | 209235-7,17,0,2,0,0,5,0,0,0,0 39 | 210005-3,22,0,0,0,0,6,0,0,0,0 40 | 210398-30,7,0,0,0,0,8,0,0,0,0 41 | 212149-21,37,1,2,0,1,3,0,0,0,0 42 | 212156-4,24,0,1,0,0,2,0,0,0,0 43 | 212163-6,14,0,0,0,0,8,0,0,0,0 44 | 212314-55,19,5,0,5,0,0,0,0,0,0 45 | 212564-8,24,1,2,0,0,5,0,0,0,0 46 | 301106-10,13,0,0,0,0,2,0,0,0,0 47 | 301193-3,23,0,1,0,0,4,0,0,0,0 48 | 303058-7,16,4,1,0,0,4,0,0,0,0 49 | 304150-25,2,2,0,0,0,4,0,0,0,0 50 | 305024-13,15,1,1,0,0,6,0,0,0,0 51 | 305227-48,10,3,0,2,0,0,0,0,0,0 52 | 306045-9,23,1,6,0,2,5,0,0,0,0 53 | Total,857,51,46,18,11,191,9,2,0,0 -------------------------------------------------------------------------------- /Experiments/Out-of-Sample/FormulaNet.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,,Display,,,, 2 | page_id,detected,correct,WD,WL,ML,detected,correct,WD,WL,ML 3 | 2202113-1,29,22,5,2,2,0,0,0,0,0 4 | 2108119-23,18,18,0,0,3,6,4,2,0,0 5 | 2108124-25,19,16,1,2,4,3,2,1,0,0 6 | 2108138-7,15,14,0,1,0,0,0,0,0,0 7 | 2108139-46,41,36,5,0,1,6,6,0,0,0 8 | 2109013-10,19,0,0,19,0,0,0,0,0,0 9 | 2109015-11,3,3,0,0,4,0,0,0,0,0 10 | 2109046-2,50,37,8,5,8,5,3,2,0,1 11 | 2109075-19,45,42,3,0,0,2,2,0,0,0 12 | 2109076-1,0,0,0,0,0,0,0,0,0,0 13 | 2109083-20,16,16,0,0,1,6,4,2,0,0 14 | 2109102-17,32,25,6,1,5,0,0,0,0,0 15 | 2109107-5,8,8,0,0,1,0,0,0,0,0 16 | 2109109-3,46,35,11,0,3,1,1,0,0,0 17 | 2109114-45,1,1,0,0,10,3,3,0,0,0 18 | 2109121-25,51,43,3,5,9,0,0,0,0,0 19 | 2109128-41,0,0,0,0,0,0,0,0,0,0 20 | 2109131-4,2,2,0,0,1,0,0,0,0,0 21 | 2109145-14,0,0,0,0,0,0,0,0,0,0 22 | 2110013-2,35,32,2,1,3,3,3,0,0,0 23 | 2110020-9,13,7,5,0,23,0,0,0,0,0 24 | 2110031-6,0,0,0,0,0,0,0,0,0,0 25 | 2110037-32,3,3,0,0,0,0,0,0,0,0 26 | 2110042-0,0,0,0,0,0,0,0,0,0,0 27 | 2110055-31,18,11,0,7,0,4,3,1,0,0 28 | 2110057-21,49,45,3,1,2,0,0,0,0,0 29 | 2110086-6,32,28,3,1,3,2,2,0,0,0 30 | 2110100-9,19,16,3,0,1,6,5,1,0,1 31 | 2110140-27,46,45,1,0,0,2,1,1,0,0 32 | 2110156-5,0,0,0,0,0,0,0,0,0,0 33 | 2110156-75,26,26,0,0,0,3,2,1,0,0 34 | 2110156-163,19,19,0,0,0,2,2,0,0,0 35 | 2111015-7,20,20,0,0,0,3,3,0,0,0 36 | 2111037-2,1,0,0,1,0,0,0,0,0,0 37 | 2111122-1,27,16,8,3,9,0,0,0,0,1 38 | 2112122-3,16,14,2,0,1,6,5,1,0,0 39 | 2112131-15,27,25,0,2,1,7,6,1,0,0 40 | 2112134-83,19,15,3,1,5,6,6,0,0,0 41 | 2112143-4,5,4,0,0,0,1,1,0,0,0 42 | 2201004-18,36,30,2,4,3,1,1,0,0,0 43 | 2201004-49,99,91,7,1,5,0,0,0,0,0 44 | 2201041-7,4,4,0,0,20,2,2,0,0,0 45 | 2201041-9,0,0,0,0,22,3,3,0,0,0 46 | 2201053-3,1,0,0,1,0,1,0,0,1,0 47 | 2201072-4,25,22,3,0,10,5,5,0,0,0 48 | 2201109-43,16,15,1,0,11,3,2,1,0,0 49 | 2201115-2,16,14,2,0,2,0,0,0,0,0 50 | 2201123-37,17,17,0,0,0,2,2,0,0,0 51 | 2201126-5,33,26,4,3,1,2,2,0,0,1 52 | 2202058-1,18,14,4,0,0,2,2,0,0,0 53 | Total,1035,877,95,61,174,98,83,14,1,4 -------------------------------------------------------------------------------- /Experiments/Out-of-Sample/IBEM.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,,Display,,,, 2 | page_id,detected,correct,WD,WL,ML,detected,correct,WD,WL,ML 3 | 2202113-1,24,19,3,2,7,0,0,0,0,0 4 | 2108119-23,18,18,0,0,3,7,4,2,1,0 5 | 2108124-25,19,11,2,6,8,2,1,1,0,1 6 | 2108138-7,15,14,0,1,0,0,0,0,0,0 7 | 2108139-46,38,25,13,0,4,6,6,0,0,0 8 | 2109013-10,0,0,0,0,0,0,0,0,0,0 9 | 2109015-11,17,7,0,10,0,0,0,0,0,0 10 | 2109046-2,55,30,17,8,6,5,2,3,0,1 11 | 2109075-19,43,32,11,0,2,2,2,0,0,0 12 | 2109076-1,0,0,0,0,0,0,0,0,0,0 13 | 2109083-20,13,12,1,0,4,6,4,2,0,0 14 | 2109102-17,35,23,8,4,5,0,0,0,0,0 15 | 2109107-5,8,7,1,0,1,0,0,0,0,0 16 | 2109109-3,46,29,13,4,5,1,1,0,0,0 17 | 2109114-45,11,10,1,0,0,3,3,0,0,0 18 | 2109121-25,45,31,12,2,14,0,0,0,0,0 19 | 2109128-41,0,0,0,0,0,0,0,0,0,0 20 | 2109131-4,2,2,0,0,1,0,0,0,0,0 21 | 2109145-14,0,0,0,0,0,0,0,0,0,0 22 | 2110013-2,36,25,6,5,6,4,3,0,1,0 23 | 2110020-9,29,2,8,19,25,0,0,0,0,0 24 | 2110031-6,0,0,0,0,0,0,0,0,0,0 25 | 2110037-32,4,3,0,1,0,0,0,0,0,0 26 | 2110042-0,0,0,0,0,0,0,0,0,0,0 27 | 2110055-31,18,11,0,7,0,4,3,1,0,0 28 | 2110057-21,49,40,9,0,1,0,0,0,0,0 29 | 2110086-6,33,27,6,0,1,2,2,0,0,0 30 | 2110100-9,19,15,3,1,2,6,6,0,0,1 31 | 2110140-27,44,42,2,0,2,2,1,1,0,0 32 | 2110156-5,0,0,0,0,0,0,0,0,0,0 33 | 2110156-75,26,22,4,0,0,3,3,0,0,0 34 | 2110156-163,22,15,3,4,1,2,1,1,0,0 35 | 2111015-7,20,18,2,0,0,3,0,3,0,0 36 | 2111037-2,2,0,0,2,0,0,0,0,0,0 37 | 2111122-1,21,11,6,4,16,1,1,0,0,0 38 | 2112122-3,15,13,2,0,2,6,4,2,0,0 39 | 2112131-15,27,26,0,1,0,6,6,0,0,1 40 | 2112134-83,14,9,5,0,9,6,4,2,0,0 41 | 2112143-4,13,4,0,9,0,1,1,0,0,0 42 | 2201004-18,33,27,2,4,6,0,0,0,0,1 43 | 2201004-49,96,75,21,0,7,0,0,0,0,0 44 | 2201041-7,21,17,4,0,3,1,0,1,0,1 45 | 2201041-9,23,19,3,1,0,3,3,0,0,0 46 | 2201053-3,53,0,0,53,0,0,0,0,0,0 47 | 2201072-4,30,14,15,1,6,4,4,0,0,1 48 | 2201109-43,25,20,1,4,6,3,3,0,0,0 49 | 2201115-2,15,11,4,0,3,0,0,0,0,0 50 | 2201123-37,19,16,1,2,0,2,2,0,0,0 51 | 2201126-5,33,25,4,4,2,3,2,1,0,0 52 | 2202058-1,20,8,9,5,1,2,2,0,0,0 53 | Total,1149,785,202,164,159,96,74,20,2,7 -------------------------------------------------------------------------------- /Experiments/Dataset-Comparison/Marmot.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,,Display,,,, 2 | page id,CL,WP,NP,SGT,NGT,CL,WP,NP,SGT,NGT 3 | 101112007-11,9,8,1,5,1,3,0,0,0,0 4 | 101112016-8,33,14,5,10,1,0,1,5,1,3 5 | 101112016-12,36,2,5,1,0,1,1,4,1,4 6 | 101112084-22,1,0,4,0,0,5,0,0,0,0 7 | 101112102-2,5,21,2,17,1,1,0,0,0,0 8 | 101112110-3,37,4,0,3,0,1,0,2,0,0 9 | 101112111-5,9,3,3,1,2,0,0,0,0,0 10 | 101112128-3,2,1,0,1,0,2,0,0,0,0 11 | 101112134-59,5,3,2,3,0,2,0,0,0,0 12 | 101112137-10,13,11,6,11,0,0,0,4,0,4 13 | 101114574-2,6,23,0,22,0,4,0,0,0,0 14 | 101162189-4,21,12,0,12,0,0,1,0,0,0 15 | 101162197-2,17,10,1,9,1,4,0,0,0,0 16 | 101162203-5,30,34,11,31,6,1,2,2,0,0 17 | 101162246-4,50,2,9,2,8,2,0,0,0,0 18 | 101162246-18,33,1,2,1,0,2,0,0,0,0 19 | 101162258-9,31,6,7,2,1,2,0,0,0,0 20 | 101162264-31,3,3,3,3,1,1,0,0,0,0 21 | 101162281-9,65,7,4,1,2,2,0,0,0,0 22 | 101162296-4,25,3,2,3,1,7,0,0,0,0 23 | 101162300-6,53,5,16,0,0,3,2,2,0,0 24 | 101162348-19,15,23,7,23,4,7,0,0,0,0 25 | 101162363-19,9,0,0,0,0,3,0,0,0,0 26 | 101162367-17,9,5,0,5,0,3,0,0,0,0 27 | 101162371-5,8,11,5,11,5,3,0,0,0,0 28 | 101172117-7,26,15,2,12,1,3,0,0,0,0 29 | 101172148-7,20,9,3,9,3,4,0,0,0,0 30 | 101172186-15,5,8,0,8,0,2,0,8,0,8 31 | 101182119-9,7,5,3,2,0,0,0,0,0,0 32 | 1011202100-3,4,1,0,1,0,1,0,0,0,0 33 | 1011226625-37,18,7,5,4,2,1,0,0,0,0 34 | 1011303568-7,0,0,13,0,0,0,0,5,0,4 35 | 1011303568-9,0,0,19,0,0,0,0,4,0,0 36 | 1011781138-3,14,14,2,14,0,3,0,0,0,0 37 | 10111016521-31,7,0,0,0,0,1,0,0,0,0 38 | 10111069602-20,13,10,6,10,5,7,0,0,0,0 39 | 10111445167-9,36,25,2,23,0,2,0,0,0,0 40 | 10111476776-4,29,26,3,21,0,4,0,1,0,0 41 | 10111483166-10,16,7,2,5,2,5,0,0,0,0 42 | 10111501297-4,26,5,0,5,0,7,0,0,0,0 43 | 10111534083-28,2,0,0,0,0,1,0,0,0,0 44 | 10111619629-5,0,13,0,13,0,4,0,0,0,0 45 | 10111619629-10,0,22,0,22,0,6,0,0,0,0 46 | 10111864283-8,7,6,0,1,0,2,0,0,0,0 47 | 10111921813-3,13,19,4,19,0,2,0,0,0,0 48 | 10111921813-4,19,9,2,6,0,0,0,1,0,1 49 | 10111921818-2,4,1,5,1,0,2,0,0,0,0 50 | 10111931812-41,10,0,2,0,0,1,0,0,0,0 51 | 10111931813-9,24,5,0,5,0,3,0,0,0,0 52 | 10111931818-2,1,4,2,0,0,0,0,1,0,0 53 | Total,826,423,170,358,47,120,7,39,2,24 -------------------------------------------------------------------------------- /Experiments/Labeling-Quality/IBEM.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,Display,,,, 2 | page_id,CL,WL,WD,ML,CL,WL,WD,ML,PWE 3 | 1133-14,19,0,0,0,4,0,0,0,0 4 | 1213-7,15,0,3,0,5,0,0,0,1 5 | 2194-12,10,0,0,0,2,0,2,2,1 6 | 3079-8,4,0,0,0,4,0,1,1,1 7 | 3126-20,10,0,0,0,7,2,2,0,1 8 | 3221-8,3,3,0,0,1,0,0,0,1 9 | 5070-15,8,0,0,0,1,0,0,0,0 10 | 5174-4,27,0,0,0,4,0,0,0,0 11 | 5260-3,17,0,0,0,0,0,1,2,1 12 | 6023-15,13,0,0,0,1,0,0,0,0 13 | 6108-18,39,0,0,0,4,0,0,0,0 14 | 7117-9,4,0,0,0,0,0,0,0,0 15 | 7180-2,20,0,0,0,5,0,0,0,0 16 | 9070-5,16,0,0,0,3,0,0,0,0 17 | 10052-4,24,0,0,0,4,0,0,0,0 18 | 10218-22,21,0,0,0,5,0,0,0,0 19 | 11135-3,43,0,0,0,4,1,1,0,1 20 | 11285-4,3,2,0,0,3,0,0,0,1 21 | 12040-3,32,0,0,0,3,0,0,0,0 22 | 12090-1,3,0,0,0,0,0,0,0,0 23 | 12145-41,68,0,0,0,4,0,0,0,0 24 | 101033-8,24,0,0,0,2,0,0,0,0 25 | 101053-2,31,0,2,0,3,0,0,0,1 26 | 101226-1,1,0,0,8,0,0,0,0,1 27 | 102116-1,2,0,0,0,0,0,0,0,0 28 | 102158-1,0,0,0,0,0,0,0,0,0 29 | 103165-5,10,2,2,0,2,0,0,0,1 30 | 103181-4,25,0,0,0,6,0,0,0,0 31 | 106014-2,6,0,1,0,0,0,0,0,1 32 | 106055-5,18,0,0,2,7,0,0,0,1 33 | 106176-6,25,0,3,0,5,0,0,0,1 34 | 106213-5,9,0,0,0,8,0,0,0,0 35 | 106277-6,19,0,0,0,1,0,0,0,0 36 | 107068-10,36,0,0,0,5,0,0,0,0 37 | 107101-9,12,0,0,0,6,0,2,2,1 38 | 107120-22,8,0,0,0,8,0,0,0,0 39 | 107132-7,6,0,0,0,0,0,3,5,1 40 | 108107-6,13,0,0,0,2,0,1,4,1 41 | 109012-45,0,0,0,0,0,0,0,0,0 42 | 109075-18,2,0,0,0,0,0,0,0,0 43 | 109097-5,17,0,0,6,0,0,0,0,1 44 | 110046-10,28,0,0,0,3,0,0,0,0 45 | 110060-6,15,1,0,0,7,0,0,0,1 46 | 110097-11,9,0,0,0,5,0,0,0,0 47 | 110101-4,16,1,0,0,2,0,2,2,1 48 | 110101-8,23,0,0,0,6,0,0,0,0 49 | 110125-9,4,0,0,0,9,0,0,0,0 50 | 110125-15,6,0,0,0,10,0,0,0,0 51 | 110192-2,16,0,0,2,1,0,0,0,1 52 | 110251-11,9,2,0,0,2,3,3,0,1 53 | 111110-3,12,0,0,0,10,0,0,0,0 54 | 111110-10,2,0,0,0,0,0,0,0,0 55 | 111132-2,28,0,0,3,8,0,0,0,1 56 | 111196-9,2,0,0,0,8,0,0,0,0 57 | 111286-11,18,0,0,0,1,0,0,0,0 58 | 201003-6,0,12,0,0,0,0,0,0,1 59 | 201005-8,9,3,0,0,4,0,0,0,1 60 | 201010-18,80,3,0,0,2,0,0,0,1 61 | 203062-9,23,0,0,0,7,0,0,0,0 62 | 203064-11,11,0,0,2,1,0,0,0,1 63 | 203064-17,11,0,0,0,1,0,3,5,1 64 | 203090-2,16,0,0,1,9,0,0,0,1 65 | 203195-8,12,0,0,0,6,0,0,0,0 66 | 204059-6,12,0,0,0,0,0,0,0,0 67 | 204071-1,0,0,0,0,0,0,0,0,0 68 | 205023-12,4,2,0,0,0,0,0,0,1 69 | 205115-5,10,0,0,0,4,0,0,0,0 70 | 205123-42,0,0,0,0,0,0,0,0,0 71 | 206097-9,17,0,0,0,7,0,0,0,0 72 | 206235-4,23,0,0,0,6,0,0,0,0 73 | 207074-5,17,1,0,0,3,0,0,0,1 74 | 207192-19,15,0,0,1,2,0,0,0,1 75 | 207276-57,13,0,0,2,3,0,0,0,1 76 | 207276-95,1,0,0,0,0,0,0,0,0 77 | 208122-7,33,0,0,0,4,0,0,0,0 78 | 209229-9,25,0,1,4,2,0,0,0,1 79 | 209235-2,14,0,0,0,0,0,0,0,0 80 | 210128-6,26,0,0,4,0,0,1,1,1 81 | 110188-2,21,0,0,0,1,0,0,0,0 82 | 210288-8,9,0,0,0,6,0,0,0,0 83 | 211013-7,7,0,0,0,2,0,0,0,0 84 | 211037-8,15,0,0,0,1,0,0,0,0 85 | 211148-8,3,0,0,0,0,0,0,0,0 86 | 211192-3,2,0,0,0,7,0,0,0,0 87 | 211274-5,25,0,0,0,3,0,0,0,0 88 | 211287-8,29,0,0,1,4,0,2,3,1 89 | 212100-7,45,1,1,0,2,0,0,0,1 90 | 212100-21,0,0,0,0,0,0,0,0,0 91 | 212156-7,23,0,0,0,8,0,0,0,0 92 | 212156-17,0,0,0,0,0,0,0,0,0 93 | 301004-11,16,0,0,0,4,1,1,0,1 94 | 301156-4,19,0,0,0,0,0,0,0,0 95 | 301245-4,9,0,0,0,6,0,0,0,0 96 | 302022-15,14,0,0,0,5,0,0,0,0 97 | 303123-7,1,0,0,0,6,0,0,0,0 98 | 303157-8,6,0,0,0,4,0,0,0,0 99 | 304054-4,18,0,0,0,2,0,4,6,1 100 | 304075-12,32,0,0,0,2,0,0,0,0 101 | 304091-22,0,0,0,0,0,4,1,0,1 102 | 304253-6,16,0,3,0,1,0,0,0,1 103 | Total,1533,33,16,36,316,11,30,33,41 -------------------------------------------------------------------------------- /Experiments/Labeling-Quality/FormulaNet.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,Display,,,, 2 | page_id,CL,WL,WD,ML,CL,WL,WD,ML,PWE 3 | 204035-13,21,1,0,0,5,0,0,1,1 4 | 1113-0,0,0,0,0,0,0,0,0,0 5 | 1222-10,38,0,0,0,2,0,0,0,0 6 | 2116-126,23,0,1,21,7,0,0,0,1 7 | 3046-8,40,0,0,0,6,0,0,0,0 8 | 3232-16,18,1,0,0,5,0,0,0,1 9 | 3241-10,21,0,0,0,4,0,0,0,0 10 | 4092-0,8,0,0,0,0,0,0,0,0 11 | 4098-51,33,0,1,0,1,0,0,0,1 12 | 5057-97,18,0,0,1,4,0,0,0,1 13 | 5144-6,51,0,0,0,3,0,0,0,0 14 | 5187-38,23,0,1,0,7,0,0,0,1 15 | 6179-22,18,0,0,0,6,0,0,0,0 16 | 6196-50,20,0,1,0,7,0,0,0,1 17 | 9112-15,23,0,0,0,0,0,0,0,0 18 | 9235-8,19,0,0,0,3,0,0,0,0 19 | 9238-4,35,0,0,0,6,0,0,0,0 20 | 10148-8,38,0,0,0,4,0,0,0,0 21 | 10232-13,21,0,0,0,7,0,0,0,0 22 | 11002-30,13,0,0,0,4,0,0,0,0 23 | 11078-1,2,0,0,0,0,0,0,0,0 24 | 11372-22,7,0,0,0,0,0,0,0,0 25 | 112247-22,2,0,0,0,2,0,0,0,0 26 | 12269-12,7,0,0,0,7,0,1,0,1 27 | 201141-11,22,0,0,0,7,0,0,0,0 28 | 201153-6,17,0,0,0,6,0,0,0,0 29 | 201154-9,11,0,0,0,6,0,0,0,0 30 | 201198-29,19,0,1,2,4,0,0,0,1 31 | 202126-8,18,0,0,0,5,0,0,0,0 32 | 202145-11,25,0,0,0,5,0,0,0,0 33 | 202166-12,26,0,0,0,6,0,0,0,0 34 | 203024-13,27,0,0,0,1,0,0,0,0 35 | 203120-1,15,0,0,0,3,0,0,0,0 36 | 204045-18,35,0,0,0,3,0,0,0,0 37 | 204185-12,20,0,0,0,2,0,0,0,0 38 | 204203-40,1,0,0,0,0,0,0,0,0 39 | 204247-5,8,0,0,0,0,0,0,0,0 40 | 205005-13,5,0,0,0,1,0,0,0,0 41 | 205086-7,13,0,0,0,8,0,0,0,0 42 | 205281-33,4,0,0,0,4,0,0,0,0 43 | 205300-0,7,0,0,0,0,0,0,0,0 44 | 206046-3,20,0,0,0,3,0,0,0,0 45 | 206078-89,7,0,0,0,5,0,0,0,0 46 | 206249-29,10,0,0,0,7,0,0,0,0 47 | 206257-6,18,0,0,0,3,0,0,0,0 48 | 207011-7,3,0,0,0,0,0,0,0,0 49 | 207042-26,15,3,0,0,7,1,0,0,1 50 | 207076-6,13,0,0,0,3,0,0,0,0 51 | 207107-1,3,0,0,0,0,0,0,0,0 52 | 207110-17,13,0,0,0,4,0,0,0,0 53 | 207168-22,8,0,0,0,0,0,0,0,0 54 | 207274-1,14,0,0,0,4,0,0,0,0 55 | 209020-3,29,0,0,0,3,0,0,0,0 56 | 209098-2,16,0,0,0,3,0,0,0,0 57 | 209104-2,22,0,0,0,0,0,0,0,0 58 | 209122-18,28,0,0,0,2,0,0,0,0 59 | 209137-31,0,0,0,0,1,0,0,0,0 60 | 209144-4,22,0,0,0,5,0,1,0,1 61 | 209161-19,18,0,0,0,5,0,0,0,0 62 | 209239-7,7,0,0,0,4,0,0,0,0 63 | 209321-6,7,0,0,0,6,0,0,0,0 64 | 210178-39,19,0,0,0,2,0,0,0,0 65 | 210188-3,6,0,0,0,6,0,0,0,0 66 | 210202-46,4,0,0,0,0,0,0,0,0 67 | 210245-4,24,0,0,0,5,0,0,0,0 68 | 210270-12,2,0,0,0,3,0,0,0,0 69 | 210283-2,3,0,0,0,6,0,0,0,0 70 | 211068-17,21,0,1,0,0,0,0,0,1 71 | 211080-21,29,0,0,0,0,0,0,0,0 72 | 211134-12,13,0,0,0,3,0,0,0,0 73 | 211147-65,28,0,0,0,2,0,0,0,0 74 | 211242-38,11,0,0,0,5,0,0,0,0 75 | 212005-6,20,0,0,0,0,0,0,0,0 76 | 212035-31,8,0,0,0,5,0,0,0,0 77 | 212043-7,9,0,0,0,2,0,0,0,0 78 | 212081-39,11,0,0,0,7,0,0,0,0 79 | 212096-1,6,0,0,0,0,0,0,0,0 80 | 212238-0,1,0,0,0,0,0,0,0,0 81 | 212239-21,28,0,0,0,5,0,0,0,0 82 | 212275-15,6,0,0,0,5,0,0,0,0 83 | 212275-30,4,0,0,0,9,0,0,0,0 84 | 212275-45,9,0,0,0,10,0,0,0,0 85 | 212343-6,11,0,0,0,6,0,0,0,0 86 | 301125-24,8,0,0,0,0,0,0,0,0 87 | 302059-2,17,0,0,0,6,0,1,0,1 88 | 302102-0,3,2,0,0,0,0,0,0,1 89 | 303115-28,12,0,0,0,3,0,0,0,0 90 | 303119-20,7,0,0,0,3,0,0,0,0 91 | 303153-7,5,0,0,0,5,0,0,0,0 92 | 303221-5,11,0,0,0,4,0,0,0,0 93 | 303269-1,2,0,0,0,0,0,0,0,0 94 | 304028-6,20,0,0,0,4,0,0,0,0 95 | 304195-17,19,0,0,0,4,0,0,0,0 96 | 305022-6,10,0,0,0,4,0,1,1,1 97 | 305057-3,29,0,0,0,4,0,0,0,0 98 | 305117-0,0,0,0,0,0,0,0,0,0 99 | 305135-25,4,0,0,0,8,0,0,0,0 100 | 305189-3,41,0,0,0,5,0,0,0,0 101 | 305227-37,14,0,0,0,14,0,0,0,0 102 | 306020-15,9,0,0,0,4,0,2,0,1 103 | Total,1529,7,6,24,365,1,6,2,16 -------------------------------------------------------------------------------- /Experiments/Labeling-Quality/Marmot.csv: -------------------------------------------------------------------------------- 1 | ,Inline,,,,Display,,,, 2 | page_id,CL,WL,WD,ML,CL,WL,WD,ML,PWE 3 | 101112004-5,0,0,0,0,1,0,0,0,0 4 | 101112005-4,20,0,2,0,4,0,0,0,1 5 | 101112005-12,16,0,4,2,5,0,0,0,1 6 | 101112022-6,22,0,2,0,2,0,0,0,1 7 | 101112076-38,7,0,0,4,1,0,0,0,1 8 | 101112081-3,8,0,1,5,3,0,0,1,1 9 | 101112085-4,42,0,0,2,2,0,0,0,1 10 | 101112089-4,58,0,0,4,3,4,3,0,1 11 | 101112110-3,36,0,0,8,3,0,0,0,1 12 | 101112111-5,12,0,0,0,0,0,0,3,1 13 | 101112130-4,21,0,1,4,8,0,0,0,1 14 | 101112133-4,36,0,4,14,5,0,0,0,1 15 | 101112139-22,8,0,0,5,2,1,1,0,1 16 | 101112139-102,3,0,0,3,1,0,0,0,1 17 | 101112147-9,31,0,0,14,4,0,0,0,1 18 | 101112151-3,34,0,0,12,5,0,0,1,1 19 | 101114574-17,13,0,0,5,8,0,0,0,1 20 | 101162186-9,7,0,0,8,1,0,0,0,1 21 | 101162189-9,31,0,0,14,4,0,0,0,1 22 | 101162190-4,20,1,0,1,2,0,0,1,1 23 | 101162191-23,7,0,0,1,1,0,0,0,1 24 | 101162197-2,17,0,1,10,4,0,0,0,1 25 | 101162202-3,7,0,0,7,1,0,0,0,1 26 | 101162203-5,45,0,0,29,1,0,2,0,1 27 | 101162211-2,41,0,0,3,10,0,0,0,1 28 | 101162231-4,10,0,2,4,2,0,1,0,1 29 | 101162231-7,9,0,1,4,1,0,0,0,1 30 | 101162231-9,15,0,1,0,3,0,3,0,1 31 | 101162245-14,0,0,0,2,2,0,0,0,1 32 | 101162550-22,5,0,0,1,2,0,0,0,1 33 | 101162250-52,13,0,0,1,1,0,0,0,1 34 | 101162258-5,26,0,0,2,1,0,0,0,1 35 | 101162258-9,33,0,3,1,2,0,0,0,1 36 | 101162258-14,46,0,0,0,1,0,0,0,0 37 | 101162264-30,16,0,1,0,5,0,1,0,1 38 | 101162264-31,5,1,0,1,1,0,0,0,1 39 | 101162264-49,12,0,0,0,1,0,0,0,0 40 | 101162264-51,19,0,0,0,2,1,1,0,1 41 | 101162264-64,6,0,0,0,4,0,0,0,0 42 | 101162280-14,11,0,0,0,2,0,0,0,0 43 | 101162281-7,29,0,0,0,5,0,0,0,0 44 | 101162286-5,8,0,0,0,1,0,0,0,0 45 | 101162296-3,41,0,0,0,7,0,0,0,0 46 | 101162308-4,19,0,0,2,3,2,1,0,1 47 | 1011623133-2,4,0,0,0,0,0,0,0,0 48 | 101162317-2,2,0,0,0,2,0,0,0,0 49 | 101162319-23,23,0,4,3,4,2,2,0,1 50 | 101162319-24,5,0,1,1,3,4,1,0,1 51 | 101162330-13,8,0,0,4,3,0,0,0,1 52 | 101162335-106,1,0,0,4,2,2,2,0,1 53 | 101162341-21,18,0,0,0,10,0,0,0,0 54 | 101162342-6,7,0,1,0,12,0,0,0,1 55 | 101162349-6,10,0,0,2,0,1,1,0,1 56 | 101162363-14,7,1,0,2,2,0,0,1,1 57 | 101162382-5,38,0,1,2,2,0,0,0,1 58 | 101172125-6,1,0,0,1,0,0,0,0,1 59 | 101172148-13,22,0,0,3,4,0,1,0,1 60 | 101172161-2,16,0,0,20,10,0,0,0,1 61 | 101172165-2,23,0,0,4,11,0,1,0,1 62 | 101172186-13,13,0,0,2,0,5,2,0,1 63 | 101172186-15,5,0,0,7,2,7,1,0,1 64 | 101172193-5,20,0,0,8,3,5,1,0,1 65 | 101182179-6,0,0,0,0,0,1,1,0,1 66 | 1011113137-3,24,0,1,8,7,0,0,0,1 67 | 1011202100-3,4,0,0,2,1,0,0,0,1 68 | 1011226625-37,21,2,1,4,1,0,0,0,1 69 | 1011586850-3,74,0,6,7,3,0,0,0,1 70 | 1011602672-31,22,0,1,15,3,0,0,0,1 71 | 1011602672-32,26,0,1,2,4,1,1,0,1 72 | 1011755300-14,32,0,0,14,0,0,1,0,1 73 | 1011781138-3,16,0,1,12,3,0,0,0,1 74 | 1011792274-3,17,0,0,2,4,2,2,0,1 75 | 1011856686-6,1,0,0,5,3,0,0,0,1 76 | 1011856686-7,20,0,0,5,4,0,0,0,1 77 | 10111016521-38,6,0,0,2,5,1,1,0,1 78 | 10111024697-50,13,0,0,16,3,0,0,0,1 79 | 10111069602-20,17,0,2,4,7,0,0,0,1 80 | 10111107699-17,19,0,1,1,2,0,0,0,1 81 | 10111384863-22,41,0,7,11,2,0,1,0,1 82 | 10111403847-6,2,0,0,3,8,3,0,0,1 83 | 10111403847-8,1,1,0,4,8,0,0,0,1 84 | 10111403847-9,0,0,0,4,8,3,0,0,1 85 | 10111416335-15,19,0,6,8,4,0,0,0,1 86 | 10111431869-7,32,0,0,5,2,2,1,0,1 87 | 10111438311-6,26,0,2,10,4,0,0,0,1 88 | 10111438311-15,11,0,1,2,5,1,1,0,1 89 | 10111476776-3,16,0,3,9,4,0,0,0,1 90 | 1011149517-5,17,0,1,8,2,0,0,0,1 91 | 10111501297-8,18,0,0,4,6,0,0,0,1 92 | 10111545547-7,13,0,0,3,4,0,2,1,1 93 | 10111547694-5,15,1,0,7,3,0,0,0,1 94 | 10111547694-15,3,2,0,0,1,0,0,0,1 95 | 10111655288-11,6,0,0,2,4,0,0,0,1 96 | 10111921804-7,33,0,0,11,0,0,0,0,1 97 | 10111921804-11,21,0,0,17,2,0,0,0,1 98 | 10111921804-16,33,0,0,14,2,0,0,0,1 99 | 10111921813-4,20,0,1,5,1,0,0,0,1 100 | 10111931806-2,23,0,1,5,18,1,1,0,1 101 | 10111931808-3,46,0,0,8,18,0,0,0,1 102 | 10111931812-41,12,0,0,2,1,0,0,0,1 103 | Total,1808,9,66,477,354,49,37,8,89 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CC BY 4.0][cc-by-shield]][cc-by] 2 | # FormulaNet 3 | 4 | FormulaNet is a new large-scale Mathematical Formula Detection dataset. It consists of 46'672 pages of STEM documents from [arXiv](arxiv.org) and has 5 | 13 types of labels. The dataset is split into a [train](Dataset/train) set of 44'338 pages and a [validation](Dataset/val) set of 2'334 pages. Due to 6 | copyrights reasons, we can only provide the [list](urls.txt) of papers, which must be downloaded and processed. 7 | 8 | ## Labels 9 | 10 | * inline formulae 11 | * display formulae 12 | * headers 13 | * tables 14 | * figures 15 | * paragraphs 16 | * captions 17 | * footnotes 18 | * lists 19 | * bibliographies 20 | * display formulae reference number 21 | * display formulae with reference number 22 | * footnote reference number 23 | 24 | ## Get FormulaNet (Docker option recommended) 25 | 26 | ### Docker option 27 | **Prerequisites** 28 | * Docker 29 | * Clone the repository 30 | ```shell 31 | git clone https://github.com/felix-schmitt/FormulaNet.git 32 | ``` 33 | * Get the annotation files via [Dropbox](https://www.dropbox.com/sh/9yjb1lkv9dnmdev/AABBH7QFVA888scAu4Rgj1sja?dl=0) 34 | 35 | The file structure should look like this: 36 | 37 | . 38 | ├── ... 39 | ├── Dataset 40 | │ ├── train 41 | │ │ ├── img # empty folder 42 | │ │ └── train_coco.json 43 | │ └── test 44 | │ ├── img # empty folder 45 | │ └── test_coco.json 46 | └── ... 47 | 48 | **build dockerfile (amd64 and arm64 supported)** 49 | ```shell 50 | docker build -t formulanet --build-arg Platform='amd64' . 51 | ``` 52 | 53 | **run the container with mounting the FormulaNet Folder** 54 | ```shell 55 | docker run -v ~//Dataset:/FormulaNet/Dataset formulanet 56 | ``` 57 | 58 | ### Classic option 59 | 60 | **Prerequisites** 61 | * Ubuntu 20.04.5 LTS is recommended 62 | * A LaTeX installation with texlive-full (2019 is recommended) is required 63 | * Clone the repository 64 | ```shell 65 | git clone https://github.com/felix-schmitt/FormulaNet.git 66 | ``` 67 | * Get the annotation files via [Dropbox](https://www.dropbox.com/sh/9yjb1lkv9dnmdev/AABBH7QFVA888scAu4Rgj1sja?dl=0) 68 | 69 | The file structure should look like this: 70 | 71 | . 72 | ├── ... 73 | ├── Dataset 74 | │ ├── train 75 | │ │ ├── img # empty folder 76 | │ │ └── train_coco.json 77 | │ └── test 78 | │ ├── img # empty folder 79 | │ └── test_coco.json 80 | └── ... 81 | 82 | **Install the python environment (recommended Python 3.8)** 83 | ```shell 84 | pip install -r requirements.txt 85 | ``` 86 | **run the script** 87 | ```shell 88 | python download.py 89 | ``` 90 | 91 | ## Baseline Model 92 | 93 | | Model | mAP | mAP@50 | mAP@75 | mAP@inline | mAP@display | 94 | |----------|------------|------------|------------|------------|-------------| 95 | | FCOS-50 | 0.754±0.03 | 0.921±0.02 | 0.84±0.02 | 0.752±0.02 | 0.755±0.02 | 96 | | FCOS-101 | 0.755±0.03 | 0.920±0.02 | 0.841±0.02 | 0.756±0.02 | 0.749±0.03 | 97 | 98 | The results can be reproduced by using these config files ([FCOS-50](Baseline/FCOS-50.py), [FCOS-101](Baseline/FCOS-101.py)) and the github repo [Yuxiang1995/ICDAR2021_MFD](https://github.com/Yuxiang1995/ICDAR2021_MFD). 99 | 100 | ## License 101 | This work is licensed under a 102 | [Creative Commons Attribution 4.0 International License][cc-by]. 103 | 104 | [![CC BY 4.0][cc-by-image]][cc-by] 105 | 106 | [cc-by]: http://creativecommons.org/licenses/by/4.0/ 107 | [cc-by-image]: https://i.creativecommons.org/l/by/4.0/88x31.png 108 | [cc-by-shield]: https://img.shields.io/badge/License-CC%20BY%204.0-lightgrey.svg 109 | ## Citation 110 | 111 | ### FormulaNet: A Benchmark Dataset for Mathematical Formula Detection 112 | 113 | Felix M. Schmitt-Koopmann, Elaine M. Huang, Hans-Peter Hutter, Thilo Stadelmann, Alireza Darvishy 114 | 115 | [https://ieeexplore.ieee.org/document/9869643](https://ieeexplore.ieee.org/document/9869643) 116 | 117 | ``` 118 | @ARTICLE{9869643, 119 | author={Schmitt-Koopmann, Felix M. and Huang, Elaine M. and Hutter, Hans-Peter and 120 | Stadelmann, Thilo and Darvishy, Alireza}, 121 | journal={IEEE Access}, 122 | title={FormulaNet: A Benchmark Dataset for Mathematical Formula Detection}, 123 | year={2022}, 124 | volume={10}, 125 | number={}, 126 | pages={91588-91596}, 127 | doi={10.1109/ACCESS.2022.3202639}} 128 | ``` 129 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import tarfile 3 | import numpy as np 4 | import shutil 5 | from tqdm import tqdm 6 | import arxiv 7 | from utils.walker_download import color_latex_code, compile_pdf 8 | from utils.tools import resize_image, load_json 9 | import pdf2image 10 | import time 11 | import os 12 | 13 | 14 | def download(url_file): 15 | # get paper ids 16 | paper_ids = [] 17 | with open(url_file) as f: 18 | line = f.readline() 19 | while line: 20 | paper_ids.append(line.split("\t")[1]) 21 | line = f.readline() 22 | temp = Path("temp") 23 | if temp.exists(): 24 | shutil.rmtree(str(temp)) 25 | train_pages = load_json(Path("Dataset/train/train_coco.json")) 26 | test_pages = load_json(Path("Dataset/test/test_coco.json")) 27 | train_img = Path("Dataset/train/img") 28 | test_img = Path("Dataset/test/img") 29 | train_img.mkdir(exist_ok=True) 30 | test_img.mkdir(exist_ok=True) 31 | with tqdm(paper_ids, postfix="download images") as pbar: 32 | for paper_id in pbar: 33 | pbar.set_description(f"download {paper_id}") 34 | paper_number = paper_id.split("/")[1].split("v")[0] 35 | # get train and val pages 36 | train_p = [page for page in train_pages if page.split('_')[0] == paper_number] 37 | test_p = [page for page in test_pages if page.split('_')[0] == paper_number] 38 | # continue if all images are already available 39 | if all([Path(train_img / page).exists() for page in train_p]) and all([Path(test_img / page).exists() for page in test_p]): 40 | continue 41 | # get paper 42 | start_time = time.time() 43 | paper = next(arxiv.Search(id_list=[paper_id]).results()) 44 | # download paper 45 | if temp.exists(): 46 | shutil.rmtree(temp) 47 | temp.mkdir() 48 | if not paper.download_source(dirpath="temp", filename="temp.tar.gz"): 49 | print(f"""Could not download {paper_id} 50 | Please try to download an arxiv paper with this code: 51 | import arxiv 52 | paper = next(arxiv.Search(id_list=["hep-th/0001001v2"]).results()) 53 | paper.download_source(dirpath="temp", filename="temp.tar.gz") 54 | """) 55 | tar = tarfile.open(temp/"temp.tar.gz", "r:gz") 56 | tar.extractall(temp) 57 | tar.close() 58 | # rename largest tex file 59 | tex_files = sorted(temp.glob('*.tex')) 60 | if len(tex_files) > 0: 61 | tex_sizes = [tex_file.stat().st_size for tex_file in tex_files] 62 | os.rename(f"{tex_files[np.argmax(tex_sizes)]}", f"temp/{paper_number}.tex") 63 | else: 64 | with_out_suffix = [f for f in sorted(temp.glob("*")) if f.name.split(".")[0] == f.name] 65 | with_out_suffix_sizes = [with_out_s.stat().st_size for with_out_s in with_out_suffix] 66 | os.rename(f"{with_out_suffix[np.argmax(with_out_suffix_sizes)]}", f"temp/{paper_number}.tex") 67 | data = Path(f"temp/{paper_number}.tex").read_text() 68 | modified_data = color_latex_code(data) 69 | modified_file = temp / ('vanilla_' + paper_number + '.tex') 70 | modified_file.write_text(modified_data) 71 | compile_pdf(modified_file, waiting_time=120) 72 | if not modified_file.with_suffix(".pdf").exists(): 73 | print(f"""Could not create PDF of {paper_number}. 74 | Try to increase the waiting_time and check the log to identify missing latex packages""") 75 | continue 76 | images = Path("temp/images") 77 | if images.exists(): 78 | shutil.rmtree(images) 79 | images.mkdir() 80 | if not pdf2image.convert_from_path(modified_file.with_suffix(".pdf"), output_folder=images, fmt="jpeg", dpi=300): 81 | print(f"Could not create images of {paper_number}") 82 | continue 83 | image_list = sorted(images.glob("*.jpg")) 84 | for page in train_p: 85 | page_number = int(page.split("page")[1].split(".")[0]) 86 | resize_image(image_list[page_number], {'w': 1447, 'h': 2048}, str(train_img / page)) 87 | for page in test_p: 88 | page_number = int(page.split("page")[1].split(".")[0]) 89 | resize_image(image_list[page_number], {'w': 1447, 'h': 2048}, str(test_img / page)) 90 | shutil.rmtree(str(temp)) 91 | wait_time = start_time-time.time()+5 92 | if wait_time > 0: 93 | time.sleep(wait_time) 94 | 95 | 96 | if __name__ == '__main__': 97 | download("urls.txt") 98 | -------------------------------------------------------------------------------- /Baseline/FCOS-50.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'CocoDataset' 2 | data_root = 'data/FormulaNet/' 3 | classes = ('inline', 'display') 4 | img_norm_cfg = dict( 5 | mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1447, 2048), keep_ratio=True), 10 | dict(type='RandomCrop', crop_size=(1600, 1440)), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict( 13 | type='Normalize', 14 | mean=[123.675, 116.28, 103.53], 15 | std=[58.395, 57.12, 57.375], 16 | to_rgb=True), 17 | dict(type='Pad', size_divisor=32), 18 | dict(type='DefaultFormatBundle'), 19 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 20 | ] 21 | test_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict( 24 | type='MultiScaleFlipAug', 25 | img_scale=(1447, 2048), 26 | flip=True, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict( 31 | type='Normalize', 32 | mean=[123.675, 116.28, 103.53], 33 | std=[58.395, 57.12, 57.375], 34 | to_rgb=True), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='ImageToTensor', keys=['img']), 37 | dict(type='Collect', keys=['img']) 38 | ]) 39 | ] 40 | data = dict( 41 | samples_per_gpu=5, 42 | workers_per_gpu=2, 43 | train=[ 44 | dict( 45 | type='CocoDataset', 46 | ann_file='data/FormulaNet/train/train_coco_all.json', 47 | img_prefix='data/FormulaNet/train/img/', 48 | classes=('inline', 'display'), 49 | pipeline=[ 50 | dict(type='LoadImageFromFile'), 51 | dict(type='LoadAnnotations', with_bbox=True), 52 | dict(type='Resize', img_scale=(1447, 2048), keep_ratio=True), 53 | dict(type='RandomCrop', crop_size=(1600, 1440)), 54 | dict(type='RandomFlip', flip_ratio=0.5), 55 | dict( 56 | type='Normalize', 57 | mean=[123.675, 116.28, 103.53], 58 | std=[58.395, 57.12, 57.375], 59 | to_rgb=True), 60 | dict(type='Pad', size_divisor=32), 61 | dict(type='DefaultFormatBundle'), 62 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 63 | ]) 64 | ], 65 | val=dict( 66 | type='CocoDataset', 67 | ann_file='data/FormulaNet/val/train_coco_all.json', 68 | img_prefix='data/FormulaNet/val/img/', 69 | classes=('inline', 'display'), 70 | pipeline=[ 71 | dict(type='LoadImageFromFile'), 72 | dict( 73 | type='MultiScaleFlipAug', 74 | img_scale=(1447, 2048), 75 | flip=True, 76 | transforms=[ 77 | dict(type='Resize', keep_ratio=True), 78 | dict(type='RandomFlip'), 79 | dict( 80 | type='Normalize', 81 | mean=[123.675, 116.28, 103.53], 82 | std=[58.395, 57.12, 57.375], 83 | to_rgb=True), 84 | dict(type='Pad', size_divisor=32), 85 | dict(type='ImageToTensor', keys=['img']), 86 | dict(type='Collect', keys=['img']) 87 | ]) 88 | ]), 89 | ) 90 | evaluation = dict( 91 | interval=1, metric='bbox', classwise=True, save_best='bbox_mAP_75') 92 | test = dict(create_images=True, threshold=0.5) 93 | optimizer_config = dict(grad_clip=dict(max_norm=15, norm_type=2)) 94 | lr_config = dict( 95 | policy='step', 96 | warmup='linear', 97 | warmup_iters=500, 98 | warmup_ratio=0.001, 99 | step=[2, 8, 12, 16, 20]) 100 | total_epochs = 24 101 | checkpoint_config = dict(interval=1) 102 | log_config = dict(interval=50, hooks=[]) 103 | dist_params = dict(backend='nccl') 104 | log_level = 'INFO' 105 | load_from = None 106 | resume_from = None 107 | workflow = [('train', 1)] 108 | norm_cfg = dict(type='BN', requires_grad=True) 109 | model = dict( 110 | type='GFL', 111 | pretrained='open-mmlab://resnest50', 112 | backbone=dict( 113 | type='ResNeSt', 114 | stem_channels=64, 115 | depth=50, 116 | radix=2, 117 | reduction_factor=4, 118 | avg_down_stride=True, 119 | num_stages=4, 120 | out_indices=(0, 1, 2, 3), 121 | frozen_stages=1, 122 | norm_cfg=dict(type='BN', requires_grad=True), 123 | norm_eval=False, 124 | dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), 125 | stage_with_dcn=(False, True, True, True), 126 | with_cp=True, 127 | style='pytorch'), 128 | neck=dict( 129 | type='FPN', 130 | in_channels=[256, 512, 1024, 2048], 131 | out_channels=256, 132 | add_extra_convs='on_output', 133 | num_outs=5), 134 | bbox_head=dict( 135 | type='GFLHead', 136 | num_classes=2, 137 | in_channels=256, 138 | stacked_convs=4, 139 | feat_channels=256, 140 | anchor_generator=dict( 141 | type='AnchorGenerator', 142 | ratios=[1.0], 143 | octave_base_scale=8, 144 | scales_per_octave=1, 145 | strides=[8, 16, 32, 64, 128]), 146 | loss_cls=dict( 147 | type='QualityFocalLoss', 148 | use_sigmoid=True, 149 | beta=2.0, 150 | loss_weight=1.0), 151 | loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), 152 | reg_max=24, 153 | loss_bbox=dict(type='GIoULoss', loss_weight=2.0))) 154 | train_cfg = dict( 155 | assigner=dict(type='ATSSAssigner', topk=9), 156 | allowed_border=-1, 157 | pos_weight=-1, 158 | debug=False) 159 | test_cfg = dict( 160 | nms_pre=1000, 161 | min_bbox_size=0, 162 | score_thr=0.5, 163 | nms=dict(type='nms', iou_threshold=0.4), 164 | max_per_img=100) 165 | optimizer = dict(type='Ranger', lr=0.0001) 166 | work_dir = 'checkpoints/' 167 | gpu_ids = range(0, 1) 168 | -------------------------------------------------------------------------------- /Baseline/FCOS-101.py: -------------------------------------------------------------------------------- 1 | dataset_type = 'CocoDataset' 2 | data_root = 'data/FormulaNet/' 3 | classes = ('inline', 'display') 4 | img_norm_cfg = dict( 5 | mean=[123.68, 116.779, 103.939], std=[58.393, 57.12, 57.375], to_rgb=True) 6 | train_pipeline = [ 7 | dict(type='LoadImageFromFile'), 8 | dict(type='LoadAnnotations', with_bbox=True), 9 | dict(type='Resize', img_scale=(1447, 2048), keep_ratio=True), 10 | dict(type='RandomCrop', crop_size=(1600, 1440)), 11 | dict(type='RandomFlip', flip_ratio=0.5), 12 | dict( 13 | type='Normalize', 14 | mean=[123.675, 116.28, 103.53], 15 | std=[58.395, 57.12, 57.375], 16 | to_rgb=True), 17 | dict(type='Pad', size_divisor=32), 18 | dict(type='DefaultFormatBundle'), 19 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 20 | ] 21 | test_pipeline = [ 22 | dict(type='LoadImageFromFile'), 23 | dict( 24 | type='MultiScaleFlipAug', 25 | img_scale=(1447, 2048), 26 | flip=True, 27 | transforms=[ 28 | dict(type='Resize', keep_ratio=True), 29 | dict(type='RandomFlip'), 30 | dict( 31 | type='Normalize', 32 | mean=[123.675, 116.28, 103.53], 33 | std=[58.395, 57.12, 57.375], 34 | to_rgb=True), 35 | dict(type='Pad', size_divisor=32), 36 | dict(type='ImageToTensor', keys=['img']), 37 | dict(type='Collect', keys=['img']) 38 | ]) 39 | ] 40 | data = dict( 41 | samples_per_gpu=5, 42 | workers_per_gpu=2, 43 | train=[ 44 | dict( 45 | type='CocoDataset', 46 | ann_file='data/FormulaNet/train/train_coco_all.json', 47 | img_prefix='data/FormulaNet/train/img/', 48 | classes=('inline', 'display'), 49 | pipeline=[ 50 | dict(type='LoadImageFromFile'), 51 | dict(type='LoadAnnotations', with_bbox=True), 52 | dict(type='Resize', img_scale=(1447, 2048), keep_ratio=True), 53 | dict(type='RandomCrop', crop_size=(1600, 1440)), 54 | dict(type='RandomFlip', flip_ratio=0.5), 55 | dict( 56 | type='Normalize', 57 | mean=[123.675, 116.28, 103.53], 58 | std=[58.395, 57.12, 57.375], 59 | to_rgb=True), 60 | dict(type='Pad', size_divisor=32), 61 | dict(type='DefaultFormatBundle'), 62 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) 63 | ]) 64 | ], 65 | val=dict( 66 | type='CocoDataset', 67 | ann_file='data/FormulaNet/val/train_coco_all.json', 68 | img_prefix='data/FormulaNet/val/img/', 69 | classes=('inline', 'display'), 70 | pipeline=[ 71 | dict(type='LoadImageFromFile'), 72 | dict( 73 | type='MultiScaleFlipAug', 74 | img_scale=(1447, 2048), 75 | flip=True, 76 | transforms=[ 77 | dict(type='Resize', keep_ratio=True), 78 | dict(type='RandomFlip'), 79 | dict( 80 | type='Normalize', 81 | mean=[123.675, 116.28, 103.53], 82 | std=[58.395, 57.12, 57.375], 83 | to_rgb=True), 84 | dict(type='Pad', size_divisor=32), 85 | dict(type='ImageToTensor', keys=['img']), 86 | dict(type='Collect', keys=['img']) 87 | ]) 88 | ]), 89 | ) 90 | evaluation = dict( 91 | interval=1, metric='bbox', classwise=True, save_best='bbox_mAP_75') 92 | test = dict(create_images=True, threshold=0.5) 93 | optimizer_config = dict(grad_clip=dict(max_norm=15, norm_type=2)) 94 | lr_config = dict( 95 | policy='step', 96 | warmup='linear', 97 | warmup_iters=500, 98 | warmup_ratio=0.001, 99 | step=[2, 8, 12, 16, 20]) 100 | total_epochs = 24 101 | checkpoint_config = dict(interval=1) 102 | log_config = dict(interval=50, hooks=[]) 103 | dist_params = dict(backend='nccl') 104 | log_level = 'INFO' 105 | load_from = None 106 | resume_from = None 107 | workflow = [('train', 1)] 108 | norm_cfg = dict(type='BN', requires_grad=True) 109 | model = dict( 110 | type='GFL', 111 | pretrained='open-mmlab://resnest101', 112 | backbone=dict( 113 | type='ResNeSt', 114 | stem_channels=128, 115 | depth=101, 116 | radix=2, 117 | reduction_factor=4, 118 | avg_down_stride=True, 119 | num_stages=4, 120 | out_indices=(0, 1, 2, 3), 121 | frozen_stages=1, 122 | norm_cfg=dict(type='BN', requires_grad=True), 123 | norm_eval=False, 124 | dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False), 125 | stage_with_dcn=(False, True, True, True), 126 | with_cp=True, 127 | style='pytorch'), 128 | neck=dict( 129 | type='FPN', 130 | in_channels=[256, 512, 1024, 2048], 131 | out_channels=256, 132 | add_extra_convs='on_output', 133 | num_outs=5), 134 | bbox_head=dict( 135 | type='GFLHead', 136 | num_classes=2, 137 | in_channels=256, 138 | stacked_convs=4, 139 | feat_channels=256, 140 | anchor_generator=dict( 141 | type='AnchorGenerator', 142 | ratios=[1.0], 143 | octave_base_scale=8, 144 | scales_per_octave=1, 145 | strides=[8, 16, 32, 64, 128]), 146 | loss_cls=dict( 147 | type='QualityFocalLoss', 148 | use_sigmoid=True, 149 | beta=2.0, 150 | loss_weight=1.0), 151 | loss_dfl=dict(type='DistributionFocalLoss', loss_weight=0.25), 152 | reg_max=24, 153 | loss_bbox=dict(type='GIoULoss', loss_weight=2.0))) 154 | train_cfg = dict( 155 | assigner=dict(type='ATSSAssigner', topk=9), 156 | allowed_border=-1, 157 | pos_weight=-1, 158 | debug=False) 159 | test_cfg = dict( 160 | nms_pre=1000, 161 | min_bbox_size=0, 162 | score_thr=0.5, 163 | nms=dict(type='nms', iou_threshold=0.4), 164 | max_per_img=100) 165 | optimizer = dict(type='Ranger', lr=0.0001) 166 | fp16 = dict(loss_scale='dynamic') 167 | work_dir = 'checkpoints/' 168 | gpu_ids = range(0, 1) 169 | -------------------------------------------------------------------------------- /utils/config_latex.py: -------------------------------------------------------------------------------- 1 | """regrex search""" 2 | import copy 3 | 4 | color_patterns = { 5 | # double_dollar_only: r"(? 1 and (possible_footnote[0] == "^" and possible_footnote[1:].isnumeric() or possible_footnote[0] == "^" and len(possible_footnote) < 5): 115 | return True 116 | if possible_footnote == "^\\dagger": 117 | return True 118 | return False 119 | 120 | 121 | """color entries""" 122 | color = { 123 | 'display': {'start': "\\textcolor{display}{", 'end': "}"}, 124 | 'display_lyx': {'start': "\\textcolor{display}{", 'end': "}"}, 125 | 'inline': {'start': "\\textcolor{inline}{", 'end': "}"}, 126 | 'section': {'start': "\\textcolor{header1}{", 'end': "}", 'start_correction': -9}, 127 | 'section2': {'start': "\\textcolor{header1*}{", 'end': "}", 'start_correction': -10}, 128 | 'section3': {'start': "\\textcolor{header1}{", 'end': "}", 'start_correction': -11}, 129 | 'section4': {'start': "\\textcolor{header1}{", 'end': "}", 'start_correction': 0}, 130 | 'subsection': {'start': "\\textcolor{header2}{", 'end': "}", 'start_correction': -12}, 131 | 'subsection2': {'start': "\\textcolor{header2*}{", 'end': "}", 'start_correction': -13}, 132 | 'subsection3': {'start': "\\textcolor{header2}{", 'end': "}", 'start_correction': 0}, 133 | 'subsubsection': {'start': "\\textcolor{header3}{", 'end': "}", 'start_correction': -15}, 134 | 'subsubsection2': {'start': "\\textcolor{header3*}{", 'end': "}", 'start_correction': -16}, 135 | 'subsubsection3': {'start': "\\textcolor{header3}{", 'end': "}", 'start_correction': 0}, 136 | 'figure': {'start': "", 'end': ", cfbox=figure 1pt 0pt"}, 137 | 'figure2': {'start': "[cfbox=figure 1pt 0pt]", 'end': ""}, 138 | 'figure3': {'start': "", 'end': ", cfbox=figure 1pt 0pt", 'end_correction': 0}, 139 | 'figure4': {'start': "cfbox=figure 1pt 0pt, ", 'end': ""}, 140 | 'figure5': {'start': "\\fcolorbox{figure}{white}{", 'end': "}", 'start_correction': -15, 'end_correction': 13}, 141 | 'figure6': {'start': "cfbox=figure 1pt 0pt, ", 'end': ""}, 142 | 'figure7': {'start': "\\fcolorbox{figure}{white}{", 'end': "}", 'start_correction': -9}, 143 | 'caption': {'start': "\\textcolor{caption}{", 'end': "}", 'start_correction': -9}, 144 | 'caption2': {'start': "\\textcolor{caption}{", 'end': "}"}, 145 | 'footnote': {'start': "\\textcolor{footnote_color}{", 'end': "}"}, 146 | 'footnote2': {'start': "\\textcolor{footnote_color}{", 'end': "}"}, 147 | 'footnote3': {'start': "\\textcolor{footnote_color}{", 'end': "}"}, 148 | 'table': {'start': "\\textcolor{table}{", 'end': "}", 'start_correction': -15, 'end_correction': 13}, 149 | 'table2': {'start': "\\color{table}", 'end': "\\color{black}", 'start_correction': 0, 'end_correction': 0}, 150 | 'table3': {'start': "\\textcolor{table}{", 'end': "}", 'start_correction': -17, 'end_correction': 15}, 151 | 'list': {'start': "\\textcolor{list}{", 'end': "}", 'start_correction': -15, 'end_correction': 13} 152 | } 153 | 154 | """remove custom margin expressions""" 155 | # custom margin expression. Remove to make the format more similar 156 | custom_margin = ['\\textwidth', '\\textheight', '\\hoffset', '\\voffset', '\\oddsidemargin', '\\parindent', 157 | '\\evensidemargin', '\\topmargin'] 158 | 159 | """new latex header""" 160 | _LATEX_PACKAGES = r""" 161 | \usepackage{xcolor} 162 | \usepackage{etoolbox} 163 | \usepackage{sectsty} 164 | \usepackage[export]{adjustbox} 165 | \usepackage[font={color=caption,bf}]{caption} 166 | """ 167 | _LATEX_COLOR = r""" 168 | \definecolor{display}{RGB}{255,0,255} 169 | \definecolor{inline0}{RGB}{0,255,0} 170 | \definecolor{inline1}{RGB}{0,250,0} 171 | \definecolor{inline2}{RGB}{0,245,0} 172 | \definecolor{inline3}{RGB}{0,240,0} 173 | \definecolor{inline4}{RGB}{0,235,0} 174 | \definecolor{inline5}{RGB}{0,230,0} 175 | \definecolor{inline6}{RGB}{0,225,0} 176 | \definecolor{inline7}{RGB}{0,220,0} 177 | \definecolor{inline8}{RGB}{0,215,0} 178 | \definecolor{inline9}{RGB}{0,210,0} 179 | \definecolor{inline10}{RGB}{0,205,0} 180 | \definecolor{inline11}{RGB}{0,200,0} 181 | \definecolor{inline12}{RGB}{0,195,0} 182 | \definecolor{inline13}{RGB}{0,190,0} 183 | \definecolor{inline14}{RGB}{0,185,0} 184 | \definecolor{inline15}{RGB}{0,180,0} 185 | \definecolor{inline16}{RGB}{0,175,0} 186 | \definecolor{inline17}{RGB}{0,170,0} 187 | \definecolor{inline18}{RGB}{0,165,0} 188 | \definecolor{inline19}{RGB}{0,160,0} 189 | \definecolor{table}{RGB}{0,0,255} 190 | \definecolor{figure}{RGB}{255,0,0} 191 | \definecolor{header1}{RGB}{240,255,0} 192 | \definecolor{header2}{RGB}{220,255,0} 193 | \definecolor{header3}{RGB}{200,255,0} 194 | \definecolor{header1*}{RGB}{230,255,0} 195 | \definecolor{header2*}{RGB}{210,255,0} 196 | \definecolor{header3*}{RGB}{190,255,0} 197 | \definecolor{caption}{RGB}{0,240,255} 198 | \definecolor{footnote_color}{RGB}{0,120,240} 199 | \definecolor{bibliography}{RGB}{255,100,0} 200 | \definecolor{list}{RGB}{255,0,120} 201 | """ 202 | 203 | _LATEX_NO_COLOR = r""" 204 | \definecolor{display}{RGB}{0,0,0} 205 | \definecolor{inline}{RGB}{0,0,0} 206 | \definecolor{inline1}{RGB}{0,0,0} 207 | \definecolor{inline2}{RGB}{0,0,0} 208 | \definecolor{inline3}{RGB}{0,0,0} 209 | \definecolor{inline4}{RGB}{0,0,0} 210 | \definecolor{inline5}{RGB}{0,0,0} 211 | \definecolor{inline6}{RGB}{0,0,0} 212 | \definecolor{inline7}{RGB}{0,0,0} 213 | \definecolor{inline8}{RGB}{0,0,0} 214 | \definecolor{inline9}{RGB}{0,0,0} 215 | \definecolor{inline10}{RGB}{0,0,0} 216 | \definecolor{inline11}{RGB}{0,0,0} 217 | \definecolor{inline12}{RGB}{0,0,0} 218 | \definecolor{inline13}{RGB}{0,0,0} 219 | \definecolor{inline14}{RGB}{0,0,0} 220 | \definecolor{inline15}{RGB}{0,0,0} 221 | \definecolor{inline16}{RGB}{0,0,0} 222 | \definecolor{inline17}{RGB}{0,0,0} 223 | \definecolor{inline18}{RGB}{0,0,0} 224 | \definecolor{inline19}{RGB}{0,0,0} 225 | \definecolor{table}{RGB}{0,0,0} 226 | \definecolor{figure}{RGB}{255,255,255} 227 | \definecolor{header1}{RGB}{0,0,0} 228 | \definecolor{header2}{RGB}{0,0,0} 229 | \definecolor{header3}{RGB}{0,0,0} 230 | \definecolor{caption}{RGB}{0,0,0} 231 | \definecolor{footnote_color}{RGB}{0,0,0} 232 | \definecolor{bibliography}{RGB}{0,0,0} 233 | \definecolor{list}{RGB}{0,0,0} 234 | """ 235 | _LATEX_ENV = r""" 236 | \renewcommand\footnoterule{\color{footnote_color}\kern-3pt \hrule width 2in \kern 2.6pt} 237 | \renewcommand\thefootnote{\textcolor{footnote_color}{\arabic{footnote}}} 238 | \AtBeginEnvironment{eqnarray}{\color{display}} 239 | \AtBeginEnvironment{eqnarray*}{\color{display}} 240 | \AtBeginEnvironment{equation}{\color{display}} 241 | \AtBeginEnvironment{equation*}{\color{display}} 242 | \AtBeginEnvironment{align}{\color{display}} 243 | \AtBeginEnvironment{align*}{\color{display}} 244 | \AtBeginEnvironment{flalign}{\color{display}} 245 | \AtBeginEnvironment{flalign*}{\color{display}} 246 | \AtBeginEnvironment{gather}{\color{display}} 247 | \AtBeginEnvironment{gather*}{\color{display}} 248 | \AtBeginEnvironment{multline}{\color{display}} 249 | \AtBeginEnvironment{multline*}{\color{display}} 250 | \AtBeginEnvironment{alignat}{\color{display}} 251 | \AtBeginEnvironment{alignat*}{\color{display}} 252 | \AtBeginEnvironment{eqalign}{\color{display}} 253 | \AtBeginEnvironment{eqalign*}{\color{display}} 254 | \AtBeginEnvironment{eqalignno}{\color{display}} 255 | \AtBeginEnvironment{eqalignno*}{\color{display}} 256 | \AtBeginEnvironment{aligned}{\color{display}} 257 | \AtBeginEnvironment{aligned*}{\color{display}} 258 | \AtBeginEnvironment{be}{\color{display}} 259 | \AtBeginEnvironment{bea}{\color{display}} 260 | \AtBeginEnvironment{beq}{\color{display}} 261 | \AtBeginEnvironment{beeq}{\color{display}} 262 | \AtBeginEnvironment{ber}{\color{display}} 263 | \AtBeginEnvironment{beqa}{\color{display}} 264 | \AtBeginEnvironment{displaymath}{\color{display}} 265 | \AtBeginEnvironment{subeqnarray}{\color{display}} 266 | \AtBeginEnvironment{figure}{\color{caption}} 267 | \AtBeginEnvironment{figure*}{\color{caption}} 268 | \AtBeginEnvironment{table}{\color{table}} 269 | \AtBeginEnvironment{thebibliography}{\color{bibliography}} 270 | \AtBeginEnvironment{itemize}{\color{list}} 271 | \AtBeginEnvironment{enumerate}{\color{list}} 272 | \AtBeginEnvironment{description}{\color{list}} 273 | """ 274 | 275 | LATEX_COLOR = _LATEX_PACKAGES + _LATEX_COLOR + _LATEX_ENV 276 | LATEX_NO_COLOR = _LATEX_PACKAGES + _LATEX_NO_COLOR + _LATEX_ENV 277 | -------------------------------------------------------------------------------- /utils/walker_download.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import copy 4 | from typing import Tuple 5 | import numpy as np 6 | from pathlib import Path 7 | import utils.config_latex as config_latex 8 | import regex as re 9 | 10 | 11 | def process_environment(data: str, search_pattern: str, env_type: str, ignore_pos: list = None) -> Tuple[list, str]: 12 | """Process LaTeX environment.""" 13 | 14 | try: 15 | start_pos = re.search(r"\\begin\{document\}", data).end() 16 | except AttributeError: 17 | start_pos = 0 18 | if env_type == 'inline': 19 | start_pos = 0 20 | vertices_ptrs = [] 21 | ignore_pos = ignore_pos or [] 22 | regex = re.compile(search_pattern) 23 | start_pattern = False 24 | if ").*?" in search_pattern and env_type != 'display': 25 | start_pattern = search_pattern[9:search_pattern.find(").*?")].replace("\\{", "{") 26 | end_pattern = False 27 | if env_type == 'inline' or env_type == 'footnote': 28 | end_pattern = '$' 29 | if search_pattern[search_pattern.find('.*?(?=') + 6:-1] == '\\}': 30 | end_pattern = '}' 31 | while start_pos <= len(data): 32 | pattern = regex.search(data, pos=start_pos) 33 | # if found a pattern 34 | if pattern: 35 | # get start end position 36 | start, end = pattern.span() 37 | if (env_type == 'display_lyx' or env_type == 'inline_lyx') and data[start - 3:start - 2] == '\\': 38 | start_pos = start + 1 39 | continue 40 | if (env_type == 'display_lyx' or env_type == 'inline_lyx') and data[end - 1:end] == '\\': 41 | end_pattern_found = False 42 | while not end_pattern_found: 43 | end += 2 44 | if env_type == 'inline_lyx': 45 | end_new = data[end:].find('\\)') 46 | else: 47 | end_new = data[end:].find('\\]') 48 | end += end_new 49 | if data[end - 1:end] != '\\': 50 | end_pattern_found = True 51 | if env_type == 'caption2': 52 | if end - start < 30: 53 | start = end + 2 54 | end = start + data[start:].find('}') 55 | start += 1 56 | 57 | # check that no open brackets and $ exists 58 | while end_pattern: 59 | open_brackets = len( 60 | [match for match in re.finditer('{', data[start:end]) if data[start + match.span()[0] - 1] != '\\']) 61 | closed_brackets = len( 62 | [match for match in re.finditer('}', data[start:end]) if data[start + match.span()[0] - 1] != '\\']) 63 | dollar_signs = len([match for match in re.finditer('\$', data[start:end]) if 64 | data[start + match.span()[0] - 1] != '\\']) 65 | if open_brackets > closed_brackets or dollar_signs % 2 != 0: 66 | end += 1 67 | new_end = data[end:].find(end_pattern) 68 | if new_end == -1: 69 | break 70 | if end_pattern == '$': 71 | new_end += 1 72 | end += new_end 73 | else: 74 | break 75 | if env_type == 'display': 76 | start -= 2 77 | end += 2 78 | found = pattern.group() 79 | # make further content checks 80 | skip = config_latex.ignore_content(found, env_type) 81 | # check if $ is a $ sign instead of the inline env 82 | if (env_type == 'inline' or env_type == 'display') and not skip: 83 | # fixes $ signs in latex code 84 | if data[start - 1:start + 1] == '\\$' and data[start - 2:start + 1] != '\\\\$': 85 | start_pos = start + 2 86 | continue 87 | # adds word before _123 88 | if 'inline' in env_type: 89 | temp = copy.deepcopy(data[start:end]) 90 | temp = temp.replace('$', '').replace(" ", "").replace("{", '').replace("}", "") 91 | if len(temp) >= 2 and temp[0] == '_' and data[start - 1] != ' ': 92 | word_before = data[start - 10:start].split()[-1] 93 | start -= len(word_before) 94 | 95 | # check ignore positions 96 | end_ignore_pos = False 97 | for pos in ignore_pos: 98 | if pos[1] >= start >= pos[0]: 99 | skip = True 100 | end_ignore_pos = pos[1] 101 | if start == end_ignore_pos: 102 | end_ignore_pos = False 103 | break 104 | if env_type == 'figure2': 105 | start -= 1 106 | # check if definition of latex function 107 | if start_pattern: 108 | before = data[start - 15 - len(start_pattern):start - len(start_pattern)] 109 | before = before.replace(" ", "") 110 | if """\\newcommand{""" in before or """\\renewcommand{""" in before or """\\def""" in before: 111 | skip = True 112 | # if skip make no coloring 113 | if skip: 114 | if end_ignore_pos: 115 | start_pos = end_ignore_pos 116 | elif found == '': 117 | start_pos = end + 1 118 | else: 119 | start_pos = end 120 | continue 121 | else: 122 | vertices_ptrs.append((start, end)) 123 | start_pos = end 124 | if 'footnote' in env_type and found == '' or start == end: 125 | start_pos = end + 1 126 | if env_type == 'display': 127 | start_pos += 2 128 | else: 129 | break 130 | return vertices_ptrs 131 | 132 | 133 | def color_latex_code(data): 134 | # copy 135 | old_latex = copy.deepcopy(data) 136 | new_latex = [] 137 | 138 | # modify header 139 | for index, row in enumerate(old_latex.split("\n")): 140 | if row.startswith("%"): 141 | row = '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%' 142 | new_latex.append(row) 143 | continue 144 | if '%' in row: 145 | temp_i = row.find('%') 146 | if row[temp_i - 1] != "\\": 147 | row = row[:temp_i] 148 | if all([characters in [" "] for characters in row]): 149 | continue 150 | if "\\def\\thefootnote" in row: 151 | continue 152 | if '\\providecommand{\\LyX}' in row or '\\newcommand{\\LyX}' in row: 153 | lyx = True 154 | if "documentclass" in row: 155 | temp_i = row.find(']{') 156 | if temp_i > 0: 157 | temp_i += 2 158 | # row = f"{row[:temp_i]}article{row[-1]}" 159 | new_latex.append(row) 160 | new_latex.append(config_latex.LATEX_NO_COLOR) 161 | elif '\\usepackage[dvips]{graphicx}' in row: 162 | new_latex.append('\\usepackage{graphicx}') 163 | elif '\\usepackage{hyperref}' in row: 164 | new_latex.append('\\usepackage[hidelinks]{hyperref}') 165 | elif "documentstyle" in row: 166 | return "old_latex_version" 167 | else: 168 | new_latex.append(row) 169 | new_latex = "\n".join(new_latex) 170 | 171 | # colorize all non math text 172 | ignore_pos = get_ignore_positions(new_latex) 173 | color_patterns_non_math = {} 174 | for search_element in ["\\begin ", "\\end "]: 175 | new_latex = new_latex.replace(search_element, search_element[:-1]) 176 | for key, value in config_latex.color_patterns.items(): 177 | if 'inline' not in key and 'display' not in key: 178 | color_patterns_non_math[key] = value 179 | for key in color_patterns_non_math.keys(): 180 | color_positions = process_environment(new_latex, config_latex.color_patterns[key], str(key), ignore_pos[key]) 181 | new_latex = color_it(new_latex, {key: color_positions}) 182 | 183 | # colorize display math formulas 184 | ignore_pos = get_ignore_positions(new_latex) 185 | color_patterns_math = {} 186 | for key, value in config_latex.color_patterns.items(): 187 | if 'inline' in key or 'display' in key: 188 | color_patterns_math[key] = value 189 | color_positions = {key: process_environment(new_latex, config_latex.color_patterns[key], str(key), ignore_pos[key]) 190 | for key in color_patterns_math.keys()} 191 | color_positions = combine_color_positions(color_positions) 192 | new_latex = color_it(new_latex, color_positions) 193 | return new_latex 194 | 195 | 196 | def combine_color_positions(color_positions): 197 | for key in color_positions.keys(): 198 | if not color_positions[key]: 199 | continue 200 | while True: 201 | start = -1 202 | for ci, c in enumerate(color_positions[key]): 203 | if c[0] <= start: 204 | color_positions[key][ci - 1] = (min(color_positions[key][ci - 1][0], color_positions[key][ci][0]), 205 | max(color_positions[key][ci - 1][1], color_positions[key][ci][1])) 206 | color_positions[key].pop(ci) 207 | break 208 | start = c[1] 209 | if ci == len(color_positions[key]) - 1: 210 | break 211 | return color_positions 212 | 213 | 214 | def get_ignore_positions(new_latex): 215 | ignore_positions = {} 216 | for key in config_latex.ignore_patterns.keys(): 217 | ignore_pos = {k: find_ignore_pos(new_latex, config_latex.ignore_patterns[key][k]) 218 | for k in config_latex.ignore_patterns[key].keys()} 219 | if ", " in key: 220 | key = key.split(", ") 221 | else: 222 | key = [key] 223 | for k in key: 224 | if k not in ignore_positions: 225 | ignore_positions[k] = {} 226 | for kk in ignore_pos.keys(): 227 | ignore_positions[k][kk] = ignore_pos[kk] 228 | for key in ignore_positions: 229 | if 'bea' in ignore_positions[key]: 230 | for pos_i, pos in enumerate(ignore_positions[key]['bea']): 231 | if new_latex[pos[0]:pos[1]].find('\\beq') > -1: 232 | ignore_positions[key]['bea'][pos_i] = (pos[0], pos[0] + new_latex[pos[0]:pos[1]].find('\\beq')) 233 | ignore_pos = {key: [] for key in config_latex.color_patterns.keys()} 234 | for key in ignore_positions.keys(): 235 | for v in ignore_positions[key].values(): 236 | ignore_pos[key] += v 237 | return ignore_pos 238 | 239 | 240 | def find_ignore_pos(data: str, search_pattern: str) -> Tuple[list, str]: 241 | """Process LaTeX environment.""" 242 | vertices_ptrs = [] 243 | start_pos = 0 244 | regex = re.compile(search_pattern) 245 | end_pattern = f"(?s)(?={search_pattern[search_pattern.find('.*?(?=') + 6:]}" 246 | if search_pattern == r"(?s)(?<=\$\$).*?(?=\$\$)": 247 | end_pattern == r"(?s)(?=\$\$)" 248 | while start_pos <= len(data): 249 | pattern = regex.search(data, pos=start_pos) 250 | # if found a pattern 251 | if pattern: 252 | # get start end position 253 | start, end = pattern.span() 254 | # check that no open brackets and $ exists 255 | while end_pattern: 256 | open_brackets = len([match for match in re.finditer('{', data[start:end]) if 257 | data[start + match.span()[0] - 1] != '\\']) 258 | closed_brackets = len([match for match in re.finditer('}', data[start:end]) if 259 | data[start + match.span()[0] - 1] != '\\']) 260 | dollar_signs = len([match for match in re.finditer('\$', data[start:end]) if 261 | data[start + match.span()[0] - 1] != '\\']) 262 | if open_brackets > closed_brackets or dollar_signs % 2 != 0: 263 | new_end = re.search(end_pattern, data, pos=end + 1) 264 | if new_end: 265 | end = new_end.span()[0] 266 | else: 267 | break 268 | else: 269 | break 270 | if start == end: 271 | start_pos = end + 1 272 | continue 273 | found = data[start:end] 274 | if '\\begin{document}' in found: 275 | start_pos = start + 1 276 | continue 277 | vertices_ptrs.append((start, end)) 278 | start_pos = end 279 | else: 280 | break 281 | return vertices_ptrs 282 | 283 | 284 | def color_it(data, color_positions): 285 | start_pos = 0 286 | result = "" 287 | while start_pos < len(data): 288 | color_positions = {key: value for key, value in color_positions.items() if value} 289 | if color_positions: 290 | key = list(color_positions.keys())[np.argmin([value[0][0] for key, value in color_positions.items()])] 291 | start = color_positions[key][0][0] 292 | end = color_positions[key][0][1] 293 | if not 'inline' in key: 294 | if 'start_correction' in config_latex.color[key]: 295 | start += config_latex.color[key]['start_correction'] 296 | if 'end_correction' in config_latex.color[key]: 297 | end += config_latex.color[key]['end_correction'] 298 | if key == 'inline_lyx': 299 | start -= 2 300 | end += 2 301 | if key == 'figure3': 302 | end += data[start:].find('}') - 1 303 | if '}' in data[end - 1:end]: 304 | end -= 1 305 | head = data[start_pos:start] 306 | found = data[start:end] 307 | if 'inline' in key: 308 | result += f"{head}{config_latex.color['inline']['start']}{found}{config_latex.color['inline']['end']}" 309 | else: 310 | result += f"{head}{config_latex.color[key]['start']}{found}{config_latex.color[key]['end']}" 311 | start_pos = end 312 | color_positions[key].pop(0) 313 | 314 | for k in color_positions.keys(): 315 | rerun = True 316 | while rerun: 317 | rerun = False 318 | if color_positions[k] and color_positions[k][0][0] < start_pos: 319 | color_positions[k].pop(0) 320 | rerun = True 321 | else: 322 | result += f"{data[start_pos:]}" 323 | start_pos = len(data) 324 | return result 325 | 326 | 327 | def color_with_regex(main_tex: Path, config): 328 | """Process Latex source code and set color for math environments.""" 329 | 330 | try: 331 | data = main_tex.read_text() 332 | except UnicodeDecodeError: 333 | shutil.rmtree(main_tex.parent) 334 | return 'Error' 335 | # NOTE: Make a black and white version 336 | vanilla_data = color_latex_code(data, True, config) 337 | # remove old latex versions 338 | if vanilla_data == 'old_latex_version': 339 | shutil.rmtree(main_tex.parent) 340 | return 'old_latex_version' 341 | vanilla_out = main_tex.parent / f"vanilla_{main_tex.name}" 342 | vanilla_out.write_text(vanilla_data) 343 | 344 | colored_data = color_latex_code(data, False, config) 345 | colored_out = main_tex.parent / f"colored_{main_tex.name}" 346 | colored_out.write_text(colored_data) 347 | return 'done' 348 | 349 | 350 | def compile_pdf(source_dir, waiting_time): 351 | """Compile source directory.""" 352 | if source_dir is None: 353 | return False 354 | 355 | if source_dir.is_file(): 356 | source_dir = source_dir.parent 357 | 358 | latexmk = subprocess.Popen(["latexmk", "-pdfdvi", "-interaction=nonstopmode", "-quiet", "-f"], cwd=source_dir, 359 | stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) 360 | try: 361 | latexmk.wait(timeout=waiting_time) # increase time 362 | except subprocess.TimeoutExpired: 363 | latexmk.kill() 364 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. --------------------------------------------------------------------------------