├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── bin
│   ├── kill.bash
│   ├── kll
│   ├── runChrome
│   ├── runCode
│   ├── runFilemerge.bash
│   ├── runJupyter.bash
│   ├── runJupyterLab.bash
│   ├── runSpyder.bash
│   ├── runTensorboard.bash
│   ├── runTensorboard2.bash
│   ├── runi2l.bash
│   └── start_ssh_agent
├── docs
│   └── index.html
├── papers
│   ├── 4878-understanding-dropout.pdf
│   ├── ADAM Optimizer.pdf
│   ├── BLEU.pdf
│   ├── BLEU_SmoothingTechniques.pdf
│   ├── CTC_paper.pdf
│   ├── ConvNet.numbers
│   ├── Dropout.pdf
│   ├── Google_NMT_System.pdf
│   ├── Grammar As A Foreign Language.pdf
│   ├── Image-to-Markup Generation with Coarse-to-Fine Attention.pdf
│   ├── LatexCommands.pdf
│   ├── Learning to combine foveal glimpses with a third-order Boltzmann machine.pdf
│   ├── MULTIPLE OBJECT RECOGNITION WITH VISUAL ATTENTION.pdf
│   ├── Neural Machine Translation by Jointly Learning to Align and Translate.pdf
│   ├── Recurrent Models of Visual Attention.pdf
│   ├── Recurrent Neural Network Regularization.pdf
│   ├── Show, Attend and Tell- Neural Image Caption Generation with Visual Attention slides.pdf
│   ├── Show, Attend and Tell- Neural Image Caption Generation with Visual Attention.pdf
│   ├── VERY DEEP CONVOLUTIONAL NETWORKS FOR LARGE-SCALE IMAGE RECOGNITION.pdf
│   ├── Visualizing and understanding convolutional networks slides.pdf
│   ├── Visualizing and understanding convolutional networks.pdf
│   ├── What You Get Is What You See- A Visual Markup Decompiler.pdf
│   ├── amsldoc.pdf
│   ├── candidate_sampling.pdf
│   ├── dropout_hinton.pdf
│   ├── glorot10a.pdf
│   └── symbols-letter.pdf
├── src
│   ├── README.md
│   ├── commons
│   │   ├── data_commons.py
│   │   ├── data_reader.py
│   │   ├── dl_commons.py
│   │   ├── dl_commons_tests.py
│   │   ├── pub_commons.py
│   │   ├── test_tf_commons.py
│   │   ├── tf_commons.py
│   │   └── viz_commons.py
│   ├── convnet.py
│   ├── model
│   │   ├── CALSTM.py
│   │   ├── Im2LatexModel.py
│   │   ├── hyper_params.py
│   │   ├── tf_dynamic_decode.py
│   │   └── tf_tutorial_code.py
│   ├── postprocessing
│   │   └── evaluate_images.ipynb
│   ├── preprocessing
│   │   ├── README.md
│   │   ├── preprocessing_step_0.ipynb
│   │   ├── preprocessing_step_1.ipynb
│   │   ├── preprocessing_step_2_tokenizer.ipynb
│   │   ├── preprocessing_step_3_filter.ipynb
│   │   ├── preprocessing_step_4_binning.ipynb
│   │   └── preprocessing_step_5_padding.ipynb
│   ├── run.py
│   ├── tools
│   │   ├── bulk_disp_alpha.py
│   │   ├── diff_params.ipynb
│   │   ├── disp.ipynb
│   │   ├── disp_alpha.ipynb
│   │   ├── eval_runs.ipynb
│   │   ├── predict.ipynb
│   │   ├── prune_logs.ipynb
│   │   ├── publishing.ipynb
│   │   ├── sample_preds.ipynb
│   │   ├── sample_strs.ipynb
│   │   └── visualize.ipynb
│   └── train_multi_gpu.py
└── thirdparty
    ├── data
    │   ├── im2latex_formulas_downloaded.lst
    │   └── im2latex_formulas_downloaded.norm.lst
    └── harvardnlp_im2markup
        ├── LICENSE
        ├── Readme.md
        ├── scripts
        │   ├── evaluation
        │   │   ├── LevSeq.py
        │   │   ├── distance
        │   │   │   ├── __init__.py
        │   │   │   ├── _fastcomp.py
        │   │   │   ├── _iterators.py
        │   │   │   ├── _lcsubstrings.py
        │   │   │   ├── _levenshtein.py
        │   │   │   ├── _pyimports.py
        │   │   │   └── _simpledists.py
        │   │   ├── evaluate_bleu.py
        │   │   ├── evaluate_image.py
        │   │   ├── evaluate_text_edit_distance.py
        │   │   ├── render_html.py
        │   │   └── render_latex.py
        │   ├── preprocessing
        │   │   ├── generate_latex_vocab.py
        │   │   ├── preprocess_filter.py
        │   │   ├── preprocess_formulas.py
        │   │   ├── preprocess_images.py
        │   │   └── preprocess_latex.js
        │   └── utils
        │       ├── image_utils.py
        │       └── utils.py
        └── third_party
            ├── katex
            │   ├── .#katex.js
            │   ├── LICENSE.txt
            │   ├── README.md
            │   ├── cli.js
            │   ├── katex.js
            │   ├── package.json
            │   └── src
            │       ├── Lexer.js
            │       ├── Options.js
            │       ├── ParseError.js
            │       ├── Parser.js
            │       ├── Settings.js
            │       ├── Style.js
            │       ├── buildCommon.js
            │       ├── buildHTML.js
            │       ├── buildMathML.js
            │       ├── buildTree.js
            │       ├── delimiter.js
            │       ├── domTree.js
            │       ├── environments.js
            │       ├── fontMetrics.js
            │       ├── fontMetricsData.js
            │       ├── functions.js
            │       ├── mathMLTree.js
            │       ├── parseData.js
            │       ├── parseTree.js
            │       ├── symbols.js
            │       └── utils.js
            ├── match-at
            │   ├── README.md
            │   └── package.json
            └── multi-bleu.perl
/.gitattributes:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/.gitattributes
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | data
104 | data/
105 | ShowAndTellSrc/
106 | tb_metrics_*/
107 | tb_metrics/
108 | logdir/
109 | *.out
110 | .vscode/
111 | *.py~
112 | *.tgz
113 | bin/conda
114 | bin/activate
115 | bin/deactivate
116 | .DS_Store
117 | ._.DS_Store
118 | .idea/
119 | zpool_3TB
120 | scratch.*
121 | gallery/
122 | gallery
123 |
--------------------------------------------------------------------------------
/bin/kill.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pgrep -fl "$1"    # list processes matching the pattern
3 | pkill -fa "$1"    # kill them
4 | pgrep -fl "$1"    # list any that survived
5 |
--------------------------------------------------------------------------------
/bin/kll:
--------------------------------------------------------------------------------
1 | kill.bash
--------------------------------------------------------------------------------
/bin/runChrome:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | google-chrome --disable-gpu &
3 |
--------------------------------------------------------------------------------
/bin/runCode:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | code --disable-gpu
3 |
--------------------------------------------------------------------------------
/bin/runFilemerge.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | open /Applications/Xcode.app/Contents/Applications/FileMerge.app
3 |
--------------------------------------------------------------------------------
/bin/runJupyter.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOGFILE=~/logs/jupyter.out
3 | cd ~
4 | nohup jupyter notebook --ip=* --port 50001 > $LOGFILE 2>&1 &
5 | tail -f $LOGFILE
6 |
--------------------------------------------------------------------------------
/bin/runJupyterLab.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOGFILE=~/logs/jupyter_lab.out
3 | cd ~
4 | nohup jupyter lab --ip=* --port 50003 > $LOGFILE 2>&1 &
5 | tail -f $LOGFILE
6 |
--------------------------------------------------------------------------------
/bin/runSpyder.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOGFILE=~/logs/spyder.out
3 | cd ~
4 | nohup spyder > $LOGFILE 2>&1 &
5 | tail -f $LOGFILE
6 |
--------------------------------------------------------------------------------
/bin/runTensorboard.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOGFILE=~/logs/tensorboard.out
3 | cd ~
4 | nohup tensorboard --logdir ~/predictions/logdir --purge_orphaned_data --port 50002 > $LOGFILE 2>&1 &
5 | tail -f $LOGFILE
6 |
--------------------------------------------------------------------------------
/bin/runTensorboard2.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOGFILE=~/logs/tensorboard2.out
3 | cd ~
4 | nohup tensorboard --logdir ~/im2latex/src/tb_metrics_dev --purge_orphaned_data --port 50003 > $LOGFILE 2>&1 &
5 | tail -f $LOGFILE
6 |
--------------------------------------------------------------------------------
/bin/runi2l.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | LOGFILE=~/logs/run.out
3 | cd ~
4 | rm -f "$LOGFILE"
5 | ## ./run.py -e -1 -b 40 -p -i 0 --r-lambda 0.00005 -k 1.0 -w 10 --squash-input-seq --logdir ./tb_metrics_dev
6 | ## ./run.py -e -1 -b 40 -p -i 0 --r-lambda 0.00005 -k 1.0 -w 10 --squash-input-seq --logdir ./tb_metrics_dev --logdir-tag test_3.1LSTM_2init_3out_3attConv_1beta
7 | nohup ./run.py -e -1 -b 64 -w 10 -k 1.0 -p -i 0 >$LOGFILE 2>&1 &
8 | tail -f $LOGFILE
9 |
--------------------------------------------------------------------------------
/bin/start_ssh_agent:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | eval "$(ssh-agent -s)"
3 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/papers/4878-understanding-dropout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/4878-understanding-dropout.pdf
--------------------------------------------------------------------------------
/papers/ADAM Optimizer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/ADAM Optimizer.pdf
--------------------------------------------------------------------------------
/papers/BLEU.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/BLEU.pdf
--------------------------------------------------------------------------------
/papers/BLEU_SmoothingTechniques.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/BLEU_SmoothingTechniques.pdf
--------------------------------------------------------------------------------
/papers/CTC_paper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/CTC_paper.pdf
--------------------------------------------------------------------------------
/papers/ConvNet.numbers:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/ConvNet.numbers
--------------------------------------------------------------------------------
/papers/Dropout.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Dropout.pdf
--------------------------------------------------------------------------------
/papers/Google_NMT_System.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Google_NMT_System.pdf
--------------------------------------------------------------------------------
/papers/Grammar As A Foreign Language.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Grammar As A Foreign Language.pdf
--------------------------------------------------------------------------------
/papers/Image-to-Markup Generation with Coarse-to-Fine Attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Image-to-Markup Generation with Coarse-to-Fine Attention.pdf
--------------------------------------------------------------------------------
/papers/LatexCommands.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/LatexCommands.pdf
--------------------------------------------------------------------------------
/papers/Learning to combine foveal glimpses with a third-order Boltzmann machine.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Learning to combine foveal glimpses with a third-order Boltzmann machine.pdf
--------------------------------------------------------------------------------
/papers/MULTIPLE OBJECT RECOGNITION WITH VISUAL ATTENTION.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/MULTIPLE OBJECT RECOGNITION WITH VISUAL ATTENTION.pdf
--------------------------------------------------------------------------------
/papers/Neural Machine Translation by Jointly Learning to Align and Translate.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Neural Machine Translation by Jointly Learning to Align and Translate.pdf
--------------------------------------------------------------------------------
/papers/Recurrent Models of Visual Attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Recurrent Models of Visual Attention.pdf
--------------------------------------------------------------------------------
/papers/Recurrent Neural Network Regularization.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Recurrent Neural Network Regularization.pdf
--------------------------------------------------------------------------------
/papers/Show, Attend and Tell- Neural Image Caption Generation with Visual Attention.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Show, Attend and Tell- Neural Image Caption Generation with Visual Attention.pdf
--------------------------------------------------------------------------------
/papers/VERY DEEP CONVOLUTIONAL NETWORKS FOR LARGE-SCALE IMAGE RECOGNITION.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/VERY DEEP CONVOLUTIONAL NETWORKS FOR LARGE-SCALE IMAGE RECOGNITION.pdf
--------------------------------------------------------------------------------
/papers/Visualizing and understanding convolutional networks slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Visualizing and understanding convolutional networks slides.pdf
--------------------------------------------------------------------------------
/papers/Visualizing and understanding convolutional networks.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Visualizing and understanding convolutional networks.pdf
--------------------------------------------------------------------------------
/papers/What You Get Is What You See- A Visual Markup Decompiler.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/What You Get Is What You See- A Visual Markup Decompiler.pdf
--------------------------------------------------------------------------------
/papers/amsldoc.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/amsldoc.pdf
--------------------------------------------------------------------------------
/papers/candidate_sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/candidate_sampling.pdf
--------------------------------------------------------------------------------
/papers/dropout_hinton.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/dropout_hinton.pdf
--------------------------------------------------------------------------------
/papers/glorot10a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/glorot10a.pdf
--------------------------------------------------------------------------------
/papers/symbols-letter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/symbols-letter.pdf
--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
1 | # Notes about the Framework
2 | 1. Input is streamed rather than loaded into memory all at once
3 | 2. Supports training on 'n' GPUs
4 | 3. Snapshots are taken every 'n' epochs or based on other dynamic conditions (e.g. best observed validation accuracy)
5 | 4. A snapshot is also taken when model training is interrupted
6 | 5. Metrics are viewable in TensorBoard
7 | 6. All hyperparameters are saved alongside the model weights
8 | 7. A very flexible class for specifying hyperparameters, covering the model architecture as well as training parameters (see the sketch below)
9 | 8. ...
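10 |
11 | A minimal sketch of the hyperparameter class mentioned in item 7, based on `dl_commons.Params` as exercised in `src/commons/dl_commons_tests.py` (the class name and fields below are illustrative, not part of the framework):
12 |
13 | ```python
14 | from dl_commons import Params, PD, integer
15 |
16 | class MyHyper(Params):
17 |     proto = (
18 |         PD('B', 'batch size', integer(), 64),
19 |         PD('D', 'embedding dimension', integer(), 512),
20 |     )
21 |     def __init__(self, initVals={}):
22 |         Params.__init__(self, self.proto, initVals)
23 |
24 | hyper = MyHyper({'B': 32}).freeze()  # override a default, then make the params immutable
25 | ```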
--------------------------------------------------------------------------------
/src/commons/data_commons.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright 2017 Sumeet S Singh
5 |
6 | This file is part of im2latex solution by Sumeet S Singh.
7 |
8 | This program is free software: you can redistribute it and/or modify
9 | it under the terms of the GNU Affero General Public License as published by
10 | the Free Software Foundation, either version 3 of the License, or
11 | (at your option) any later version.
12 |
13 | This program is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU Affero General Public License for more details.
17 |
18 | You should have received a copy of the GNU Affero General Public License
19 | along with this program. If not, see <http://www.gnu.org/licenses/>.
20 |
21 | Created on Mon Jul 17 19:58:00 2017
22 |
23 | @author: Sumeet S Singh
24 | """
25 | import os
26 | import time
27 | import logging
28 | # from six.moves import cPickle as pickle
29 | import dill as pickle
30 | import numpy as np
31 | import pandas as pd
32 | import h5py
32 |
33 | dict_id2word = None
34 | i2w_ufunc = None
35 | logger = logging
36 |
37 |
38 | def setLogLevel(logger_, level):
39 | logging_levels = (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG)
40 | logger_.setLevel(logging_levels[level - 1])
41 |
42 |
43 | def makeFormatter():
44 | return logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
45 |
46 |
47 | def makeLogger(logging_level=3, name='default', set_global=False):
48 | global logger
49 | logger_ = logging.Logger(name)
50 | ch = logging.StreamHandler()
51 | ch.setFormatter(makeFormatter())
52 | logger_.addHandler(ch)
53 | setLogLevel(logger_, logging_level)
54 | if set_global:
55 | logger = logger_
56 | return logger_
57 |
58 |
59 | def initialize(training_data_dir, params):
60 | global i2w_ufunc, dict_id2word
61 | # if logger is None:
62 | # logger = params.logger
63 | if i2w_ufunc is None:
64 | data_props = load(training_data_dir, 'data_props.pkl')
65 | dict_id2word = data_props['id2word']
66 | K = len(dict_id2word.keys())
67 | CTCBlankTokenID = params.CTCBlankTokenID
68 | if (CTCBlankTokenID is not None) and (CTCBlankTokenID >= K):
69 | dict_id2word[CTCBlankTokenID] = u'<>' ## CTC Blank Token
70 | dict_id2word[-1] = u'<-1>' ## Catch -1s that beamsearch emits after EOS.
71 | def i2w(id):
72 | try:
73 | return dict_id2word[id]
74 | except KeyError as e:
75 | logger.critical('i2w: KeyError: %s', e)
76 | return '<%d>'%(id,)
77 | i2w_ufunc = np.frompyfunc(i2w, 1, 1)
78 | return i2w_ufunc
79 |
80 |
81 | def seq2str(arr, label, separator=None):
82 | """
83 | Converts a matrix of id-sequences - shaped (B,T) - to an array of strings shaped (B,).
84 |     Uses the supplied dict_id2word to map ids to words. The dictionary must map ids
85 |     of arr's dtype to strings.
86 | """
87 | assert i2w_ufunc is not None, "i2w_ufunc is None. Please call initialize first in order to setup i2w_ufunc."
88 | str_arr = i2w_ufunc(arr) # (B, T)
89 | if separator is None:
90 | func1d = lambda vec: label + u" " + u"".join(vec)
91 | else:
92 | func1d = lambda vec: label + u" " + unicode(separator).join(vec)
93 | return [func1d(vec) for vec in str_arr]
94 |
95 |
96 | def join(*paths):
97 | return os.path.join(*paths)
98 |
99 |
100 | def dump(ar, *paths):
101 | path = join(*paths)
102 | assert not os.path.exists(path), 'A file already exists at path %s'%path
103 | with open(path, 'wb') as f:
104 | pickle.dump(ar, f, pickle.HIGHEST_PROTOCOL)
105 |
106 |
107 | def load(*paths):
108 | with open(join(*paths), 'rb') as f:
109 | return pickle.load(f)
110 |
111 |
112 | class Storer(object):
113 | def __init__(self, args, prefix, step):
114 | self._path = os.path.join(args.storedir, '%s_%d.h5'%(prefix, step))
115 | self._h5 = h5py.File(self._path, mode="w-", swmr=False)
116 |
117 | def __enter__(self):
118 | return self
119 |
120 | def __exit__(self, *err):
121 | self.close()
122 |
123 | def flush(self):
124 | self._h5.flush()
125 |
126 | def close(self):
127 | self._h5.close()
128 |
129 | def write(self, key, ar, dtype=None, batch_axis=0, doUnwrap=True):
130 | """
131 | WARNING: ar must either be an numpy.ndarray (not numpy scalar) or a python list/tuple of numpy.ndarray.
132 | Nothing else will work.
133 | :param key:
134 | :param ar:
135 | :param dtype:
136 | :param batch_axis:
137 | :param doUnwrap:
138 | :return:
139 | """
140 | if (isinstance(ar, tuple) or isinstance(ar, list)) and doUnwrap:
141 | return self._write(key, ar, dtype, batch_axis)
142 | else:
143 | return self._write(key, [ar], dtype, batch_axis)
144 |
145 | def _write(self, key, np_ar_list, dtype, batch_axis):
146 | """
147 | WARNING: np_ar_list must be a python list/tuple of numpy.ndarray. Nothing else will work.
148 |
149 | Stacks the tensors in the list along axis=batch_axis and writes them to disk.
150 | Dimensions along axis=batch_axis are summed up (since we're stacking along that dimension).
151 | Other dimensions are padded to the maximum size
152 | with a dtype-suitable value (np.nan for float, -2 for integer)
153 | """
154 | ## Assuming all arrays have same rank, find the max dims
155 | shapes = [ar.shape for ar in np_ar_list]
156 | dims = zip(*shapes)
157 | max_shape = [max(d) for d in dims]
158 | ## We'll concatenate all arrays along axis=batch_axis
159 | max_shape[batch_axis] = sum(dims[batch_axis])
160 | if dtype == np.unicode_:
161 | dt = h5py.special_dtype(vlen=unicode)
162 | dataset = self._h5.create_dataset(key, max_shape, dtype=dt)
163 | else:
164 | dataset = self._h5.create_dataset(key, max_shape, dtype=dtype, fillvalue=-2 if np.issubdtype(dtype, np.integer) else np.nan)
165 |
166 | def make_slice(row, shape, batch_axis):
167 | """
168 | Create a slice to place shape into the receiving dataset starting at rownum along axis=batch_axis,
169 | and starting at 0 along all other axes
170 | """
171 | s = [slice(0,d) for d in shape]
172 | s[batch_axis] = slice(row, row+shape[batch_axis])
173 | return tuple(s), row+shape[batch_axis]
174 |
175 | row = 0
176 | for ar in np_ar_list:
177 | s, row = make_slice(row, ar.shape, batch_axis)
178 | # logger.info('row=%d, slice=%s', row, s)
179 | dataset[s] = ar
180 |
181 |
182 | def makeLogfileName(logdir, name):
183 | prefix, ext = os.path.splitext(os.path.basename(name))
184 | filenames = os.listdir(logdir)
185 | if not (prefix + ext) in filenames:
186 | return os.path.join(logdir, prefix + ext)
187 | else:
188 | for i in xrange(2,101):
189 | if '%s_%d%s'%(prefix,i,ext) not in filenames:
190 | return os.path.join(logdir, '%s_%d%s'%(prefix,i,ext))
191 |
192 | raise Exception('logfile number limit (100) reached.')
193 |
194 |
195 | def exists(*paths):
196 | return os.path.exists(os.path.join(*paths))
197 |
198 |
199 | def makeLogDir(root, dirname):
200 | dirpath = makeLogfileName(root, dirname)
201 | os.makedirs(dirpath)
202 | return dirpath
203 |
204 |
205 | def makeTBDir(tb_logdir, logdir_tag=None):
206 | if logdir_tag is None:
207 | dirpath = os.path.join(tb_logdir, time.strftime('%Y-%m-%d %H-%M-%S %Z'))
208 | else:
209 | dirpath = os.path.join(tb_logdir, time.strftime('%Y-%m-%d %H-%M-%S %Z') + ' ' + logdir_tag)
210 |
211 | os.makedirs(dirpath)
212 | return dirpath
213 |
214 |
215 | def readlines_to_df(path, colname):
216 | # return pd.read_csv(output_file, sep='\t', header=None, names=['formula'], index_col=False, dtype=str, skipinitialspace=True, skip_blank_lines=True)
217 | rows = []
218 | n = 0
219 | with open(path, 'r') as f:
220 | print 'opened file %s'%path
221 | for line in f:
222 | n += 1
223 | line = line.strip() # remove \n
224 | if len(line) > 0:
225 | rows.append(line.encode('utf-8'))
226 | print 'processed %d lines resulting in %d rows'%(n, len(rows))
227 | return pd.DataFrame({colname:rows}, dtype=np.str_)
228 |
229 |
230 | def readlines_to_sr(path):
231 | rows = []
232 | n = 0
233 | with open(path, 'r') as f:
234 | print 'opened file %s'%path
235 | for line in f:
236 | n += 1
237 | line = line.strip() # remove \n
238 | if len(line) > 0:
239 | rows.append(line.encode('utf-8'))
240 | print 'processed %d lines resulting in %d rows'%(n, len(rows))
241 | return pd.Series(rows, dtype=np.str_)
242 |
243 |
244 | def sr_to_lines(sr, path):
245 | # df.to_csv(path, header=False, index=False, columns=['formula'], encoding='utf-8', quoting=csv.QUOTE_NONE, escapechar=None, sep='\t')
246 | assert sr.dtype == np.str_ or sr.dtype == np.object_
247 | with open(path, 'w') as f:
248 | for s in sr:
249 | assert '\n' not in s
250 | f.write(s.strip())
251 | f.write('\n')
252 |
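253 | ## Usage sketch for the Storer class above (the names and values are hypothetical; only the API defined in this file is assumed):
254 | ##
255 | ##   # args can be any object with a 'storedir' attribute; this opens <storedir>/validation_100.h5
256 | ##   with Storer(args, 'validation', 100) as storer:
257 | ##       # two ragged integer batches are stacked along batch_axis=0; shorter rows are padded with -2
258 | ##       storer.write('predicted_ids', [ids_batch1, ids_batch2], dtype=np.int32)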
--------------------------------------------------------------------------------
/src/commons/dl_commons_tests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright 2017 - 2018 Sumeet S Singh
5 |
6 | This file is part of im2latex solution by Sumeet S Singh.
7 |
8 | This program is free software: you can redistribute it and/or modify
9 | it under the terms of the GNU Affero General Public License as published by
10 | the Free Software Foundation, either version 3 of the License, or
11 | (at your option) any later version.
12 |
13 | This program is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU Affero General Public License for more details.
17 |
18 | You should have received a copy of the GNU Affero General Public License
19 | along with this program. If not, see <http://www.gnu.org/licenses/>.
20 |
21 | @author: Sumeet S Singh
22 |
23 | Works on python 2.7
24 | """
25 |
26 | import unittest
27 | import dl_commons as dlc
28 | from dl_commons import PD, LambdaVal, integer, integerOrNone, instanceof, equalto
29 | #import tf_commons as tfc
30 |
31 | class Props(dlc.Params):
32 | proto = (
33 | PD('m', '',
34 | integer(),
35 | 64),
36 | PD('D', '',
37 | integer(),
38 | 512)
39 | )
40 | def __init__(self, initVals={}):
41 | dlc.Params.__init__(self, self.proto, initVals)
42 |
43 | class Props2(dlc.Params):
44 | def makeProto(self, GLOBAL):
45 | return Props.proto + (
46 | PD('i', '',
47 | integer(),
48 | LambdaVal(lambda _, __: GLOBAL.m + GLOBAL.D)
49 | ),
50 | PD('m2', '',
51 | integer(),
52 | equalto('m', GLOBAL)),
53 | PD('D2', '',
54 | integer(),
55 | equalto('D', GLOBAL)),
56 | PD('j', '',
57 | integerOrNone(),
58 | None
59 | ),
60 | PD('k', '',
61 | integerOrNone(),
62 | 1
63 | ),
64 | )
65 | def __init__(self, initVals={}):
66 | dlc.Params.__init__(self, self.makeProto(initVals), initVals)
67 |
68 | class Props3(dlc.Params):
69 | def makeProto(self, GLOBAL):
70 | return Props.proto + (
71 | PD('i', '',
72 | integer(),
73 | equalto('i', GLOBAL)
74 | ),
75 | PD('m3', '',
76 | integer(),
77 | equalto('m2', GLOBAL)),
78 | PD('D3', '',
79 | integer(),
80 | equalto('D2', GLOBAL)),
81 | PD('j', '',
82 | integerOrNone(),
83 | 2
84 | ),
85 | PD('k', '',
86 | integerOrNone(),
87 | 2
88 | ),
89 | PD('l', '',
90 | integerOrNone(),
91 | 2
92 | ),
93 | )
94 | def __init__(self, initVals={}):
95 | dlc.Params.__init__(self, self.makeProto(initVals), initVals)
96 |
97 |
98 |
99 | class TestCaseBase(unittest.TestCase):
100 | @staticmethod
101 | def dictSet(d, name, val):
102 | d[name] = val
103 |
104 | @staticmethod
105 | def dictGet(d, name):
106 | return d[name]
107 |
108 | @staticmethod
109 | def instantiate(cls, *args):
110 | cls(*args)
111 |
112 | class PropertiesTest(TestCaseBase):
113 | def __init__(self, *args):
114 | unittest.TestCase.__init__(self, *args)
115 |
116 | def test_good_props(self):
117 | props = {
118 | 'model_name':'im2latex',
119 | 'num_layers':None,
120 | 'unset':None
121 | }
122 | open = dlc.Properties(props)
123 | sealed = dlc.Properties(open).seal()
124 | props['num_layers'] = 10
125 | frozen = dlc.Properties(props).freeze()
126 |
127 | open.layer_type = 'MLP' # create new property
128 | self.assertEqual(open.layer_type, 'MLP')
129 | self.assertEqual(open['layer_type'], 'MLP')
130 | open['layer_type'] = 'CNN'
131 | self.assertEqual(open.layer_type, 'CNN')
132 | self.assertEqual(open['layer_type'], 'CNN')
133 |
134 | self.assertEqual(frozen.model_name, 'im2latex')
135 | self.assertEqual(frozen.unset, None)
136 | self.assertEqual(frozen['unset'], None)
137 | self.assertEqual(frozen['num_layers'], 10)
138 | self.assertEqual(frozen.num_layers, 10)
139 |
140 |
141 | def test_bad_props(self):
142 | props = {
143 | 'model_name':'im2latex',
144 | 'num_layers':None,
145 | 'unset':None
146 | }
147 | open = dlc.Properties(props)
148 | sealed = dlc.Properties(open).seal()
149 | props['num_layers'] = 10
150 | frozen = dlc.Properties(props).freeze()
151 |
152 | self.assertRaises(dlc.AccessDeniedError, setattr, sealed, "x", "MyNeuralNetwork")
153 | self.assertRaises(dlc.AccessDeniedError, self.dictSet, sealed, "x", "MyNeuralNetwork")
154 | self.assertRaises(dlc.AccessDeniedError, setattr, frozen, "name", "MyNeuralNetwork")
155 | self.assertRaises(dlc.AccessDeniedError, self.dictSet, frozen, "name", "MyNeuralNetwork")
156 |
157 | self.assertRaises(KeyError, getattr, sealed, "x")
158 | self.assertRaises(KeyError, self.dictGet, sealed, "x")
159 |
160 | def test_good_params(self):
161 | sealed = dlc.Params((
162 | dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
163 | dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN'], 'LSTM'),
164 | dlc.ParamDesc('num_layers', 'Number of layers to create', range(1,11)),
165 | dlc.ParamDesc('unset', 'Unset property', range(1,11))
166 | )
167 | ).seal()
168 | frozen = dlc.Params(sealed, {'num_layers':10}).freeze()
169 | sealed.layer_type = 'MLP'
170 | self.assertEqual(sealed.layer_type, 'MLP')
171 | self.assertEqual(sealed['layer_type'], 'MLP')
172 | sealed['layer_type'] = 'CNN'
173 | self.assertEqual(sealed.layer_type, 'CNN')
174 | self.assertEqual(sealed['layer_type'], 'CNN')
175 |
176 | self.assertEqual(frozen.model_name, 'im2latex')
177 | self.assertEqual(frozen.layer_type, 'LSTM')
178 | self.assertEqual(frozen['num_layers'], 10)
179 | self.assertEqual(frozen.num_layers, 10)
180 |
181 |
182 | def test_bad_params(self):
183 | proto = (
184 | dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
185 | dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN']),
186 | dlc.ParamDesc('num_layers', 'Number of layers to create', range(1,11)),
187 | dlc.ParamDesc('unset', 'Unset property', range(1,11))
188 | )
189 | sealed = dlc.Params(proto).seal()
190 | frozen = dlc.Params(proto, {'num_layers':10}).freeze()
191 | self.assertRaises(KeyError, setattr, sealed, "x", "MyNeuralNetwork")
192 | self.assertRaises(KeyError, self.dictSet, sealed, "x", "MyNeuralNetwork")
193 | self.assertRaises(KeyError, setattr, frozen, "name", "MyNeuralNetwork")
194 | self.assertRaises(KeyError, self.dictSet, frozen, "name", "MyNeuralNetwork")
195 |
196 | self.assertRaises(ValueError, setattr, sealed, "layer_type", "SVM")
197 | self.assertRaises(ValueError, self.dictSet, sealed, "layer_type", "SVM")
198 |
199 | self.assertRaises(KeyError, getattr, sealed, "x")
200 | self.assertRaises(KeyError, self.dictGet, sealed, "x")
201 |
202 | def test_good_hyperparams(self):
203 | sealed = dlc.HyperParams((
204 | dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
205 | dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN'], 'MLP'),
206 | dlc.ParamDesc('num_layers', 'Number of layers to create', range(1,11)),
207 | dlc.ParamDesc('unset', 'Unset property', range(1,11)),
208 | dlc.ParamDesc('none', 'None property', (None,), None)
209 | )
210 | ).seal()
211 | frozen = dlc.HyperParams(sealed, {'num_layers':10}).freeze()
212 | self.assertRaises(dlc.OneValError, setattr, sealed, "model_name", "xyz")
213 | self.assertRaises(dlc.OneValError, setattr, sealed, "layer_type", "xyz")
214 | self.assertEqual(sealed.layer_type, 'MLP')
215 | self.assertEqual(sealed['layer_type'], 'MLP')
216 |
217 | self.assertEqual(frozen.model_name, 'im2latex')
218 | self.assertEqual(frozen['num_layers'], 10)
219 | self.assertEqual(frozen.num_layers, 10)
220 | self.assertEqual(frozen.none, None)
221 | self.assertEqual(frozen['none'], None)
222 | self.assertEqual(sealed.none, None)
223 | self.assertEqual(sealed['none'], None)
224 |
225 | def test_bad_hyperparams(self):
226 | sealed = dlc.HyperParams((
227 | dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
228 | dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN']),
229 | dlc.ParamDesc('num_layers', 'Number of layers to create', range(1, 11)),
230 | dlc.ParamDesc('unset', 'Unset property', range(1, 11)),
231 | dlc.ParamDesc('none', 'None property', (None,), None)
232 | )).seal()
233 | frozen = dlc.HyperParams(sealed, {'num_layers': 10}).freeze()
234 | self.assertRaises(KeyError, setattr, sealed, "x", "MyNeuralNetwork")
235 | self.assertRaises(KeyError, self.dictSet, sealed, "x", "MyNeuralNetwork")
236 | self.assertRaises(KeyError, setattr, frozen, "name", "MyNeuralNetwork")
237 | self.assertRaises(KeyError, self.dictSet, frozen, "name", "MyNeuralNetwork")
238 |
239 | self.assertRaises(ValueError, setattr, sealed, "layer_type", "SVM")
240 | self.assertRaises(ValueError, self.dictSet, sealed, "layer_type", "SVM")
241 |
242 | self.assertRaises(KeyError, getattr, sealed, "x")
243 | self.assertRaises(KeyError, self.dictGet, sealed, "x")
244 | self.assertRaises(KeyError, getattr, frozen, 'layer_type')
245 | self.assertRaises(KeyError, getattr, sealed, 'layer_type')
246 |
247 |
248 | def test_lambda_vals(self):
249 | p = Props()
250 | p2 = Props2(p)
251 | p3 = Props3(p2)
252 | self.assertEqual(p.m, 64)
253 | self.assertEqual(p.D, 512)
254 | self.assertEqual(p2.m, 64)
255 | self.assertEqual(p2.D, 512)
256 | self.assertEqual(p2.i, 512+64)
257 | self.assertEqual(p2.m2, 64)
258 | self.assertEqual(p2.D2, 512)
259 | self.assertEqual(p3.m, 64)
260 | self.assertEqual(p3.D, 512)
261 | self.assertEqual(p3.i, 512+64)
262 | self.assertEqual(p3.m3, 64)
263 | self.assertEqual(p3.D3, 512)
264 |
265 | p.m = 128
266 | self.assertEqual(p.m, 128)
267 | self.assertEqual(p.D, 512)
268 | self.assertEqual(p2.m, 64)
269 | self.assertEqual(p2.D, 512)
270 | self.assertEqual(p2.i, 512+128)
271 | self.assertEqual(p2.m2, 128)
272 | self.assertEqual(p2.D2, 512)
273 | self.assertEqual(p3.m, 64)
274 | self.assertEqual(p3.D, 512)
275 | self.assertEqual(p3.i, 512+128)
276 | self.assertEqual(p3.m3, 128)
277 | self.assertEqual(p3.D3, 512)
278 | self.assertEqual(p3.j, None)
279 | self.assertEqual(p3.k, 1)
280 | self.assertEqual(p3.l, 2)
281 |
282 |
283 | unittest.TextTestRunner(verbosity=2).run(unittest.TestLoader().loadTestsFromTestCase(PropertiesTest))
284 |
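285 | ## Summary of the chaining behaviour verified by test_lambda_vals above (comment sketch, not executed):
286 | ##   p = Props(); p2 = Props2(p); p3 = Props3(p2)
287 | ##   p2.i is a LambdaVal over GLOBAL (here p), so setting p.m = 128 later changes p2.i from 576 to 640.
288 | ##   Values passed in via initVals win over a class's own defaults: p3.j stays None and p3.k stays 1
289 | ##   (both inherited from Props2), while p3.l, absent from Props2, falls back to Props3's default of 2.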
--------------------------------------------------------------------------------
/src/commons/pub_commons.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Copyright 2017 - 2018 Sumeet S Singh
4 |
5 | This file is part of im2latex solution by Sumeet S Singh.
6 |
7 | This program is free software: you can redistribute it and/or modify
8 | it under the terms of the GNU Affero General Public License as published by
9 | the Free Software Foundation, either version 3 of the License, or
10 | (at your option) any later version.
11 |
12 | This program is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU Affero General Public License for more details.
16 |
17 | You should have received a copy of the GNU Affero General Public License
18 | along with this program. If not, see <http://www.gnu.org/licenses/>.
19 |
20 | @author: Sumeet S Singh
21 |
22 | Works on python 2.7
23 | """
24 | import os
25 | import pandas as pd
26 | import data_commons as dtc
27 | from IPython.display import display
28 | from viz_commons import VisualizeStep, VisualizeDir
28 |
29 | pd.options.display.max_rows = 150
30 | pd.options.display.max_columns = None
31 | pd.options.display.max_colwidth = -1
32 | pd.options.display.width = None
33 | pd.options.display.max_seq_items = None
34 | pd.options.display.expand_frame_repr = True
35 | # pd.options.display.colheader_justify = 'right'
36 | # display.pprint_nest_depth = 1
37 |
38 |
39 | def verbatim(s):
40 | s = s.strip('$')
41 | if r'\begin' in s:
42 |         s = s.replace(r'\begin', r'\begIn')  # Needed to fool MathJax into not rendering the LaTeX
43 | return s
44 | # return r'\begin{verbatim}\n%s\n\end{verbatim}\n' % (s,) if r'\begin' in s else s
45 |
46 |
47 | def get_strs(dir):
48 | vd = VisualizeDir(dir)
49 | last_step = vd.get_steps()[1][-1]
50 | vs = VisualizeStep(vd, 'test', last_step)
51 | df_strs = vs.strs( 'y', 'predicted_ids', mingle=False, trim=True, wrap_strs=True, keys=['image_name'])
52 | df_strs['image_name_trunc'] = df_strs.image_name.str.replace('_basic.png', '.png')
53 | return df_strs
54 |
55 |
56 | def DISP_ALPHA(storedir, graph, step, normalized_dataset=True,
57 | sample_num=0, invert_alpha=True, words=None, gamma=1, cmap='gist_gray', image=None, show_image=True):
58 | dtc.makeLogger(3, set_global=True)
59 | # Note: Good cmap values are: gist_gray, gist_yarg, gist_heat
60 | # Good values of gamma_correction are 1 and 2.2/2.3
61 | vs = VisualizeStep(VisualizeDir(storedir, normalized_dataset=normalized_dataset), graph, step)
62 | df_strs = vs.strs('y', 'predicted_ids', mingle=False, trim=True, wrap_strs=True, keys=['image_name'])
63 | if image:
64 | if not image.endswith('_basic.png'):
65 | image = image.replace('.png', '_basic.png')
66 | df_strs = df_strs[df_strs.image_name.isin([image])]
67 | assert sample_num == 0
68 | else:
69 | df_strs = df_strs.iloc[sample_num:sample_num+1]
70 |
71 | vs.alpha(sample_num, invert_alpha=invert_alpha, words=words, gamma_correction=gamma,
72 | cmap=cmap, index=df_strs.index, show_image=show_image)
73 |
74 | # df_ = pd.DataFrame(data={
75 | # '$\mathbf{\hat{y}}$': [df_strs.predicted_ids.iloc[0], df_strs.predicted_ids.iloc[0].strip('$')],
76 | # '$\mathbf{\hat{y}}$_len': [df_strs.predicted_ids_len.iloc[0]]*2,
77 | # '$\mathbf{y}$': [df_strs.y.iloc[0], df_strs.y.iloc[0].strip('$')] ,
78 | # '$\mathbf{y}$_len': [df_strs.y_len.iloc[0]]*2
79 | # })
80 |
81 |
82 | df_ = pd.DataFrame(data={
83 | 'length': [df_strs.y_len.iloc[0], df_strs.predicted_ids_len.iloc[0]]*2 + [''],
84 | 'value': [df_strs.y.iloc[0], df_strs.predicted_ids.iloc[0]] +
85 | [verbatim(df_strs.y.iloc[0]), verbatim(df_strs.predicted_ids.iloc[0])] +
86 | [df_strs.ed.iloc[0]],
87 | },
88 | index=['$\mathbf{y}$', '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq', 'edit distance'])
89 |
90 | display(df_[['value', 'length']])
91 |
92 |
93 | def rmtails(s, *tails):
94 | for t in tails:
95 | s = s.rsplit(t, 1)[0]
96 | return s
97 |
98 |
99 | rmtail = rmtails
100 |
101 |
102 | def rmheads(s, *heads):
103 | for h in heads:
104 | s = s.split(h, 1)[1]
105 | return s
106 |
107 |
108 | rmhead = rmheads
109 |
110 |
111 | def get_unmatched_images(rendered_dir, strip=False):
112 | with open(os.path.join(rendered_dir, 'unmatched_filenames.txt'), 'r') as f:
113 |         unmatched = []
114 | missing = []
115 | for fname in f:
116 | fname = os.path.basename(fname.strip())
117 | path = os.path.join(rendered_dir, 'images_pred', fname)
118 | if not os.path.exists(path):
119 | if strip:
120 | missing.append(fname.rsplit('.png', 1)[0])
121 | else:
122 | missing.append(fname)
123 | else:
124 | if strip:
125 | unmatched.append(fname.rsplit('.png', 1)[0])
126 | else:
127 | unmatched.append(fname)
128 |
129 | return unmatched, missing
130 |
131 |
132 | def strip_image_name(df, col='image_name'):
133 | """Changes name of images from xx_basic.png to xxx.png"""
134 | df[col] = df[col].str.replace('_basic.png', '.png')
135 | return df
136 |
137 | def disp_matched_strs(dir):
138 | df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'df_strs_matched_100.pkl'))
139 | df_out = pd.DataFrame({
140 | 'edit_distance': df.ed,
141 | '$\mathbf{y}$_len': df.y_len,
142 | '$\mathbf{y}$': df.y,
143 | '$\mathbf{\hat{y}}$_len': df.predicted_ids_len,
144 | '$\mathbf{\hat{y}}$': df.predicted_ids
145 | }).reset_index(drop=True)[
146 | ['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len', '$\mathbf{\hat{y}}$']]
147 | return df_out
148 |
149 | def disp_matched_strs2(dir):
150 | df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'df_strs_matched_100.pkl'))
151 | df_out = pd.DataFrame({
152 | 'edit_distance': df.ed,
153 | '$\mathbf{y}$_len': df.y_len,
154 | '$\mathbf{y}$': df.y,
155 | '$\mathbf{\hat{y}}$_len': df.predicted_ids_len,
156 | '$\mathbf{\hat{y}}$': df.predicted_ids,
157 | '$\mathbf{y}$_seq': df.y.apply(verbatim, convert_dtype=False),
158 | '$\mathbf{\hat{y}}$_seq': df.predicted_ids.apply(verbatim, convert_dtype=False)
159 | }).reset_index(drop=True)[['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len',
160 | '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq']]
161 |
162 | return df_out
163 |
164 | def disp_unmatched(dir):
165 | df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'unmatched_preds_sample.pkl'))
166 | df_out = pd.DataFrame({
167 | 'edit_distance': df.ed,
168 | '$\mathbf{y}$_len': df.target_len,
169 | '$\mathbf{y}$': df.y,
170 | '$\mathbf{\hat{y}}$_len': df.pred_len,
171 | '$\mathbf{\hat{y}}$': df['$\hat{y}$'],
172 | '$\mathbf{y}$_seq': df.target_seq.apply(verbatim, convert_dtype=False),
173 | '$\mathbf{\hat{y}}$_seq': df.pred_seq.apply(verbatim, convert_dtype=False)
174 | }).reset_index(drop=True)[['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len',
175 | '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq']]
176 |
177 | return df_out
178 |
179 | def disp_rand_sample(dir):
180 | df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'rand_sample_100.pkl'))
181 | df_out = pd.DataFrame({
182 | 'edit_distance': df.ed,
183 | '$\mathbf{y}$_len': df.y_len,
184 | '$\mathbf{y}$': df.y,
185 | '$\mathbf{\hat{y}}$_len': df.predicted_ids_len,
186 | '$\mathbf{\hat{y}}$': df.predicted_ids,
187 | '$\mathbf{y}$_seq': df.y.apply(verbatim, convert_dtype=False),
188 | '$\mathbf{\hat{y}}$_seq': df.predicted_ids.apply(verbatim, convert_dtype=False)
189 | }).reset_index(drop=True)[['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len',
190 | '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq']]
191 |
192 | return df_out
193 |
194 |
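195 | ## Usage sketch for DISP_ALPHA in a Jupyter notebook (the store directory, step number and image name are hypothetical):
196 | ##
197 | ##   DISP_ALPHA('./predictions/storedir', 'test', 167526, sample_num=0, gamma=2.2, cmap='gist_heat')
198 | ##   # or pin one sample by image name; '.png' names are mapped to '_basic.png' internally:
199 | ##   DISP_ALPHA('./predictions/storedir', 'test', 167526, image='formula_0042.png')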
--------------------------------------------------------------------------------
/src/commons/test_tf_commons.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright 2017 Sumeet S Singh
5 |
6 | This file is part of im2latex solution by Sumeet S Singh.
7 |
8 | This program is free software: you can redistribute it and/or modify
9 | it under the terms of the GNU Affero General Public License as published by
10 | the Free Software Foundation, either version 3 of the License, or
11 | (at your option) any later version.
12 |
13 | This program is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU Affero General Public License for more details.
17 |
18 | You should have received a copy of the GNU Affero General Public License
19 | along with this program. If not, see <http://www.gnu.org/licenses/>.
20 |
21 | Created on Sun Jul 9 11:44:46 2017
22 | Tested on python 2.7
23 |
24 | @author: Sumeet S Singh
25 | """
26 | import tensorflow as tf
27 | import tf_commons as tfc
28 | from tf_commons import K
29 | import numpy as np
30 |
31 | def flatten(h,l):
32 | B, k, T = K.int_shape(h)
33 | return tf.reshape(h, (B*k, -1)), tf.reshape(l, (B*k,))
34 |
35 | with tf.device('/cpu:*'):
36 | ############### Tensors with ED == 0
37 | h1 = tf.constant([[[1,2,3],[4,5,6]],
38 | [[7,8,9],[10,11,12]],
39 | [[13,14,15],[16,17,18]] ])
40 | l1 = tf.constant([[3, 3],
41 | [3, 3],
42 | [3, 3]])
43 | print 'Shapes: h1:%s, l1:%s'%(K.int_shape(h1), K.int_shape(l1))
44 | h2 = tf.constant([[[1,2,3,0,0,0,0],[4,5,6,0,0,0,0]],
45 | [[7,8,100,9,101,0,0],[100,10,100,11,12,0,0]],
46 | [[13,14,15,100,100,100,0],[101,16,17,18,0,0,0]] ])
47 | l2 = tf.constant([[3, 3],
48 | [5, 5],
49 | [6, 4]])
50 | print 'Shapes: h2:%s, l2:%s'%(K.int_shape(h2), K.int_shape(l2))
51 | h1_s, l1_s = tfc.squash_3d(3, 2, h1, l1, 100)
52 | print 'Shapes: h1_s:%s, l1_s:%s'%(K.int_shape(h1_s), K.int_shape(l1_s))
53 | h2_s, l2_s = tfc.squash_3d(3, 2, h2, l2, 100)
54 | print 'Shapes: h2_s:%s, l2_s:%s'%(K.int_shape(h2_s), K.int_shape(l2_s))
55 | ed1 = tfc.edit_distance3D(3, 2, h2, l2, h1, l1, 100, 101)
56 | mean1 = tf.reduce_mean(ed1)
57 | acc1 = tf.reduce_mean(tf.to_float(tf.equal(ed1, 0)))
58 | ed1_s = tfc.edit_distance3D(3, 2, h2_s, l2_s, h1_s, l1_s, 100, 101)
59 | mean1_s = tf.reduce_mean(ed1_s)
60 | acc1_s = tf.reduce_mean(tf.to_float(tf.equal(ed1_s, 0)))
61 |
62 | _h1, _l1 = flatten(h1, l1)
63 | _h1_s, _l1_s = flatten(h1_s, l1_s)
64 | _h2, _l2 = flatten(h2, l2)
65 | _h2_s, _l2_s = flatten(h2_s, l2_s)
66 |
67 | _ed1 = tfc.edit_distance2D(6, _h2, _l2, _h1, _l1, 100, 101)
68 | _mean1 = tf.reduce_mean(_ed1)
69 | _acc1 = tf.reduce_mean(tf.to_float(tf.equal(_ed1, 0)))
70 | _ed1_s = tfc.edit_distance2D(6, _h2_s, _l2_s, _h1_s, _l1_s, 100, 101)
71 | _mean1_s = tf.reduce_mean(_ed1_s)
72 | _acc1_s = tf.reduce_mean(tf.to_float(tf.equal(_ed1_s, 0)))
73 |
74 | ######################## Tensor with ED > 0
75 | h2_2 = tf.constant([[[1,2,3,99,0,0,0],[4,5,6,0,0,0,0]],
76 | [[7,8,100,99,0,0,0],[100,10,100,11,12,0,0]],
77 | [[13,100,15,100,100,100,0],[100,16,17,18,0,0,0]] ])
78 | l2_2 = tf.constant([[4, 3],
79 | [4, 5],
80 | [6, 4]])
81 | h2_2_s, l2_2_s = tfc.squash_3d(3, 2, h2_2, l2_2, 100)
82 | _h2_2, _l2_2 = flatten(h2_2, l2_2)
83 | _h2_2_s, _l2_2_s = flatten(h2_2_s, l2_2_s)
84 |
85 | ed2 = tfc.edit_distance3D(3, 2, h2_2, l2_2, h1, l1, 100, 101)
86 | acc2 = tf.reduce_mean(tf.to_float(tf.equal(ed2, 0)))
87 | sum2 = tf.reduce_sum(ed2)
88 | ed2_s = tfc.edit_distance3D(3, 2, h2_2_s, l2_2_s, h1_s, l1_s, 100, 101)
89 | acc2_s = tf.reduce_mean(tf.to_float(tf.equal(ed2_s, 0)))
90 | sum2_s = tf.reduce_sum(ed2_s)
91 | print 'Shape of ed1=%s'%(K.int_shape(ed1),)
92 | print 'Shape of ed2_s=%s'%(K.int_shape(ed2_s),)
93 | _ed2 = tfc.edit_distance2D(6, _h2_2, _l2_2, _h1, _l1, 100, 101)
94 | _ed2_s = tfc.edit_distance2D(6, _h2_2_s, _l2_2_s, _h1_s, _l1_s, 100, 101)
95 | _sum2 = tf.reduce_sum(_ed2)
96 | _sum2_s = tf.reduce_sum(_ed2_s)
97 | _acc2 = tf.reduce_mean(tf.to_float(tf.equal(_ed2, 0)))
98 | _acc2_s = tf.reduce_mean(tf.to_float(tf.equal(_ed2_s, 0)))
99 |
100 | # tf.reduce_mean(tf.to_float(tf.equal(top1_ed, 0)))
101 |
102 | ## Test seqlens
103 | b = []
104 | for i in range(11):
105 | b.append([i]*11)
106 | b[i][i] = 0
107 | b.append([11]*11)
108 | b = np.asarray(b)
109 |
110 | tf_b = tf.constant(b)
111 | tf_lens1 = tfc.seqlens(tf.constant(b))
112 | tf_lens2 = tfc.seqlens(tf.constant(b.reshape((3,4,11)) ) )
113 | tf_lens1_2 = tfc.seqlens(tf.constant(b), include_eos_token=False)
114 | tf_lens2_2 = tfc.seqlens(tf.constant(b.reshape((3,4,11))) ,include_eos_token=False)
115 | len_1 = np.arange(1,13); len_1[11] = 11
116 | len_2 = np.arange(12)
117 |
118 | with tf.Session():
119 | print 'ed1 = \n%s'%ed1.eval()
120 | assert mean1.eval() == 0.
121 | assert acc1.eval() == 1
122 | print '_ed1 = \n%s'%_ed1.eval()
123 | assert _mean1.eval() == 0.
124 | assert _acc1.eval() == 1
125 | print 'ed1_s = \n%s'%ed1_s.eval()
126 | assert mean1_s.eval() == 0.
127 | assert acc1_s.eval() == 1
128 | print '_ed1_s = \n%s'%_ed1_s.eval()
129 | assert _mean1_s.eval() == 0.
130 | assert _acc1_s.eval() == 1
131 |
132 |
133 | print 'ed2 = \n%s'%ed2.eval()
134 | assert sum2.eval() == 1.
135 | assert acc2.eval() == 1./2.
136 | print '_ed2 = \n%s'%_ed2.eval()
137 | assert _sum2.eval() == 1.
138 | assert _acc2.eval() == 1./2.
139 | print 'ed2_s = \n%s'%ed2_s.eval()
140 | assert sum2_s.eval() == 1.
141 | assert acc2_s.eval() == 1./2.
142 | print '_ed2_s = \n%s'%_ed2_s.eval()
143 | assert _sum2_s.eval() == 1.
144 | assert _acc2_s.eval() == 1./2.
145 |
146 | print tf_lens1.eval()
147 | print tf_lens1_2.eval()
148 | assert np.all(tf_lens1.eval() == len_1 )
149 | assert np.all(tf_lens1_2.eval() == len_2)
150 | print tf_lens2.eval()
151 | print tf_lens2_2.eval()
152 | assert np.all(tf_lens2.eval() == len_1.reshape(3,4))
153 | assert np.all(tf_lens2_2.eval() == len_2.reshape(3,4))
154 | print "Success !"
155 |
--------------------------------------------------------------------------------
/src/convnet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: utf-8 -*-
3 | """
4 | Copyright 2017 Sumeet S Singh
5 |
6 | This file is part of im2latex solution by Sumeet S Singh.
7 |
8 | This program is free software: you can redistribute it and/or modify
9 | it under the terms of the GNU Affero General Public License as published by
10 | the Free Software Foundation, either version 3 of the License, or
11 | (at your option) any later version.
12 |
13 | This program is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU Affero General Public License for more details.
17 |
18 | You should have received a copy of the GNU Affero General Public License
19 | along with this program. If not, see <http://www.gnu.org/licenses/>.
20 | Created on Mon Jul 24 12:28:55 2017
21 |
22 | @author: Sumeet S Singh
23 | """
24 |
25 | import os
26 | import argparse as arg
27 | import time
28 | from six.moves import cPickle as pickle
29 | import pandas as pd
30 | import tensorflow as tf
31 | from tensorflow.contrib.keras import backend as K
32 | from hyper_params import make_hyper
33 | import data_commons as dtc
34 | import dl_commons as dlc
35 | import tf_commons as tfc
36 | from data_reader import BatchImageIterator2, ImagenetProcessor
37 | from Im2LatexModel import build_vgg_context
38 |
39 | def get_df(params):
40 | # image_features_folder = params.vgg16_folder
41 | # raw_data_folder = params.raw_data_folder
42 | # image_features_folder = params.vgg16_folder
43 |
44 | # Join the two data-frames
45 | df_train = pd.read_pickle(os.path.join(params.raw_data_folder, 'df_train.pkl'))
46 | df_test = pd.read_pickle(os.path.join(params.raw_data_folder, 'df_test.pkl'))
47 | df = df_train.append(df_test)
48 |
49 |     # Remove images that have already been processed, but round the remainder up to a
50 |     # whole batch by re-adding a sample of the already-processed images.
51 |     image_list = [os.path.splitext(s)[0]+'.png' for s in filter(lambda s: s.endswith('.pkl'), os.listdir(params.vgg16_folder))]
52 |     df_processed = df[df.image.isin(image_list)]  # capture before filtering, otherwise this is always empty
53 |     df = df[~df.image.isin(image_list)]
54 |     if (df.shape[0] % params.B) != 0:
55 |         shortfall = params.B - (df.shape[0] % params.B)
56 |         df = df.append(df_processed.sample(n=shortfall), verify_integrity=True)
56 |
57 | dtc.logger.info('Working with %d images', df.shape[0])
58 |
59 | # Set all bin_len to max to ensure only one bin
60 | df.bin_len = df.bin_len.max()
61 | return df
62 |
63 | def run_convnet(params):
64 | HYPER = make_hyper(params)
65 | image_folder = params.image_folder
66 | raw_data_folder = params.raw_data_folder
67 | image_features_folder = params.vgg16_folder
68 | logger = HYPER.logger
69 |
70 | df = get_df(params)
71 | if df.shape[0] == 0:
72 | logger.info('No images remaining to process. All done.')
73 | else:
74 | logger.info('Processing %d images.', df.shape[0])
75 |
76 | logger.info('\n#################### Args: ####################\n%s', params.pformat())
77 | logger.info('##################################################################\n')
78 | logger.info( '\n######################### HYPER Params: #########################\n%s', HYPER.pformat())
79 | logger.info('##################################################################\n')
80 |
81 | b_it = BatchImageIterator2(
82 | raw_data_folder,
83 | image_folder,
84 | HYPER,
85 | image_processor=ImagenetProcessor(HYPER),
86 | df=df,
87 | num_steps=params.num_steps,
88 | num_epochs=params.num_epochs)
89 |
90 | graph = tf.Graph()
91 | with graph.as_default():
92 |
93 | config=tf.ConfigProto(log_device_placement=False)
94 | config.gpu_options.allow_growth = True
95 | tf_session = tf.Session(config=config)
96 | with tf_session.as_default():
97 | K.set_session(tf_session)
98 |
99 | tf_im = tf.placeholder(dtype=HYPER.dtype, shape=((HYPER.B,)+HYPER.image_shape), name='image')
100 | with tf.device('/gpu:1'): # change this to gpu:0 if you only have one gpu
101 | tf_a_batch = build_vgg_context(HYPER, tf_im)
102 | tf_a_list = tf.unstack(tf_a_batch, axis=0)
103 |
104 | t_n = tfc.printVars('Trainable Variables', tf.trainable_variables())
105 | g_n = tfc.printVars('Global Variables', tf.global_variables())
106 | l_n = tfc.printVars('Local Variables', tf.local_variables())
107 | assert t_n == g_n
108 | assert g_n == l_n
109 |
110 |             print('\nUninitialized params')
111 |             print(tf_session.run(tf.report_uninitialized_variables()))
112 |
113 |             print('Flushing graph to disk')
114 |             tf_sw = tf.summary.FileWriter(tfc.makeTBDir(HYPER.tb), graph=graph)
115 |             tf_sw.flush()
116 |
117 |             print('\n')
118 | start_time = time.clock()
119 | for step, b in enumerate(b_it, start=1):
120 | # if b.epoch > 1 or (params.num_steps >= 0 and step > params.num_steps):
121 | # break
122 | feed_dict = {tf_im: b.im, K.learning_phase(): 0}
123 | a_list = tf_session.run(tf_a_list, feed_dict = feed_dict)
124 | assert len(a_list) == len(b.image_name)
125 | for i, a in enumerate(a_list):
126 | ## print 'Writing %s, shape=%s'%(b.image_name[i], a.shape)
127 | with open(os.path.join(image_features_folder, os.path.splitext(b.image_name[i])[0] + '.pkl'),
128 | 'wb') as f:
129 | pickle.dump(a, f, pickle.HIGHEST_PROTOCOL)
130 | if step % 10 == 0:
131 | print('Elapsed time for %d steps = %d seconds'%(step, time.clock()-start_time))
132 | print('Elapsed time for %d steps = %d seconds'%(step, time.clock()-start_time))
133 |             print('done')
134 |
135 | def main():
136 | _data_folder = '../data/dataset3'
137 |
138 |     parser = arg.ArgumentParser(description='Extract and save convnet (VGG16) features for all images')
139 |     parser.add_argument("--num-steps", "-n", dest="num_steps", type=int,
140 |                         help="Number of steps (batches) to process. Defaults to -1 if unspecified, i.e. run to completion",
141 |                         default=-1)
142 |     parser.add_argument("--num-epochs", "-e", dest="num_epochs", type=int,
143 |                         help="Number of epochs to run. Defaults to 1 if unspecified.",
144 |                         default=1)
145 | parser.add_argument("--batch-size", "-b", dest="batch_size", type=int,
146 | help="Batchsize. If unspecified, defaults to the default value in hyper_params",
147 | default=None)
148 | parser.add_argument("--print-steps", "-s", dest="print_steps", type=int,
149 |                         help="Number of steps after which to log results. Defaults to 100 if unspecified",
150 | default=100)
151 | parser.add_argument("--data-folder", "-d", dest="data_folder", type=str,
152 | help="Data folder. If unspecified, defaults to " + _data_folder,
153 | default=_data_folder)
154 | parser.add_argument("--raw-data-folder", dest="raw_data_folder", type=str,
155 | help="Raw data folder. If unspecified, defaults to data_folder/training",
156 | default=None)
157 | parser.add_argument("--vgg16-folder", dest="vgg16_folder", type=str,
158 | help="vgg16 data folder. If unspecified, defaults to data_folder/vgg16_features",
159 | default=None)
160 | parser.add_argument("--image-folder", dest="image_folder", type=str,
161 | help="image folder. If unspecified, defaults to data_folder/formula_images",
162 | default=None)
163 | parser.add_argument("--partial-batch", "-p", dest="partial_batch", action='store_true',
164 | help="Sets assert_whole_batch hyper param to False. Default hyper_param value will be used if unspecified")
165 | parser.add_argument("--logging-level", "-l", dest="logging_level", type=int,
166 | help="Logging verbosity level from 1 to 5 in increasing order of verbosity.",
167 | default=4)
168 |
169 |
170 | args = parser.parse_args()
171 | data_folder = args.data_folder
172 | params = dlc.Properties({'num_steps': args.num_steps,
173 | 'print_steps':args.print_steps,
174 | 'num_epochs': args.num_epochs,
175 | 'logger': dtc.makeLogger(args.logging_level, set_global=True),
176 | 'build_image_context': 1,
177 | 'weights_regularizer': None,
178 | 'num_gpus': 1,
179 | 'tb': tfc.TensorboardParams({'tb_logdir': 'tb_metrics_convnet'}).freeze()
180 | })
181 | if args.image_folder:
182 | params.image_folder = args.image_folder
183 | else:
184 | params.image_folder = os.path.join(data_folder,'formula_images')
185 |
186 | if args.raw_data_folder:
187 | params.raw_data_folder = args.raw_data_folder
188 | else:
189 | params.raw_data_folder = os.path.join(data_folder, 'training')
190 | params.raw_data_dir = params.raw_data_folder
191 |
192 | if args.vgg16_folder:
193 | params.vgg16_folder = args.vgg16_folder
194 | else:
195 | params.vgg16_folder = os.path.join(data_folder, 'vgg16_features')
196 |
197 | if args.batch_size is not None:
198 | params.B = args.batch_size
199 | if args.partial_batch:
200 | params.assert_whole_batch = False
201 |
202 | data_props = dtc.load(params.raw_data_folder, 'data_props.pkl')
203 | params.image_shape = (data_props['padded_image_dim']['height'], data_props['padded_image_dim']['width'], 3)
204 | run_convnet(params)
205 |
206 | if __name__ == '__main__':
207 |     main()
--------------------------------------------------------------------------------
/src/model/tf_dynamic_decode.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Seq2seq layer operations for use in neural networks."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import abc
22 | import six
23 |
24 | from tensorflow.python.framework import constant_op
25 | from tensorflow.python.framework import dtypes
26 | from tensorflow.python.framework import ops
27 | from tensorflow.python.framework import tensor_shape
28 | from tensorflow.python.framework import tensor_util
29 | from tensorflow.python.ops import array_ops
30 | from tensorflow.python.ops import control_flow_ops
31 | from tensorflow.python.ops import math_ops
32 | from tensorflow.python.ops import rnn
33 | from tensorflow.python.ops import tensor_array_ops
34 | from tensorflow.python.ops import variable_scope
35 | from tensorflow.python.util import nest
36 | from tensorflow.contrib.seq2seq import Decoder
37 | import tf_commons as tfc
38 | import data_commons as dtc
39 |
40 | __all__ = ["dynamic_decode"]
41 |
42 |
43 | _transpose_batch_time = rnn._transpose_batch_time # pylint: disable=protected-access
44 |
45 |
46 |
47 |
48 |
49 | def _create_zero_outputs(size, dtype, batch_size):
50 | """Create a zero outputs Tensor structure."""
51 | def _t(s):
52 | return (s if isinstance(s, ops.Tensor) else constant_op.constant(
53 | tensor_shape.TensorShape(s).as_list(),
54 | dtype=dtypes.int32,
55 | name="zero_suffix_shape"))
56 |
57 | def _create(s, d):
58 | return array_ops.zeros(
59 | array_ops.concat(
60 | ([batch_size], _t(s)), axis=0), dtype=d)
61 |
62 | return nest.map_structure(_create, size, dtype)
63 |
64 |
65 | def dynamic_decode(decoder,
66 | output_time_major=False,
67 | impute_finished=False,
68 | maximum_iterations=None,
69 | parallel_iterations=32,
70 | swap_memory=False,
71 | scope=None):
72 | """Perform dynamic decoding with `decoder`.
73 |
74 | Calls initialize() once and step() repeatedly on the Decoder object.
75 |
76 | Args:
77 | decoder: A `Decoder` instance.
78 | output_time_major: Python boolean. Default: `False` (batch major). If
79 | `True`, outputs are returned as time major tensors (this mode is faster).
80 | Otherwise, outputs are returned as batch major tensors (this adds extra
81 | time to the computation).
82 | impute_finished: Python boolean. If `True`, then states for batch
83 | entries which are marked as finished get copied through and the
84 | corresponding outputs get zeroed out. This causes some slowdown at
85 | each time step, but ensures that the final state and outputs have
86 | the correct values and that backprop ignores time steps that were
87 | marked as finished.
88 | maximum_iterations: `int32` scalar, maximum allowed number of decoding
89 | steps. Default is `None` (decode until the decoder is fully done).
90 | parallel_iterations: Argument passed to `tf.while_loop`.
91 | swap_memory: Argument passed to `tf.while_loop`.
92 | scope: Optional variable scope to use.
93 |
94 | Returns:
95 | `(final_outputs, final_state, final_sequence_lengths)`.
96 |
97 | Raises:
98 | TypeError: if `decoder` is not an instance of `Decoder`.
99 | ValueError: if `maximum_iterations` is provided but is not a scalar.
100 | """
101 | if not isinstance(decoder, Decoder):
102 | raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
103 | type(decoder))
104 |
105 | with variable_scope.variable_scope(scope, "decoder") as varscope:
106 | # Properly cache variable values inside the while_loop
107 | if varscope.caching_device is None:
108 | varscope.set_caching_device(lambda op: op.device)
109 |
110 | if maximum_iterations is not None:
111 | maximum_iterations = ops.convert_to_tensor(
112 | maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
113 | if maximum_iterations.get_shape().ndims != 0:
114 | raise ValueError("maximum_iterations must be a scalar")
115 |
116 | initial_finished, initial_inputs, initial_state = decoder.initialize()
117 |
118 | zero_outputs = _create_zero_outputs(decoder.output_size,
119 | decoder.output_dtype,
120 | decoder.batch_size)
121 |
122 | if maximum_iterations is not None:
123 | initial_finished = math_ops.logical_or(
124 | initial_finished, 0 >= maximum_iterations)
125 | initial_sequence_lengths = array_ops.zeros_like(
126 | initial_finished, dtype=dtypes.int32)
127 | initial_time = constant_op.constant(0, dtype=dtypes.int32)
128 |
129 | def _shape(batch_size, from_shape):
130 | if not isinstance(from_shape, tensor_shape.TensorShape):
131 | return tensor_shape.TensorShape(None)
132 | else:
133 | batch_size = tensor_util.constant_value(
134 | ops.convert_to_tensor(
135 | batch_size, name="batch_size"))
136 | return tensor_shape.TensorShape([batch_size]).concatenate(from_shape)
137 |
138 | def _create_ta(s, d):
139 | return tensor_array_ops.TensorArray(
140 | dtype=d,
141 | size=0,
142 | dynamic_size=True,
143 | element_shape=_shape(decoder.batch_size, s))
144 |
145 | def _create_states_ta(t):
146 | return tensor_array_ops.TensorArray(
147 | dtype=t.dtype,
148 | size=0,
149 | dynamic_size=True,
150 | clear_after_read=True,
151 | element_shape=t.shape
152 | )
153 |
154 | initial_outputs_ta = nest.map_structure(_create_ta, decoder.output_size,
155 | decoder.output_dtype)
156 | # dtc.logger.info('decoder.initial_state.shape=%s', tfc.nested_tf_shape(initial_state))
157 | initial_states_ta = nest.map_structure(_create_states_ta, initial_state)
158 | # dtc.logger.info('initial_states_ta=%s', initial_states_ta)
159 |
160 | def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs,
161 | finished, unused_sequence_lengths, unused_states_ta):
162 | return math_ops.logical_not(math_ops.reduce_all(finished))
163 |
164 | def body(time, outputs_ta, state, inputs, finished, sequence_lengths, states_ta):
165 | """Internal while_loop body.
166 |
167 | Args:
168 | time: scalar int32 tensor.
169 | outputs_ta: structure of TensorArray.
170 | state: (structure of) state tensors and TensorArrays.
171 | inputs: (structure of) input tensors.
172 | finished: bool tensor (keeping track of what's finished).
173 | sequence_lengths: int32 tensor (keeping track of time of finish).
174 | states_ta: structure of TensorArray for storing states.
175 |
176 | Returns:
177 | `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
178 | next_sequence_lengths, states_ta)`.
179 |
180 | """
181 | (next_outputs, decoder_state, next_inputs,
182 | decoder_finished) = decoder.step(time, inputs, state)
183 | next_finished = math_ops.logical_or(decoder_finished, finished)
184 | if maximum_iterations is not None:
185 | next_finished = math_ops.logical_or(
186 | next_finished, time + 1 >= maximum_iterations)
187 | next_sequence_lengths = array_ops.where(
188 | math_ops.logical_and(math_ops.logical_not(finished), next_finished),
189 | array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
190 | sequence_lengths)
191 |
192 | nest.assert_same_structure(state, decoder_state)
193 | nest.assert_same_structure(states_ta, decoder_state)
194 | nest.assert_same_structure(outputs_ta, next_outputs)
195 | nest.assert_same_structure(inputs, next_inputs)
196 |
197 | # Zero out output values past finish
198 | if impute_finished:
199 | emit = nest.map_structure(
200 | lambda out, zero: array_ops.where(finished, zero, out),
201 | next_outputs,
202 | zero_outputs)
203 | else:
204 | emit = next_outputs
205 |
206 | # Copy through states past finish
207 | def _maybe_copy_state(new, cur):
208 | # TensorArrays and scalar states get passed through.
209 | if isinstance(cur, tensor_array_ops.TensorArray):
210 | pass_through = True
211 | else:
212 | new.set_shape(cur.shape)
213 | pass_through = (new.shape.ndims == 0)
214 | return new if pass_through else array_ops.where(finished, cur, new)
215 |
216 | if impute_finished:
217 | next_state = nest.map_structure(
218 | _maybe_copy_state, decoder_state, state)
219 | else:
220 | next_state = decoder_state
221 |
222 | outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out),
223 | outputs_ta, emit)
224 | states_ta = nest.map_structure(lambda ta, st: ta.write(time, st),
225 | states_ta, next_state)
226 |
227 | return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
228 | next_sequence_lengths, states_ta)
229 |
230 | res = control_flow_ops.while_loop(
231 | condition,
232 | body,
233 | loop_vars=[
234 | initial_time, initial_outputs_ta, initial_state, initial_inputs,
235 | initial_finished, initial_sequence_lengths, initial_states_ta
236 | ],
237 | parallel_iterations=parallel_iterations,
238 | swap_memory=swap_memory)
239 |
240 | final_outputs_ta = res[1]
241 | final_state = res[2]
242 | final_sequence_lengths = res[5]
243 | final_states_ta = res[6]
244 |
245 | final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
246 | final_states = nest.map_structure(lambda ta: ta.stack(), final_states_ta)
247 |
248 | try:
249 | final_outputs, final_state = decoder.finalize(
250 | final_outputs, final_state, final_sequence_lengths)
251 | except NotImplementedError:
252 | pass
253 |
254 | if not output_time_major:
255 | final_outputs = nest.map_structure(_transpose_batch_time, final_outputs)
256 | final_states = nest.map_structure(_transpose_batch_time, final_states)
257 |
258 | return final_outputs, final_state, final_sequence_lengths, final_states
259 |
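260 | # Usage sketch (hypothetical `decoder` and iteration cap; any
261 | # tf.contrib.seq2seq.Decoder instance works):
262 | #
263 | #     outputs, final_state, seq_lens, states = dynamic_decode(
264 | #         decoder, impute_finished=True, maximum_iterations=150)
265 | #
266 | # Unlike the stock tf.contrib.seq2seq.dynamic_decode, this variant also stacks
267 | # the decoder state at every time step and returns it as a fourth value,
268 | # presumably so that per-step internals (e.g. attention weights carried in the
269 | # state) can be inspected after decoding.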
--------------------------------------------------------------------------------
/src/model/tf_tutorial_code.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | import tensorflow as tf
16 |
17 |
18 | def average_gradients(tower_grads):
19 | """Calculate the average gradient for each shared variable across all towers.
20 | Note that this function provides a synchronization point across all towers.
21 | Args:
22 | tower_grads: List of lists of (gradient, variable) tuples. The outer list
23 | is over individual gradients. The inner list is over the gradient
24 | calculation for each tower.
25 | Returns:
26 | List of pairs of (gradient, variable) where the gradient has been averaged
27 | across all towers.
28 | """
29 | average_grads = []
30 | for grad_and_vars in zip(*tower_grads):
31 | # Note that each grad_and_vars looks like the following:
32 | # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
33 | grads = []
34 |     # Each grad_and_vars pairs the same shared variable with its per-tower gradient.
35 |     for g, v in grad_and_vars:
36 |       # v is identical across towers; only the gradient g differs.
37 | # Add 0 dimension to the gradients to represent the tower.
38 | expanded_g = tf.expand_dims(g, 0)
39 |
40 | # Append on a 'tower' dimension which we will average over below.
41 | grads.append(expanded_g)
42 |
43 | # Average over the 'tower' dimension.
44 | grad = tf.concat(axis=0, values=grads)
45 | grad = tf.reduce_mean(grad, 0)
46 |
47 | # Keep in mind that the Variables are redundant because they are shared
48 | # across towers. So .. we will just return the first tower's pointer to
49 | # the Variable.
50 | v = grad_and_vars[0][1]
51 | grad_and_var = (grad, v)
52 | average_grads.append(grad_and_var)
53 | return average_grads
54 |
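55 | # Usage sketch (hypothetical names; two towers sharing variables w and b):
56 | #
57 | #     tower_grads = [[(g_w0, w), (g_b0, b)],   # gradients computed on gpu:0
58 | #                    [(g_w1, w), (g_b1, b)]]   # gradients computed on gpu:1
59 | #     avg = average_gradients(tower_grads)
60 | #     # avg == [(mean(g_w0, g_w1), w), (mean(g_b0, g_b1), b)]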
--------------------------------------------------------------------------------
/src/preprocessing/README.md:
--------------------------------------------------------------------------------
1 | # Preprocessing Notes
2 | 1. Download latex files from KDD Cup 2003 dataset.
3 | 2. Parse the files and extract suitable latex math equations (formulas from now on)
4 | 3. Normalize the extracted formulas by parsing them with the katex parser and then regenerating latex from the parse-tree.
5 | 4. Render formulas to jpeg images (using pdflatex)
6 | 5. Create a pandas dataframe mapping images to normalized formulas
7 | 6. Clean formula text
8 | 7. Tokenize formulas and extract the vocabulary (see the sketch below). Clean the vocabulary and bad formulas.
9 | 8. Analyze data and remove very large images and latex sequences (in order to limit image size)
10 | 9. ...
11 |
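12 | A minimal sketch of the tokenization idea from step 7 (illustrative only; the
13 | actual tokenizer notebook handles many more cases):
14 |
15 | ```python
16 | import re
17 |
18 | # A latex token is a command (\frac, \alpha, ...), an escaped single
19 | # character (\{, \%, ...), or any single non-space character.
20 | TOKEN_RE = re.compile(r'\\[A-Za-z]+|\\.|\S')
21 |
22 | def tokenize(formula):
23 |     return TOKEN_RE.findall(formula)
24 |
25 | print(tokenize(r'\frac { a } { b + 1 }'))
26 | # ['\\frac', '{', 'a', '}', '{', 'b', '+', '1', '}']
27 | ```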
--------------------------------------------------------------------------------
/src/tools/bulk_disp_alpha.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | """
3 | This file executes disp_alpha.ipynb in bulk, once for each of the images below. It then exports the notebook into
4 | HTML in the current folder. Run it in a folder such as gallery/I2L-NOPOOL along with the companion disp_alpha.ipynb.
5 | """
6 | import sys, os
7 | from multiprocessing import Pool
8 |
9 | def do(image_name):
10 | def rmtail(s, t):
11 | return s.rsplit(t, 1)[0]
12 |
13 | # import nbformat
14 | # from nbconvert.preprocessors import ExecutePreprocessor
15 |
16 | os.environ['image_name'] = repr(image_name)
17 | os.putenv('image_name', image_name)
18 |
19 | # with open('disp_alpha.ipynb') as f:
20 | # nb = nbformat.read(f, as_version=4)
21 | # ep = ExecutePreprocessor(timeout=600) # kernel_name='python2')
22 | # ep.preprocess(nb)
23 |
24 | command = 'jupyter nbconvert --to HTML --output alpha_%s_gray --execute --ExecutePreprocessor.timeout=300 disp_alpha.ipynb'%rmtail(image_name, '.png')
25 | print('Executing command: %s'%command)
26 | os.system(command)
27 |
28 | if __name__ == '__main__':
29 | if 'image_name' in os.environ:
30 | print('Processing image %s'%os.environ['image_name'])
31 | exit(0)
32 |
33 | p = Pool(2)
34 | I2L_NOPOOL_50_MATCHED = [
35 | u'6de537f98f51a70.png',
36 | u'e151d0cb6a1f4b8.png',
37 | u'125e0edbdc14c16.png',
38 | u'c1a595cf0e1b410.png',
39 | u'976c67c09595d48.png',
40 | u'48e151a0a2e1d66.png',
41 | u'eb4edff43972a77.png',
42 | u'f535e2d3ffd72a9.png',
43 | u'fbf3c74e173ede6.png',
44 | u'b727765af13988d.png',
45 | u'c236ef8f2d69db4.png',
46 | u'17806d8a43ed4d7.png',
47 | u'7bf25eec600c770.png',
48 | u'd67b0016af15368.png',
49 | u'beac5a98ad0bba3.png',
50 | u'6589b8b41dec5f5.png',
51 | u'c53968dbdf5e491.png',
52 | u'f7df71e09e679fa.png',
53 | u'88085cbe4db62f4.png',
54 | u'a4069d6109fdb32.png',
55 | u'7e7c82bcbbab14d.png',
56 | u'4cab7f4e7119975.png',
57 | u'ee3f8d415a17042.png',
58 | u'09c406611c97ca6.png',
59 | u'acc6b030ec1db54.png',
60 | u'5bde325cdc5c9fb.png',
61 | u'fc51f4f92be6b9e.png',
62 | u'831233abfc981bb.png',
63 | u'2af02fe9dda544b.png',
64 | u'dc311ef87140544.png',
65 | u'62d52e5875f15f2.png',
66 | u'082d6f67587ff53.png',
67 | u'be5020af1c11fb0.png',
68 | u'0ebe66af564fdea.png',
69 | u'8f17277609baf0d.png',
70 | u'cd7ee25bb44ee96.png',
71 | u'bde00b1efb71c8f.png',
72 | u'1637deef28fa753.png',
73 | u'ba84027cf12d913.png',
74 | u'ca7098dc8853675.png',
75 | u'5be77b312bfa0c1.png',
76 | u'9afabb69abb8665.png',
77 | u'e75a0c252c98431.png',
78 | u'05a32153f52b845.png',
79 | u'c450aeeee50eacb.png',
80 | u'8f249bcfcbd0d4a.png',
81 | u'c9908dd9001ae2a.png',
82 | u'8a7278fd1af0571.png',
83 | u'780ce6e35d2dfb2.png',
84 | u'04237c2640a6ef2.png']
85 |
86 | I2L_STRIPS_50_MATCHED = [
87 | u'4ef63353075e5b6.png',
88 | u'6de537f98f51a70.png',
89 | u'9dc9caeac24960d.png',
90 | u'125e0edbdc14c16.png',
91 | u'48e151a0a2e1d66.png',
92 | u'd4b25f217be4cca.png',
93 | u'7e7c82bcbbab14d.png',
94 | u'c236ef8f2d69db4.png',
95 | u'fbf3c74e173ede6.png',
96 | u'e8bd11a6b2feacf.png',
97 | u'b727765af13988d.png',
98 | u'7bf25eec600c770.png',
99 | u'21b2c45e268829b.png',
100 | u'f7df71e09e679fa.png',
101 | u'd67b0016af15368.png',
102 | u'6201fd941a8d4da.png',
103 | u'6f3d3d2ed89345d.png',
104 | u'beac5a98ad0bba3.png',
105 | u'136ca940c9932d4.png',
106 | u'cda328a07cba902.png',
107 | u'7fec9f1799b13ec.png',
108 | u'6147055797ca25d.png',
109 | u'938f5c3d05f5cf4.png',
110 | u'eebbeeddab4c0af.png',
111 | u'f16ea5d12d68b60.png',
112 | u'0734f11afe9aa90.png',
113 | u'186678817078727.png',
114 | u'2590ff270553f09.png',
115 | u'ee3f8d415a17042.png',
116 | u'4cab7f4e7119975.png',
117 | u'e99ef7e83d7b337.png',
118 | u'5bde325cdc5c9fb.png',
119 | u'a4d4967273292d2.png',
120 | u'23b08d245124d3c.png',
121 | u'a535502c45b16f6.png',
122 | u'8b27d32b2738fce.png',
123 | u'62d52e5875f15f2.png',
124 | u'acc6b030ec1db54.png',
125 | u'db4e9e9fba352e8.png',
126 | u'93cdbab1859dd05.png',
127 | u'dc311ef87140544.png',
128 | u'be5020af1c11fb0.png',
129 | u'831233abfc981bb.png',
130 | u'f8cbaf91c3c404f.png',
131 | u'c6d77ca7ad58ced.png',
132 | u'ca7098dc8853675.png',
133 | u'bde00b1efb71c8f.png',
134 | u'1637deef28fa753.png',
135 | u'e75a0c252c98431.png',
136 | u'05a32153f52b845.png']
137 |
138 | p.map(do, set(I2L_NOPOOL_50_MATCHED+I2L_STRIPS_50_MATCHED))
139 |
--------------------------------------------------------------------------------
/src/tools/disp_alpha.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Note: Copy this notebook into the gallery folder before running it."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 7,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import sys, os\n",
19 | "sys.path.extend(['../src/commons'])\n",
20 | "from pub_commons import DISP_ALPHA\n",
21 | "%matplotlib inline"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "collapsed": true,
29 | "scrolled": false,
30 | "slideshow": {
31 | "slide_type": "-"
32 | }
33 | },
34 | "outputs": [],
35 | "source": [
36 | "DISP_ALPHA(storedir='I2L-NOPOOL', graph='test', step=167526, cmap='gist_gray', image='bde00b1efb71c8f.png')"
37 | ]
38 | }
39 | ],
40 | "metadata": {
41 | "kernelspec": {
42 | "display_name": "Python 2",
43 | "language": "python",
44 | "name": "python2"
45 | },
46 | "language_info": {
47 | "codemirror_mode": {
48 | "name": "ipython",
49 | "version": 2
50 | },
51 | "file_extension": ".py",
52 | "mimetype": "text/x-python",
53 | "name": "python",
54 | "nbconvert_exporter": "python",
55 | "pygments_lexer": "ipython2",
56 | "version": "2.7.14"
57 | }
58 | },
59 | "nbformat": 4,
60 | "nbformat_minor": 2
61 | }
62 |
--------------------------------------------------------------------------------
/src/tools/sample_preds.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "#### This notebook is used for extracting and formatting data for publishing. Copy it into a gallery folder such as gallery/I2L-STRIPS before running it."
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import sys\n",
19 | "sys.path.extend(['../../src/commons'])"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": null,
25 | "metadata": {
26 | "collapsed": true
27 | },
28 | "outputs": [],
29 | "source": [
30 | "from IPython.core.display import display, HTML\n",
31 | "display(HTML(\"\"))"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": null,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "import pandas as pd\n",
43 | "import os\n",
44 | "import re\n",
45 | "import codecs\n",
46 | "from IPython.display import display, Math, Latex\n",
47 | "from IPython.display import Image as ipImage\n",
48 | "from six.moves import cPickle as pickle\n",
49 | "import string\n",
50 | "from PIL import Image\n",
51 | "import numpy as np\n",
52 | "import h5py\n",
53 | "import matplotlib as mpl\n",
54 | "from matplotlib import pyplot as plt\n",
55 | "from mpl_toolkits.axes_grid1 import ImageGrid\n",
56 | "# Config the matplotlib backend as plotting inline in IPython\n",
57 | "%matplotlib inline"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "metadata": {
64 | "collapsed": true
65 | },
66 | "outputs": [],
67 | "source": [
68 | "pd.options.display.max_rows = 120\n",
69 | "pd.options.display.max_colwidth = 600\n",
70 | "pd.options.display.expand_frame_repr = False\n",
71 | "pd.options.display.colheader_justify = 'left'"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": [
82 | "import data_commons as dtc\n",
83 | "import dl_commons as dlc\n",
84 | "import viz_commons as viz\n",
85 | "from viz_commons import VisualizeDir, DiffParams, VisualizeStep"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "### Load results of test run"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": null,
98 | "metadata": {
99 | "collapsed": true
100 | },
101 | "outputs": [],
102 | "source": [
103 | "storedir = '.'\n",
104 | "clobber = True\n",
105 | "dump_enabled = True\n",
106 | "evaldir = os.path.join(storedir, 'eval_images')\n",
107 | "rendered_dir = os.path.join(evaldir, 'rendered_images')\n",
108 | "dumpdir = os.path.join(storedir, 'gallery_data')\n",
109 | "\n",
110 | "def chkclobber(path):\n",
111 | " assert clobber or (not os.path.exists(path)), \"Can't overwrite file %s when clobber==False\"%path\n",
112 | " return path\n",
113 | "\n",
114 | "def dump(df_, df_sample_, fname):\n",
115 | "    if dump_enabled:\n",
116 | " with open(chkclobber(os.path.join(dumpdir, '%s_sample_table.txt'%fname)), 'w') as f:\n",
117 | " for row in df_sample_[['y','$\\hat{y}$']].itertuples(index=False):\n",
118 | " f.write(row[0] + ' & ' + row[1] + '\\n')\n",
119 | " df_.to_pickle(chkclobber(os.path.join(dumpdir, '%s_preds.pkl'%fname)))\n",
120 | " df_sample_.to_pickle(chkclobber(os.path.join(dumpdir, '%s_preds_sample.pkl'%fname)))"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {
127 | "collapsed": true
128 | },
129 | "outputs": [],
130 | "source": [
131 | "vd = VisualizeDir(os.path.expanduser(storedir))\n",
132 | "last_step = vd.get_steps()[1][-1]\n",
133 | "print('last_step = %d' % last_step)\n",
134 | "vs = VisualizeStep(vd, 'test', last_step)\n",
135 | "df_preds = pd.read_pickle(os.path.join(evaldir, 'predictions_test_%d.pkl'%last_step))"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": null,
141 | "metadata": {
142 | "collapsed": true,
143 | "scrolled": true
144 | },
145 | "outputs": [],
146 | "source": [
147 | "df_preds"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {
154 | "collapsed": true
155 | },
156 | "outputs": [],
157 | "source": [
158 | "df_strs = vs.strs( 'y', 'predicted_ids', sortkey=None, mingle=False, trim=True, wrap_strs=True, keys=['image_name', 'ed'])\n",
159 | "df_strs.columns"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "### View and save the unmatched images"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": null,
172 | "metadata": {
173 | "collapsed": true,
174 | "scrolled": true
175 | },
176 | "outputs": [],
177 | "source": [
178 | "with open(os.path.join(rendered_dir, 'unmatched_filenames.txt'), 'r') as f:\n",
179 | " unmatched = []; missing = []\n",
180 | " for fname in f:\n",
181 | " fname = os.path.basename(fname.strip())\n",
182 | " path = os.path.join(rendered_dir, 'images_pred', fname)\n",
183 | " if not os.path.exists(path):\n",
184 | " missing.append(fname)\n",
185 | " else:\n",
186 | " unmatched.append(fname)\n",
187 | "num_missing = len(missing)\n",
188 | "total = len(df_preds)\n",
189 | "print('%d(%.3f%%) files missing out of %d'%(num_missing, (num_missing*100.)/(total*1.0), total))\n",
190 | "df_bad = df_preds.loc[unmatched]\n",
191 | "\n",
192 | "def wrap_math(df_):\n",
193 | " \"\"\"Wrap the latex formulas with $ symbols.\"\"\"\n",
194 | " targets=[]; preds=[]; # image=[];\n",
195 | " for row in df_[['target_seq', 'pred_seq']].itertuples(index=True):\n",
196 | "# image.append(row[0])\n",
197 | " targets.append('$%s$'%row[1])\n",
198 | " preds.append('$%s$'%row[2])\n",
199 | " _df = df_.drop(['iloc'], axis=1).reset_index(drop=False).copy(deep=True)\n",
200 | " _df = _df.assign(y=targets, pred=preds)\n",
201 | " return _df.rename(columns={'pred':'$\\hat{y}$'})\n",
202 | "\n",
203 | "df_bad_sample_ = wrap_math(df_bad.sample(115))\n",
204 | "df_bad_sample_[['$\\hat{y}$', 'y']]"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": [
215 | "df_bad_sample_.columns"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "#### Filter MathJax Errors\n",
223 | "MathJax has difficulty rendering some of these formulas even though they rendered fine with pdflatex. We therefore drop those rows for visualization purposes, so that pandas can be leveraged to generate prettily formatted formulas."
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "collapsed": true,
231 | "scrolled": true
232 | },
233 | "outputs": [],
234 | "source": [
235 | "df_bad_sample = df_bad_sample_.drop([1,3,44,45,86,89,94,102,107,110,114]).iloc[:100].reset_index(drop=True)\n",
236 | "df_bad_sample[['$\\hat{y}$', 'y']]"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "metadata": {
243 | "collapsed": true
244 | },
245 | "outputs": [],
246 | "source": [
247 | "dump(df_bad, df_bad_sample, 'unmatched')"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {
254 | "collapsed": true
255 | },
256 | "outputs": [],
257 | "source": [
258 | "df_matched = df_preds[~df_preds.index.isin(unmatched + missing)]\n",
259 | "df_matched.shape"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": null,
265 | "metadata": {
266 | "collapsed": true
267 | },
268 | "outputs": [],
269 | "source": [
270 | "df_matched[df_matched.ed==0.0].shape"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {
277 | "collapsed": true,
278 | "scrolled": true
279 | },
280 | "outputs": [],
281 | "source": [
282 | "df_txt_matched = df_matched[df_matched.ed==0.0].sort_values(by='pred_len', ascending=False)\n",
283 | "df_txt_matched_sample_ = wrap_math(df_txt_matched[:100])\n",
284 | "df_txt_matched_sample_[['$\\hat{y}$', 'y']]"
285 | ]
286 | },
287 | {
288 | "cell_type": "code",
289 | "execution_count": null,
290 | "metadata": {
291 | "collapsed": true
292 | },
293 | "outputs": [],
294 | "source": [
295 | "dump(df_txt_matched, df_txt_matched_sample_, 'txt_matched')"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {
302 | "collapsed": true
303 | },
304 | "outputs": [],
305 | "source": [
306 | "df_img_matched.columns"
307 | ]
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "metadata": {
313 | "collapsed": true,
314 | "scrolled": true
315 | },
316 | "outputs": [],
317 | "source": [
318 | "df_img_matched = df_matched[df_matched.ed!=0.0].sort_values(by='pred_len', ascending=False)\n",
319 | "df_img_matched_sample_ = wrap_math(df_img_matched[:110])\n",
320 | "df_img_matched_sample_[['$\\hat{y}$', 'pred_len', 'y', 'target_len']]"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {
327 | "collapsed": true,
328 | "scrolled": true
329 | },
330 | "outputs": [],
331 | "source": [
332 | "df_img_matched_sample = df_img_matched_sample_.drop([29, 60, 89, 104]).reset_index(drop=True).iloc[:100]\n",
333 | "df_img_matched_sample[['$\\hat{y}$', 'pred_len', 'y', 'target_len']]"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "dump(df_img_matched, df_img_matched_sample, 'img_matched')"
345 | ]
346 | },
347 | {
348 | "cell_type": "markdown",
349 | "metadata": {},
350 | "source": [
351 | "# End"
352 | ]
353 | }
354 | ],
355 | "metadata": {
356 | "kernelspec": {
357 | "display_name": "Python 2",
358 | "language": "python",
359 | "name": "python2"
360 | },
361 | "language_info": {
362 | "codemirror_mode": {
363 | "name": "ipython",
364 | "version": 2
365 | },
366 | "file_extension": ".py",
367 | "mimetype": "text/x-python",
368 | "name": "python",
369 | "nbconvert_exporter": "python",
370 | "pygments_lexer": "ipython2",
371 | "version": "2.7.14"
372 | }
373 | },
374 | "nbformat": 4,
375 | "nbformat_minor": 2
376 | }
377 |
--------------------------------------------------------------------------------
/thirdparty/data/im2latex_formulas_downloaded.lst:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/thirdparty/data/im2latex_formulas_downloaded.lst
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Harvard NLP
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/Readme.md:
--------------------------------------------------------------------------------
1 | All code in this directory is taken from https://github.com/harvardnlp/im2markup under MIT license. The code is used for dataset preprocessing only.
2 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/LevSeq.py:
--------------------------------------------------------------------------------
1 | from Levenshtein import *
2 | from warnings import warn
3 |
4 | class StringMatcher:
5 | """A SequenceMatcher-like class built on the top of Levenshtein"""
6 |
7 | def _reset_cache(self):
8 | self._ratio = self._distance = None
9 | self._opcodes = self._editops = self._matching_blocks = None
10 |
11 | def __init__(self, isjunk=None, seq1='', seq2=''):
12 | if isjunk:
13 |             warn("isjunk NOT implemented, it will be ignored")
14 | self._str1, self._str2 = seq1, seq2
15 | self._reset_cache()
16 |
17 | def set_seqs(self, seq1, seq2):
18 | self._str1, self._str2 = seq1, seq2
19 | self._reset_cache()
20 |
21 | def set_seq1(self, seq1):
22 | self._str1 = seq1
23 | self._reset_cache()
24 |
25 | def set_seq2(self, seq2):
26 | self._str2 = seq2
27 | self._reset_cache()
28 |
29 | def get_opcodes(self):
30 | if not self._opcodes:
31 | if self._editops:
32 | self._opcodes = opcodes(self._editops, self._str1, self._str2)
33 | else:
34 | self._opcodes = opcodes(self._str1, self._str2)
35 | return self._opcodes
36 |
37 | def get_editops(self):
38 | if not self._editops:
39 | if self._opcodes:
40 | self._editops = editops(self._opcodes, self._str1, self._str2)
41 | else:
42 | self._editops = editops(self._str1, self._str2)
43 | return self._editops
44 |
45 | def get_matching_blocks(self):
46 | if not self._matching_blocks:
47 | self._matching_blocks = matching_blocks(self.get_opcodes(),
48 | self._str1, self._str2)
49 | return self._matching_blocks
50 |
51 | def ratio(self):
52 | if not self._ratio:
53 | self._ratio = ratio(self._str1, self._str2)
54 | return self._ratio
55 |
56 | def quick_ratio(self):
57 | # This is usually quick enough :o)
58 | if not self._ratio:
59 | self._ratio = ratio(self._str1, self._str2)
60 | return self._ratio
61 |
62 | def real_quick_ratio(self):
63 | len1, len2 = len(self._str1), len(self._str2)
64 | return 2.0 * min(len1, len2) / (len1 + len2)
65 |
66 | def distance(self):
67 | if not self._distance:
68 | self._distance = distance(self._str1, self._str2)
69 | return self._distance
70 |
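71 | # Usage sketch; the class mirrors difflib.SequenceMatcher but is backed by
72 | # python-Levenshtein, so ratio() follows Levenshtein.ratio():
73 | #
74 | #     >>> sm = StringMatcher(seq1='kitten', seq2='sitting')
75 | #     >>> sm.distance()
76 | #     3
77 | #     >>> round(sm.ratio(), 3)
78 | #     0.615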
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/__init__.py:
--------------------------------------------------------------------------------
1 | "Utilities for comparing sequences"
2 |
3 | __all__ = ["hamming", "levenshtein", "nlevenshtein", "jaccard", "sorensen",
4 | "fast_comp", "lcsubstrings", "ilevenshtein", "ifast_comp"]
5 |
6 | try:
7 | from .cdistance import *
8 | except ImportError:
9 | from ._pyimports import *
10 |
11 | from ._pyimports import jaccard, sorensen
12 |
13 | def quick_levenshtein(str1, str2):
14 | return fast_comp(str1, str2, transpositions=False)
15 |
16 | def iquick_levenshtein(str1, strs):
17 |     return ifast_comp(str1, strs, transpositions=False)
18 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_fastcomp.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | def fast_comp(seq1, seq2, transpositions=False):
4 | """Compute the distance between the two sequences `seq1` and `seq2` up to a
5 | maximum of 2 included, and return it. If the edit distance between the two
6 | sequences is higher than that, -1 is returned.
7 |
8 | If `transpositions` is `True`, transpositions will be taken into account for
9 | the computation of the distance. This can make a difference, e.g.:
10 |
11 | >>> fast_comp("abc", "bac", transpositions=False)
12 | 2
13 | >>> fast_comp("abc", "bac", transpositions=True)
14 | 1
15 |
16 | This is faster than `levenshtein` by an order of magnitude, but on the
17 | other hand is of limited use.
18 |
19 | The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.
20 | I've added transpositions support to the original code.
21 | """
22 | replace, insert, delete = "r", "i", "d"
23 |
24 | L1, L2 = len(seq1), len(seq2)
25 | if L1 < L2:
26 | L1, L2 = L2, L1
27 | seq1, seq2 = seq2, seq1
28 |
29 | ldiff = L1 - L2
30 | if ldiff == 0:
31 | models = (insert+delete, delete+insert, replace+replace)
32 | elif ldiff == 1:
33 | models = (delete+replace, replace+delete)
34 | elif ldiff == 2:
35 | models = (delete+delete,)
36 | else:
37 | return -1
38 |
39 | res = 3
40 | for model in models:
41 | i = j = c = 0
42 | while (i < L1) and (j < L2):
43 | if seq1[i] != seq2[j]:
44 | c = c+1
45 | if 2 < c:
46 | break
47 |
48 | if transpositions and ldiff != 2 \
49 | and i < L1 - 1 and j < L2 - 1 \
50 | and seq1[i+1] == seq2[j] and seq1[i] == seq2[j+1]:
51 | i, j = i+2, j+2
52 | else:
53 | cmd = model[c-1]
54 | if cmd == delete:
55 | i = i+1
56 | elif cmd == insert:
57 | j = j+1
58 | else:
59 | assert cmd == replace
60 | i,j = i+1, j+1
61 | else:
62 | i,j = i+1, j+1
63 |
64 | if 2 < c:
65 | continue
66 | elif i < L1:
67 | if L1-i <= model[c:].count(delete):
68 | c = c + (L1-i)
69 | else:
70 | continue
71 | elif j < L2:
72 | if L2-j <= model[c:].count(insert):
73 | c = c + (L2-j)
74 | else:
75 | continue
76 |
77 | if c < res:
78 | res = c
79 |
80 | if res == 3:
81 | res = -1
82 | return res
83 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_iterators.py:
--------------------------------------------------------------------------------
1 | from ._pyimports import levenshtein, fast_comp
2 |
3 | def ilevenshtein(seq1, seqs, max_dist=-1):
4 | """Compute the Levenshtein distance between the sequence `seq1` and the series
5 | of sequences `seqs`.
6 |
7 | `seq1`: the reference sequence
8 | `seqs`: a series of sequences (can be a generator)
9 |     `max_dist`: if provided and > 0, only the sequences whose distance from
10 |     the reference sequence is lower than or equal to this value will be returned.
11 |
12 | The return value is a series of pairs (distance, sequence).
13 |
14 |     The sequence objects in `seqs` are expected to be of the same kind as
15 | the reference sequence in the C implementation; the same holds true for
16 | `ifast_comp`.
17 | """
18 | for seq2 in seqs:
19 | dist = levenshtein(seq1, seq2, max_dist=max_dist)
20 | if dist != -1:
21 | yield dist, seq2
22 |
23 |
24 | def ifast_comp(seq1, seqs, transpositions=False):
25 |     """Return an iterator over all the sequences in `seqs` whose distance from
26 |     `seq1` is lower than or equal to 2. The sequences whose distance from the
27 |     reference sequence is higher than that are dropped.
28 |
29 | `seq1`: the reference sequence.
30 | `seqs`: a series of sequences (can be a generator)
31 |     `transpositions` has the same sense as in `fast_comp`.
32 |
33 | The return value is a series of pairs (distance, sequence).
34 |
35 | You might want to call `sorted()` on the iterator to get the results in a
36 |     meaningful order:
37 |
38 | >>> g = ifast_comp("foo", ["fo", "bar", "foob", "foo", "foobaz"])
39 | >>> sorted(g)
40 | [(0, 'foo'), (1, 'fo'), (1, 'foob')]
41 | """
42 | for seq2 in seqs:
43 | dist = fast_comp(seq1, seq2, transpositions)
44 | if dist != -1:
45 | yield dist, seq2
46 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_lcsubstrings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from array import array
4 |
5 |
6 | def lcsubstrings(seq1, seq2, positions=False):
7 | """Find the longest common substring(s) in the sequences `seq1` and `seq2`.
8 |
9 | If positions evaluates to `True` only their positions will be returned,
10 | together with their length, in a tuple:
11 |
12 | (length, [(start pos in seq1, start pos in seq2)..])
13 |
14 | Otherwise, the substrings themselves will be returned, in a set.
15 |
16 | Example:
17 |
18 | >>> lcsubstrings("sedentar", "dentist")
19 | {'dent'}
20 | >>> lcsubstrings("sedentar", "dentist", positions=True)
21 | (4, [(2, 0)])
22 | """
23 | L1, L2 = len(seq1), len(seq2)
24 | ms = []
25 | mlen = last = 0
26 | if L1 < L2:
27 | seq1, seq2 = seq2, seq1
28 | L1, L2 = L2, L1
29 |
30 | column = array('L', range(L2))
31 |
32 | for i in range(L1):
33 | for j in range(L2):
34 | old = column[j]
35 | if seq1[i] == seq2[j]:
36 | if i == 0 or j == 0:
37 | column[j] = 1
38 | else:
39 | column[j] = last + 1
40 | if column[j] > mlen:
41 | mlen = column[j]
42 | ms = [(i, j)]
43 | elif column[j] == mlen:
44 | ms.append((i, j))
45 | else:
46 | column[j] = 0
47 | last = old
48 |
49 | if positions:
50 | return (mlen, tuple((i - mlen + 1, j - mlen + 1) for i, j in ms if ms))
51 | return set(seq1[i - mlen + 1:i + 1] for i, _ in ms if ms)
52 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_levenshtein.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from array import array
4 |
5 |
6 | def levenshtein(seq1, seq2, max_dist=-1, normalized=False):
7 | """Compute the absolute Levenshtein distance between the two sequences
8 | `seq1` and `seq2`.
9 |
10 | The Levenshtein distance is the minimum number of edit operations necessary
11 | for transforming one sequence into the other. The edit operations allowed are:
12 |
13 | * deletion: ABC -> BC, AC, AB
14 | * insertion: ABC -> ABCD, EABC, AEBC..
15 | * substitution: ABC -> ABE, ADC, FBC..
16 |
17 | The `max_dist` parameter controls at which moment we should stop computing the
18 | distance between the provided sequences. If it is a negative integer, the
19 | distance will be computed until the sequences are exhausted; otherwise, the
20 | computation will stop at the moment the calculated distance is higher than
21 | `max_dist`, and then return -1. For example:
22 |
23 | >>> levenshtein("abc", "abcd", max_dist=1) # dist = 1
24 | 1
25 | >>> levenshtein("abc", "abcde", max_dist=1) # dist = 2
26 | -1
27 |
28 | This can be a time saver if you're not interested in the exact distance, but
29 | only need to check if the distance between the given sequences is below a
30 | given threshold.
31 |
32 | The `normalized` parameter is here for backward compatibility; providing
33 | it will result in a call to `nlevenshtein`, which should be used directly
34 | instead.
35 | """
36 | if normalized:
37 | return nlevenshtein(seq1, seq2, method=1)
38 |
39 | if seq1 == seq2:
40 | return 0
41 |
42 | len1, len2 = len(seq1), len(seq2)
43 | if max_dist >= 0 and abs(len1 - len2) > max_dist:
44 | return -1
45 | if len1 == 0:
46 | return len2
47 | if len2 == 0:
48 | return len1
49 | if len1 < len2:
50 | len1, len2 = len2, len1
51 | seq1, seq2 = seq2, seq1
52 |
53 | column = array('L', range(len2 + 1))
54 |
55 | for x in range(1, len1 + 1):
56 | column[0] = x
57 | last = x - 1
58 | for y in range(1, len2 + 1):
59 | old = column[y]
60 | cost = int(seq1[x - 1] != seq2[y - 1])
61 | column[y] = min(column[y] + 1, column[y - 1] + 1, last + cost)
62 | last = old
63 | if max_dist >= 0 and min(column) > max_dist:
64 | return -1
65 |
66 | if max_dist >= 0 and column[len2] > max_dist:
67 | # stay consistent, even if we have the exact distance
68 | return -1
69 | return column[len2]
70 |
71 |
72 | def nlevenshtein(seq1, seq2, method=1):
73 | """Compute the normalized Levenshtein distance between `seq1` and `seq2`.
74 |
75 | Two normalization methods are provided. For both of them, the normalized
76 | distance will be a float between 0 and 1, where 0 means equal and 1
77 | completely different. The computation obeys the following patterns:
78 |
79 | 0.0 if seq1 == seq2
80 | 1.0 if len(seq1) == 0 or len(seq2) == 0
81 | edit distance / factor otherwise
82 |
83 | The `method` parameter specifies which normalization factor should be used.
84 | It can have the value 1 or 2, which correspond to the following:
85 |
86 | 1: the length of the shortest alignment between the sequences
87 | (that is, the length of the longest sequence)
88 | 2: the length of the longest alignment between the sequences
89 |
90 | Which normalization factor should be chosen is a matter of taste. The first
91 | one is cheap to compute. The second one is more costly, but it accounts
92 | better than the first one for parallelisms of symbols between the sequences.
93 |
94 | For the rationale behind the use of the second method, see:
95 | Heeringa, "Measuring Dialect Pronunciation Differences using Levenshtein
96 | Distance", 2004, p. 130 sq, which is available online at:
97 | http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf
98 | """
99 |
100 | if seq1 == seq2:
101 | return 0.0
102 | len1, len2 = len(seq1), len(seq2)
103 | if len1 == 0 or len2 == 0:
104 | return 1.0
105 | if len1 < len2: # minimize the arrays size
106 | len1, len2 = len2, len1
107 | seq1, seq2 = seq2, seq1
108 |
109 | if method == 1:
110 | return levenshtein(seq1, seq2) / float(len1)
111 | if method != 2:
112 | raise ValueError("expected either 1 or 2 for `method` parameter")
113 |
114 | column = array('L', range(len2 + 1))
115 | length = array('L', range(len2 + 1))
116 |
117 | for x in range(1, len1 + 1):
118 |
119 | column[0] = length[0] = x
120 | last = llast = x - 1
121 |
122 | for y in range(1, len2 + 1):
123 |
124 | # dist
125 | old = column[y]
126 | ic = column[y - 1] + 1
127 | dc = column[y] + 1
128 | rc = last + (seq1[x - 1] != seq2[y - 1])
129 | column[y] = min(ic, dc, rc)
130 | last = old
131 |
132 | # length
133 | lold = length[y]
134 | lic = length[y - 1] + 1 if ic == column[y] else 0
135 | ldc = length[y] + 1 if dc == column[y] else 0
136 | lrc = llast + 1 if rc == column[y] else 0
137 | length[y] = max(ldc, lic, lrc)
138 | llast = lold
139 |
140 | return column[y] / float(length[y])
141 |
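142 | # Worked example: levenshtein("abc", "acd") == 2 (substitute b->c and c->d, or
143 | # delete b and insert d). The shortest alignment has length 3, while the
144 | # longest one (the insert/delete path) has length 4, hence:
145 | #
146 | #     >>> nlevenshtein("abc", "acd", method=1)
147 | #     0.6666666666666666
148 | #     >>> nlevenshtein("abc", "acd", method=2)
149 | #     0.5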
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_pyimports.py:
--------------------------------------------------------------------------------
1 | from ._fastcomp import *
2 | from ._lcsubstrings import *
3 | from ._levenshtein import *
4 | from ._simpledists import *
5 | from ._iterators import *
6 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_simpledists.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | def hamming(seq1, seq2, normalized=False):
4 | """Compute the Hamming distance between the two sequences `seq1` and `seq2`.
5 | The Hamming distance is the number of differing items in two ordered
6 | sequences of the same length. If the sequences submitted do not have the
7 | same length, an error will be raised.
8 |
9 | If `normalized` evaluates to `False`, the return value will be an integer
10 | between 0 and the length of the sequences provided, edge values included;
11 | otherwise, it will be a float between 0 and 1 included, where 0 means
12 | equal, and 1 totally different. Normalized hamming distance is computed as:
13 |
14 | 0.0 if len(seq1) == 0
15 | hamming_dist / len(seq1) otherwise
16 | """
17 | L = len(seq1)
18 | if L != len(seq2):
19 | raise ValueError("expected two strings of the same length")
20 | if L == 0:
21 | return 0.0 if normalized else 0 # equal
22 | dist = sum(c1 != c2 for c1, c2 in zip(seq1, seq2))
23 | if normalized:
24 | return dist / float(L)
25 | return dist
26 |
27 | def jaccard(seq1, seq2):
28 | """Compute the Jaccard distance between the two sequences `seq1` and `seq2`.
29 | They should contain hashable items.
30 |
31 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different.
32 | """
33 | set1, set2 = set(seq1), set(seq2)
34 | return 1 - len(set1 & set2) / float(len(set1 | set2))
35 |
36 |
37 | def sorensen(seq1, seq2):
38 | """Compute the Sorensen distance between the two sequences `seq1` and `seq2`.
39 | They should contain hashable items.
40 |
41 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different.
42 | """
43 | set1, set2 = set(seq1), set(seq2)
44 | return 1 - (2 * len(set1 & set2) / float(len(set1) + len(set2)))
45 |
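46 | # Worked examples, computed from the definitions above:
47 | #
48 | #     >>> hamming("abcd", "abce")
49 | #     1
50 | #     >>> jaccard("decide", "resize")   # 1 - |{e,i}| / |{c,d,e,i,r,s,z}| = 1 - 2/7
51 | #     0.7142857142857143
52 | #     >>> sorensen("decide", "resize")  # 1 - (2*2) / (4+5)
53 | #     0.5555555555555556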
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/evaluate_bleu.py:
--------------------------------------------------------------------------------
1 | import os, sys, copy, argparse, shutil, pickle, subprocess, logging
2 |
3 | def process_args(args):
4 | parser = argparse.ArgumentParser(description='Evaluate BLEU score')
5 | parser.add_argument('--result-path', dest='result_path',
6 | type=str, required=True,
7 |                         help=('Result file containing <img_path> <label_gold> <label_pred> <score_pred> <score_gold> per line (tab-separated). This should be set to the output file of the model.'
8 | ))
9 | parser.add_argument('--data-path', dest='data_path',
10 | type=str, required=True,
11 |                         help=('Input file which contains the samples to be evaluated. The format is <img_path> <label_idx> per line.'
12 | ))
13 | parser.add_argument('--label-path', dest='label_path',
14 | type=str, required=True,
15 | help=('Gold label file which contains a tokenized formula per line.'
16 | ))
17 | parser.add_argument('--log-path', dest="log_path",
18 | type=str, default='log.txt',
19 | help=('Log file path, default=log.txt'
20 | ))
21 | parameters = parser.parse_args(args)
22 | return parameters
23 |
24 | def main(args):
25 | script_path = os.path.realpath(__file__)
26 | script_dir = os.path.dirname(script_path)
27 | app_dir = os.path.join(script_dir, '../..')
28 |
29 | parameters = process_args(args)
30 | logging.basicConfig(
31 | level=logging.INFO,
32 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
33 | filename=parameters.log_path)
34 |
35 | console = logging.StreamHandler()
36 | console.setLevel(logging.INFO)
37 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
38 | console.setFormatter(formatter)
39 | logging.getLogger('').addHandler(console)
40 |
41 | logging.info('Script being executed: %s'%__file__)
42 |
43 | label_path = parameters.label_path
44 | data_path = parameters.data_path
45 | result_path = parameters.result_path
46 | assert os.path.exists(label_path), 'Label file %s not found'%label_path
47 | assert os.path.exists(data_path), 'Data file %s not found'%data_path
48 | assert os.path.exists(result_path), 'Result file %s not found'%result_path
49 |
50 | labels_tmp = {}
51 | labels = {}
52 | with open(label_path) as flabel:
53 | with open(data_path) as fdata:
54 | line_idx = 0
55 | for line in flabel:
56 | labels_tmp[line_idx] = line.strip()
57 | line_idx += 1
58 | for line in fdata:
59 | img_path, idx = line.strip().split()
60 | labels[img_path] = labels_tmp[int(idx)]
61 |
62 | results = {}
63 | with open(result_path) as fin:
64 | for line_idx,line in enumerate(fin):
65 | if line_idx % 1000 == 0:
66 | print (line_idx)
67 | items = line.strip().split('\t')
68 | if len(items) == 5:
69 | img_path, label_gold, label_pred, score_pred, score_gold = items
70 | if not img_path in labels:
71 | logging.warning('%s in result file while not in the gold file!'%img_path)
72 | results[img_path] = label_pred+'\n'
73 |
74 | fpred = open('.tmp.pred.txt', 'w')
75 | fgold = open('.tmp.gold.txt', 'w')
76 | for img_path in labels:
77 | fpred.write(results.setdefault(img_path, '\n'))
78 | fgold.write(labels[img_path]+'\n')
79 | fpred.close()
80 | fgold.close()
81 | metric = subprocess.check_output('perl third_party/multi-bleu.perl %s < %s'%('.tmp.gold.txt', '.tmp.pred.txt'), shell=True)
82 | #os.remove('.tmp.pred.txt')
83 | #os.remove('.tmp.gold.txt')
84 | logging.info(metric)
85 |
86 | if __name__ == '__main__':
87 | main(sys.argv[1:])
88 | logging.info('Jobs finished')
89 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/evaluate_text_edit_distance.py:
--------------------------------------------------------------------------------
1 | import os, sys, argparse, logging
2 | import distance
3 |
4 |
5 | def process_args(args):
6 | parser = argparse.ArgumentParser(description='Evaluate text edit distance.')
7 |
8 | parser.add_argument('--result-path', dest='result_path',
9 | type=str, required=True,
10 | help=('Result file containing <img_path> <label_gold> <label_pred> <score_pred> <score_gold> (tab-separated) per line. This should be set to the output file of the model.'
11 | ))
12 |
13 | parser.add_argument('--log-path', dest="log_path",
14 | type=str, default='log.txt',
15 | help=('Log file path, default=log.txt'
16 | ))
17 | parameters = parser.parse_args(args)
18 | return parameters
19 |
20 | def main(args):
21 | parameters = process_args(args)
22 | logging.basicConfig(
23 | level=logging.INFO,
24 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
25 | filename=parameters.log_path)
26 |
27 | console = logging.StreamHandler()
28 | console.setLevel(logging.INFO)
29 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
30 | console.setFormatter(formatter)
31 | logging.getLogger('').addHandler(console)
32 |
33 | logging.info('Script being executed: %s'%__file__)
34 |
35 | result_file = parameters.result_path
36 | total_ref = 0
37 | total_edit_distance = 0
38 | with open(result_file) as fin:
39 | for idx,line in enumerate(fin):
40 | if idx % 100 == 0:
41 | print (idx)
42 | items = line.strip().split('\t')
43 | if len(items) == 5:
44 | img_path, label_gold, label_pred, score_pred, score_gold = items
45 | l_pred = label_pred.strip()
46 | l_gold = label_gold.strip()
47 | tokens_pred = l_pred.split(' ')
48 | tokens_gold = l_gold.split(' ')
49 | ref = max(len(tokens_gold), len(tokens_pred))
50 | edit_distance = distance.levenshtein(tokens_gold, tokens_pred)
51 | total_ref += ref
52 | total_edit_distance += edit_distance
53 | logging.info('Edit Distance Accuracy: %f'%(1.-float(total_edit_distance)/total_ref))
54 |
55 | if __name__ == '__main__':
56 | main(sys.argv[1:])
57 | logging.info('Jobs finished')
58 |
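The `distance` package imported above is vendored under scripts/evaluation/distance; the metric itself is plain token-level Levenshtein distance, normalized by the length of the longer sequence. A self-contained sketch of the same computation:

```python
# Self-contained token-level edit distance, matching distance.levenshtein
# on lists of tokens (a sketch; the script uses the vendored package).
def levenshtein(a, b):
    prev = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        curr = [i]
        for j, y in enumerate(b, 1):
            curr.append(min(prev[j] + 1,              # deletion
                            curr[j - 1] + 1,          # insertion
                            prev[j - 1] + (x != y)))  # substitution
        prev = curr
    return prev[-1]

gold = 'x ^ { 2 } + 1'.split()
pred = 'x ^ { 2 } + y'.split()
ref = max(len(gold), len(pred))
print(1.0 - levenshtein(gold, pred) / ref)  # 0.857...: one substitution over 7 tokens
```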
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/render_html.py:
--------------------------------------------------------------------------------
1 | import sys, os, re, shutil, argparse, logging
2 | sys.path.insert(0, '%s'%os.path.join(os.path.dirname(__file__), '../utils/'))
3 | from image_utils import *
4 | from multiprocessing import Pool
5 | from multiprocessing.dummy import Pool as ThreadPool
6 |
7 | W=100
8 | H=100
9 |
10 | def process_args(args):
11 | parser = argparse.ArgumentParser(description='Render HTML files for comparison. Note that we render both the predicted results, and the original HTMLs.')
12 |
13 | parser.add_argument('--result-path', dest='result_path',
14 | type=str, required=True,
15 | help=('Result file containing <img_path> <label_gold> <label_pred> <score_pred> <score_gold> (tab-separated) per line. This should be set to the output file of the model.'
16 | ))
17 | parser.add_argument('--output-dir', dest='output_dir',
18 | type=str, required=True,
19 | help=('Output directory to put the rendered images. A subfolder with name "images_gold" will be created for the rendered gold images, and a subfolder with name "images_pred" will be created for the rendered predictions.'
20 | ))
21 |
22 | parser.add_argument('--replace', dest='replace', action='store_true',
23 | help=('Replace flag; if not set, images that already exist will be left untouched.'
24 | ))
25 | parser.add_argument('--no-replace', dest='replace', action='store_false')
26 | parser.set_defaults(replace=False)
27 | parser.add_argument('--num-threads', dest='num_threads',
28 | type=int, default=4,
29 | help=('Number of threads, default=4.'
30 | ))
31 | parser.add_argument('--log-path', dest="log_path",
32 | type=str, default='log.txt',
33 | help=('Log file path, default=log.txt'
34 | ))
35 | parameters = parser.parse_args(args)
36 | return parameters
37 |
38 | def main(args):
39 | parameters = process_args(args)
40 | logging.basicConfig(
41 | level=logging.INFO,
42 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
43 | filename=parameters.log_path)
44 |
45 | console = logging.StreamHandler()
46 | console.setLevel(logging.INFO)
47 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
48 | console.setFormatter(formatter)
49 | logging.getLogger('').addHandler(console)
50 |
51 | logging.info('Script being executed: %s'%__file__)
52 |
53 | result_path = parameters.result_path
54 | output_dir = parameters.output_dir
55 | assert os.path.exists(result_path), result_path
56 |
57 | pred_dir = os.path.join(output_dir, 'images_pred')
58 | gold_dir = os.path.join(output_dir, 'images_gold')
59 | for dirname in [pred_dir, gold_dir]:
60 | if not os.path.exists(dirname):
61 | os.makedirs(dirname)
62 |
63 | lines = []
64 | with open(result_path) as fin:
65 | for idx,line in enumerate(fin):
66 | items = line.strip().split('\t')
67 | if len(items) == 5:
68 | img_path, label_gold, label_pred, score_pred, score_gold = items
69 | img_idx = img_path[:-9]
70 | lines.append((label_pred, img_idx, pred_dir, parameters.replace))
71 | lines.append((label_gold, img_idx, gold_dir, parameters.replace))
72 |
73 | logging.info('Creating pool with %d threads'%parameters.num_threads)
74 | pool = ThreadPool(parameters.num_threads)
75 | logging.info('Jobs running...')
76 | results = pool.map(main_parallel, lines)
77 | pool.close()
78 | pool.join()
79 |
80 |
81 | def main_parallel(l):
82 | label, img_idx, dirname, replace = l
83 | if replace or (not os.path.exists('%s/%s-full.png'%(dirname, img_idx))):
84 | html_name = '%s_%s.html'%(dirname, img_idx)
85 | with open(html_name, 'w') as fout:
86 | fout.write(label)
87 | os.system('webkit2png --clipwidth=1 --clipheight=1 -Fs 1 -W %d -H %d %s -o %s/%s'%(W,H,html_name,dirname,img_idx))
88 | os.remove(html_name)
89 |
90 | if __name__ == '__main__':
91 | main(sys.argv[1:])
92 | logging.info('Jobs finished')
93 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/evaluation/render_latex.py:
--------------------------------------------------------------------------------
1 | import sys, os, re, shutil, argparse, logging
2 | sys.path.insert(0, '%s'%os.path.join(os.path.dirname(__file__), '../utils/'))
3 | from utils import run
4 | from image_utils import *
5 | from multiprocessing import Pool
6 | from multiprocessing.dummy import Pool as ThreadPool
7 |
8 |
9 | TIMEOUT = 10
10 |
11 | # replace \pmatrix with \begin{pmatrix}\end{pmatrix}
12 | # replace \matrix with \begin{matrix}\end{matrix}
13 | template = r"""
14 | \documentclass[12pt]{article}
15 | \pagestyle{empty}
16 | \usepackage{amsmath}
17 | \newcommand{\mymatrix}[1]{\begin{matrix}#1\end{matrix}}
18 | \newcommand{\mypmatrix}[1]{\begin{pmatrix}#1\end{pmatrix}}
19 | \begin{document}
20 | \begin{displaymath}
21 | %s
22 | \end{displaymath}
23 | \end{document}
24 | """
25 |
26 |
27 | def process_args(args):
28 | parser = argparse.ArgumentParser(description='Render LaTeX formulas for comparison. Note that we need to render both the predicted results and the original formulas, since we need to make sure the same rendering environment is used.')
29 |
30 | parser.add_argument('--result-path', dest='result_path',
31 | type=str, required=True,
32 | help=('Result file containing <img_path> <label_gold> <label_pred> <score_pred> <score_gold> (tab-separated) per line. This should be set to the output file of the model.'
33 | ))
34 | parser.add_argument('--data-path', dest='data_path',
35 | type=str, required=True,
36 | help=('Input file which contains the samples to be evaluated. The format is <img_path> <line_idx> per line.'
37 | ))
38 | parser.add_argument('--label-path', dest='label_path',
39 | type=str, required=True,
40 | help=('Gold label file which contains a formula per line. Note that this does not necessarily need to be tokenized; for comparing against the gold standard, the original (un-preprocessed) label file should be used.'
41 | ))
42 | parser.add_argument('--output-dir', dest='output_dir',
43 | type=str, required=True,
44 | help=('Output directory to put the rendered images. A subfolder with name "images_gold" will be created for the rendered gold images, and a subfolder with name "images_pred" will be created for the rendered predictions.'
45 | ))
46 |
47 | parser.add_argument('--replace', dest='replace', action='store_true',
48 | help=('Replace flag; if not set, images that already exist will be left untouched.'
49 | ))
50 | parser.add_argument('--no-replace', dest='replace', action='store_false')
51 | parser.set_defaults(replace=False)
52 | parser.add_argument('--num-threads', dest='num_threads',
53 | type=int, default=4,
54 | help=('Number of threads, default=4.'
55 | ))
56 | parser.add_argument('--log-path', dest="log_path",
57 | type=str, default='log.txt',
58 | help=('Log file path, default=log.txt'
59 | ))
60 | parameters = parser.parse_args(args)
61 | return parameters
62 |
63 | def main(args):
64 | parameters = process_args(args)
65 | logging.basicConfig(
66 | level=logging.INFO,
67 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
68 | filename=parameters.log_path)
69 |
70 | console = logging.StreamHandler()
71 | console.setLevel(logging.INFO)
72 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
73 | console.setFormatter(formatter)
74 | logging.getLogger('').addHandler(console)
75 |
76 | logging.info('Script being executed: %s'%__file__)
77 |
78 | result_path = parameters.result_path
79 | data_path = parameters.data_path
80 | label_path = parameters.label_path
81 | output_dir = parameters.output_dir
82 | assert os.path.exists(label_path), label_path
83 | assert os.path.exists(result_path), result_path
84 | assert os.path.exists(data_path), data_path
85 |
86 | pred_dir = os.path.join(output_dir, 'images_pred')
87 | gold_dir = os.path.join(output_dir, 'images_gold')
88 | for dirname in [pred_dir, gold_dir]:
89 | if not os.path.exists(dirname):
90 | os.makedirs(dirname)
91 |
92 |
93 | formulas = open(label_path).readlines()
94 | lines = []
95 | with open(data_path) as fin:
96 | for line in fin:
97 | img_path, line_idx = line.strip().split()
98 | lines.append((img_path, formulas[int(line_idx)], os.path.join(gold_dir, img_path), parameters.replace))
99 | with open(result_path) as fin:
100 | for line in fin:
101 | img_path, label_gold, label_pred, _, _ = line.strip().split('\t')
102 | lines.append((img_path, label_pred, os.path.join(pred_dir, img_path), parameters.replace))
103 | logging.info('Creating pool with %d threads'%parameters.num_threads)
104 | pool = ThreadPool(parameters.num_threads)
105 | logging.info('Jobs running...')
106 | results = pool.map(main_parallel, lines)
107 | pool.close()
108 | pool.join()
109 |
110 | def output_err(output_path, i, reason, img):
111 | logging.info('ERROR: %s %s\n'%(img,reason))
112 |
113 | def main_parallel(line):
114 | img_path, l, output_path, replace = line
115 | pre_name = output_path.replace('/', '_').replace('.','_')
116 | l = l.strip()
117 | l = l.replace(r'\pmatrix', r'\mypmatrix')
118 | l = l.replace(r'\matrix', r'\mymatrix')
119 | # remove leading comments
120 | l = l.strip('%')
121 | if len(l) == 0:
122 | l = '\\hspace{1cm}'
123 | # \hspace {1 . 5 cm} -> \hspace {1.5cm}
124 | for space in ["hspace", "vspace"]:
125 | match = re.finditer(space + " {(.*?)}", l)
126 | if match:
127 | new_l = ""
128 | last = 0
129 | for m in match:
130 | new_l = new_l + l[last:m.start(1)] + m.group(1).replace(" ", "")
131 | last = m.end(1)
132 | new_l = new_l + l[last:]
133 | l = new_l
134 | if replace or (not os.path.exists(output_path)):
135 | tex_filename = pre_name+'.tex'
136 | log_filename = pre_name+'.log'
137 | aux_filename = pre_name+'.aux'
138 | with open(tex_filename, "w") as w:
139 | w.write((template%l) + '\n')  # portable replacement for the Python 2 'print >> w' statement
140 | run("pdflatex -interaction=nonstopmode %s >/dev/null"%tex_filename, TIMEOUT)
141 | os.remove(tex_filename)
142 | os.remove(log_filename)
143 | os.remove(aux_filename)
144 | pdf_filename = tex_filename[:-4]+'.pdf'
145 | png_filename = tex_filename[:-4]+'.png'
146 | if not os.path.exists(pdf_filename):
147 | output_err(output_path, 0, 'cannot compile', img_path)
148 | else:
149 | os.system("convert -density 200 -quality 100 %s %s"%(pdf_filename, png_filename))
150 | os.remove(pdf_filename)
151 | if os.path.exists(png_filename):
152 | crop_image(png_filename, output_path)
153 | os.remove(png_filename)
154 |
155 |
156 | if __name__ == '__main__':
157 | main(sys.argv[1:])
158 | logging.info('Jobs finished')
159 |
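The whitespace fix-up in `main_parallel` collapses spaces inside `\hspace {...}` and `\vspace {...}` arguments before rendering. Note that `re.finditer` always returns an iterator, so the `if match:` guard above is always true; the loop simply does nothing when there are no matches. A small sketch of that rewrite in isolation:

```python
import re

# Sketch of the rewrite in main_parallel: '\hspace {1 . 5 cm}' -> '\hspace {1.5cm}'.
def collapse_space_args(l):
    for space in ("hspace", "vspace"):
        new_l, last = "", 0
        for m in re.finditer(space + r" {(.*?)}", l):
            new_l += l[last:m.start(1)] + m.group(1).replace(" ", "")
            last = m.end(1)
        l = new_l + l[last:]
    return l

print(collapse_space_args(r"a \hspace {1 . 5 cm} b"))  # a \hspace {1.5cm} b
```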
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/preprocessing/generate_latex_vocab.py:
--------------------------------------------------------------------------------
1 | import sys, logging, argparse, os
2 |
3 | def process_args(args):
4 | parser = argparse.ArgumentParser(description='Generate vocabulary file.')
5 |
6 | parser.add_argument('--data-path', dest='data_path',
7 | type=str, required=True,
8 | help=('Input file containing <img_path> <line_idx> per line. This should be the file used for training.'
9 | ))
10 | parser.add_argument('--label-path', dest='label_path',
11 | type=str, required=True,
12 | help=('Input file containing a tokenized formula per line.'
13 | ))
14 | parser.add_argument('--output-file', dest='output_file',
15 | type=str, required=True,
16 | help=('Output file for the generated vocabulary.'
17 | ))
18 | parser.add_argument('--unk-threshold', dest='unk_threshold',
19 | type=int, default=1,
20 | help=('If the number of occurrences of a token is less than or equal to the threshold, it will be excluded from the generated vocabulary.'
21 | ))
22 | parser.add_argument('--log-path', dest="log_path",
23 | type=str, default='log.txt',
24 | help=('Log file path, default=log.txt'
25 | ))
26 | parameters = parser.parse_args(args)
27 | return parameters
28 |
29 | def main(args):
30 | parameters = process_args(args)
31 | logging.basicConfig(
32 | level=logging.INFO,
33 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
34 | filename=parameters.log_path)
35 |
36 | console = logging.StreamHandler()
37 | console.setLevel(logging.INFO)
38 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
39 | console.setFormatter(formatter)
40 | logging.getLogger('').addHandler(console)
41 |
42 | logging.info('Script being executed: %s'%__file__)
43 |
44 | label_path = parameters.label_path
45 | assert os.path.exists(label_path), label_path
46 | data_path = parameters.data_path
47 | assert os.path.exists(data_path), data_path
48 |
49 | formulas = open(label_path).readlines()
50 | vocab = {}
51 | max_len = 0
52 | with open(data_path) as fin:
53 | for line in fin:
54 | _, line_idx = line.strip().split()
55 | line_strip = formulas[int(line_idx)].strip()
56 | tokens = line_strip.split()
57 | tokens_out = []
58 | for token in tokens:
59 | tokens_out.append(token)
60 | if token not in vocab:
61 | vocab[token] = 0
62 | vocab[token] += 1
63 |
64 | vocab_sort = sorted(list(vocab.keys()))
65 | vocab_out = []
66 | unk_tokens = []
67 | num_unknown = 0
68 | for word in vocab_sort:
69 | if vocab[word] > parameters.unk_threshold:
70 | vocab_out.append(word)
71 | else:
72 | unk_tokens.append(word)
73 | num_unknown += 1
74 | #vocab = ["'"+word.replace('\\','\\\\').replace('\'', '\\\'')+"'" for word in vocab_out]
75 | vocab = [word for word in vocab_out]
76 |
77 | with open(parameters.output_file, 'w') as fout:
78 | fout.write('\n'.join(vocab))
79 | logging.info('#UNK\'s: %d'%num_unknown)
80 | logging.info('UNK tokens:\n%s', unk_tokens)
81 |
82 | if __name__ == '__main__':
83 | main(sys.argv[1:])
84 | logging.info('Jobs finished')
85 |
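The counting loop above amounts to a `collections.Counter` over all tokens of the referenced formulas, followed by the strictly-greater-than-threshold cut. A compact sketch with toy formulas:

```python
from collections import Counter

# Sketch: the same vocabulary rule as above, with a Counter.
# A token is kept only if it occurs strictly more than unk_threshold times.
formulas = ['x ^ { 2 }', '\\frac { x } { 2 }']
counts = Counter()
for formula in formulas:
    counts.update(formula.split())

unk_threshold = 1
vocab = sorted(tok for tok, n in counts.items() if n > unk_threshold)
print(vocab)  # ['2', 'x', '{', '}'] -- '^' and '\frac' occur once and become UNKs
```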
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/preprocessing/preprocess_filter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys, os, argparse, logging
3 | import numpy as np
4 | import PIL
5 | from PIL import Image
6 |
7 | def process_args(args):
8 | parser = argparse.ArgumentParser(description='Process im2latex-100k train, test, development files (<line_idx> <img_path> <mode> per line) into files that can be used for training (<img_path> <line_idx> per line). Additionally, if the filter flag is set, large images, too-long formulas, and formulas that cannot be parsed will be discarded.')
9 |
10 | parser.add_argument('--image-dir', dest='image_dir',
11 | type=str, default='',
12 | help=('Directory containing processed images.'
13 | ))
14 | parser.add_argument('--data-path', dest='data_path',
15 | type=str, required=True,
16 | help=('Input file path containing <line_idx> <img_path> <mode> per line. Note that <img_path> does not contain the postfix.'
17 | ))
18 | parser.add_argument('--output-path', dest='output_path',
19 | type=str, required=True,
20 | help=('Output file path containing <img_path> <line_idx> per line. Note that <img_path> does contain the postfix. If the filter flag is set, the output file may have fewer lines than the original file.'
21 | ))
22 |
23 | parser.add_argument('--label-path', dest='label_path',
24 | type=str, default='',
25 | help=('Input label path containing <formula> per line. This is required if the filter flag is set; data points with blank formulas will be discarded.'
26 | ))
27 | parser.add_argument('--filter', dest='filter', action='store_true',
28 | help=('Filter flag; if set, images that are too large and formulas that cannot be parsed or have too many tokens will be discarded.'
29 | ))
30 | parser.add_argument('--no-filter', dest='filter', action='store_false')
31 | parser.set_defaults(filter=False)
32 | parser.add_argument('--max-width', dest='max_width',
33 | type=int, default=500,
34 | help=('If the filter flag is set, images with larger width than max-width will be discarded from the output file.'
35 | ))
36 | parser.add_argument('--max-height', dest='max_height',
37 | type=int, default=160,
38 | help=('If the filter flag is set, images with larger height than max-height will be discarded from the output file.'
39 | ))
40 | parser.add_argument('--max-tokens', dest='max_tokens',
41 | type=int, default=150,
42 | help=('If filter flag is set, formulas with more than max-tokens tokens will be discarded in the output file.'
43 | ))
44 | parser.add_argument('--log-path', dest="log_path",
45 | type=str, default='log.txt',
46 | help=('Log file path, default=log.txt'
47 | ))
48 | parser.add_argument('--postfix', dest='postfix',
49 | type=str, default='.png',
50 | help=('The format of images, default=".png".'
51 | ))
52 | parameters = parser.parse_args(args)
53 | return parameters
54 |
55 | def main(args):
56 | parameters = process_args(args)
57 | logging.basicConfig(
58 | level=logging.INFO,
59 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
60 | filename=parameters.log_path)
61 |
62 | console = logging.StreamHandler()
63 | console.setLevel(logging.INFO)
64 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
65 | console.setFormatter(formatter)
66 | logging.getLogger('').addHandler(console)
67 |
68 | logging.info('Script being executed: %s'%__file__)
69 | data_path = parameters.data_path
70 | output_path = parameters.output_path
71 | image_dir = parameters.image_dir
72 |
73 | num_discard = 0
74 | num_nonexist = 0
75 |
76 | if parameters.filter:
77 | assert os.path.isfile(parameters.label_path), parameters.label_path
78 | labels = open(parameters.label_path).readlines()
79 | with open(output_path, 'w') as fout:
80 | with open(data_path, 'r') as fdata:
81 | for line in fdata:
82 | line_strip = line.strip()
83 | if len(line_strip) > 0:
84 | line_idx, img_path, mod = line_strip.split()
85 | img_path = os.path.join(image_dir, img_path) + parameters.postfix
86 | if parameters.filter:
87 | if not os.path.exists(img_path):
88 | logging.warning('%s does not exist!'%os.path.basename(img_path))
89 | num_nonexist += 1
90 | continue
91 | old_im = Image.open(img_path)
92 | old_size = old_im.size
93 | w = old_size[0]
94 | h = old_size[1]
95 | else:
96 | w = 0
97 | h = 0
98 | if (not parameters.filter) or (w <= parameters.max_width and h <= parameters.max_height):
99 | if parameters.filter:
100 | label = labels[int(line_idx)]
101 | if len(label.strip()) == 0:
102 | logging.info('%s discarded because its formula could not be parsed!'%os.path.basename(img_path))
103 | continue
104 | if len(label.strip().split()) > parameters.max_tokens:
105 | logging.info('%s discarded due to too many tokens!'%os.path.basename(img_path))
106 | continue
107 | fout.write('%s %s\n'%(os.path.basename(img_path),line_idx))
108 | else:
109 | logging.info('%s discarded due to large image size!'%os.path.basename(img_path))
110 | num_discard += 1
111 | logging.info('%d discarded. %d not found in %s.'%(num_discard, num_nonexist, image_dir))
112 |
113 |
114 | if __name__ == '__main__':
115 | main(sys.argv[1:])
116 | logging.info('Jobs finished')
117 |
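The three discard conditions (oversized image, blank/unparseable formula, too many tokens) can be read as a single keep/discard predicate. A sketch under the default limits from `process_args`:

```python
# Sketch: the keep/discard rule applied above, with the defaults from process_args.
def keep(width, height, formula, max_width=500, max_height=160, max_tokens=150):
    if width > max_width or height > max_height:
        return False  # image too large
    tokens = formula.strip().split()
    if not tokens:
        return False  # blank formula: could not be parsed
    return len(tokens) <= max_tokens  # False when the formula has too many tokens

print(keep(400, 100, 'x ^ { 2 }'))  # True
print(keep(600, 100, 'x ^ { 2 }'))  # False: wider than max_width
```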
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/preprocessing/preprocess_formulas.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # tokenize latex formulas
3 | import sys, os, argparse, logging, subprocess, shutil
4 |
5 | def is_ascii(s):  # 's' instead of 'str' to avoid shadowing the built-in
6 | try:
7 | s.encode('ascii')  # works on both Python 2 byte strings and Python 3 str
8 | return True
9 | except UnicodeError:
10 | return False
11 |
12 | def process_args(args):
13 | parser = argparse.ArgumentParser(description='Preprocess (tokenize or normalize) latex formulas')
14 |
15 | parser.add_argument('--mode', dest='mode',
16 | choices=['tokenize', 'normalize'], required=True,
17 | help=('Tokenize (split to tokens separated by space) or normalize (further translate to an equivalent standard form).'
18 | ))
19 | parser.add_argument('--input-file', dest='input_file',
20 | type=str, required=True,
21 | help=('Input file containing latex formulas. One formula per line.'
22 | ))
23 | parser.add_argument('--output-file', dest='output_file',
24 | type=str, required=True,
25 | help=('Output file.'
26 | ))
27 | parser.add_argument('--num-threads', dest='num_threads',
28 | type=int, default=4,
29 | help=('Number of threads, default=4.'
30 | ))
31 | parser.add_argument('--log-path', dest="log_path",
32 | type=str, default='log.txt',
33 | help=('Log file path, default=log.txt'
34 | ))
35 | parameters = parser.parse_args(args)
36 | return parameters
37 |
38 | def main(args):
39 | parameters = process_args(args)
40 | logging.basicConfig(
41 | level=logging.INFO,
42 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
43 | filename=parameters.log_path)
44 |
45 | console = logging.StreamHandler()
46 | console.setLevel(logging.INFO)
47 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
48 | console.setFormatter(formatter)
49 | logging.getLogger('').addHandler(console)
50 |
51 | logging.info('Script being executed: %s'%__file__)
52 |
53 | input_file = parameters.input_file
54 | output_file = parameters.output_file
55 |
56 | assert os.path.exists(input_file), input_file
57 | cmd = "perl -pe 's|hskip(.*?)(cm\\|in\\|pt\\|mm\\|em)|hspace{\\1\\2}|g' %s > %s"%(input_file, output_file)
58 | ret = subprocess.call(cmd, shell=True)
59 | if ret != 0:
60 | logging.error('FAILED: %s'%cmd)
61 |
62 | temp_file = output_file + '.tmp'
63 | with open(temp_file, 'w') as fout:
64 | with open(output_file) as fin:
65 | for line in fin:
66 | fout.write(line.replace('\r', ' ').strip() + '\n') # delete \r
67 |
68 | cmd = "cat %s | node scripts/preprocessing/preprocess_latex.js %s > %s "%(temp_file, parameters.mode, output_file)
69 | ret = subprocess.call(cmd, shell=True)
70 | os.remove(temp_file)
71 | if ret != 0:
72 | logging.error('FAILED: %s'%cmd)
73 | temp_file = output_file + '.tmp'
74 | shutil.move(output_file, temp_file)
75 | with open(temp_file) as fin:
76 | with open(output_file, 'w') as fout:
77 | for line in fin:
78 | tokens = line.strip().split()
79 | tokens_out = []
80 | for token in tokens:
81 | if is_ascii(token):
82 | tokens_out.append(token)
83 | fout.write(' '.join(tokens_out)+'\n')
84 | os.remove(temp_file)
85 |
86 | if __name__ == '__main__':
87 | main(sys.argv[1:])
88 | logging.info('Jobs finished')
89 |
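The perl one-liner rewrites raw `hskip<length><unit>` commands into `hspace{<length><unit>}`. A Python `re.sub` equivalent of that substitution (a sketch; the script itself shells out to perl):

```python
import re

# Sketch: Python equivalent of the perl substitution above.
def hskip_to_hspace(line):
    return re.sub(r'hskip(.*?)(cm|in|pt|mm|em)', r'hspace{\1\2}', line)

print(hskip_to_hspace(r'a \hskip 1.5cm b'))  # a \hspace{ 1.5cm} b
```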
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/preprocessing/preprocess_images.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Preprocess images for ease of training
3 | import sys, os, argparse, json, glob, logging
4 | import numpy as np
5 | from PIL import Image
6 | sys.path.insert(0, '%s'%os.path.join(os.path.dirname(__file__), '../utils/'))
7 | from image_utils import *
8 | from multiprocessing import Pool
9 | from multiprocessing.dummy import Pool as ThreadPool
10 |
11 | def process_args(args):
12 | parser = argparse.ArgumentParser(description='Process images for ease of training. Crop images to get rid of the background. For a cropped image of size (w,h), we pad it with PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, and the result is of size (w+PAD_LEFT+PAD_RIGHT, h+PAD_TOP+PAD_BOTTOM). Then we see which bucket it falls into and pad it with whitespace to match the smallest bucket that can hold it. Finally, we downsample the images.')
13 |
14 | parser.add_argument('--input-dir', dest='input_dir',
15 | type=str, required=True,
16 | help=('Input directory containing original images.'
17 | ))
18 | parser.add_argument('--output-dir', dest='output_dir',
19 | type=str, required=True,
20 | help=('Output directory to put processed images.'
21 | ))
22 | parser.add_argument('--num-threads', dest='num_threads',
23 | type=int, default=4,
24 | help=('Number of threads, default=4.'
25 | ))
26 | parser.add_argument('--crop-blank-default-size', dest='crop_blank_default_size',
27 | type=str, default='[600,60]',
28 | help=('If an image is blank, this is the size of the cropped image, should be a Json string. Default=(600,60).'
29 | ))
30 | parser.add_argument('--pad-size', dest='pad_size',
31 | type=str, default='[8,8,8,8]',
32 | help=('We pad the cropped image to the top, left, bottom, right with whitespace of size PAD_TOP, PAD_LEFT, PAD_BOTTOM, PAD_RIGHT, should be a Json string. Default=(8,8,8,8).'
33 | ))
34 | parser.add_argument('--buckets', dest='buckets',
35 | type=str, default='[[240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100], [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100], [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200], [1000, 400], [1200, 200], [1600, 200], [1600, 1600]]',
36 | help=('Bucket sizes used for grouping. Should be a Json string. Note that this denotes the bucket size after padding and before downsampling.'
37 | ))
38 | parser.add_argument('--downsample-ratio', dest='downsample_ratio',
39 | type=float, default=2.,
40 | help=('The ratio of downsampling, default=2.0.'
41 | ))
42 | parser.add_argument('--log-path', dest="log_path",
43 | type=str, default='log.txt',
44 | help=('Log file path, default=log.txt'
45 | ))
46 | parser.add_argument('--postfix', dest='postfix',
47 | type=str, default='.png',
48 | help=('The format of images, default=".png".'
49 | ))
50 | parameters = parser.parse_args(args)
51 | return parameters
52 |
53 | def main_parallel(l):
54 | filename, postfix, output_filename, crop_blank_default_size, pad_size, buckets, downsample_ratio = l
55 | postfix_length = len(postfix)
56 | status = crop_image(filename, output_filename, crop_blank_default_size)
57 | if not status:
58 | logging.info('%s is blank, crop a white image of default size!'%filename)
59 | status = pad_group_image(output_filename, output_filename, pad_size, buckets)
60 | if not status:
61 | logging.info('%s (after cropping and padding) is larger than the largest provided bucket size, left unchanged!'%filename)
62 | status = downsample_image(output_filename, output_filename, downsample_ratio)
63 |
64 | def main(args):
65 | parameters = process_args(args)
66 | logging.basicConfig(
67 | level=logging.INFO,
68 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
69 | filename=parameters.log_path)
70 |
71 | console = logging.StreamHandler()
72 | console.setLevel(logging.INFO)
73 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
74 | console.setFormatter(formatter)
75 | logging.getLogger('').addHandler(console)
76 |
77 | logging.info('Script being executed: %s'%__file__)
78 |
79 | output_dir = parameters.output_dir
80 | if not os.path.exists(output_dir):
81 | os.makedirs(output_dir)
82 |
83 | input_dir = parameters.input_dir
84 | postfix = parameters.postfix
85 | crop_blank_default_size = json.loads(parameters.crop_blank_default_size)
86 | pad_size = json.loads(parameters.pad_size)
87 | buckets = json.loads(parameters.buckets)
88 | downsample_ratio = parameters.downsample_ratio
89 |
90 | filenames = glob.glob(os.path.join(input_dir, '*'+postfix))
91 | logging.info('Creating pool with %d threads'%parameters.num_threads)
92 | pool = ThreadPool(parameters.num_threads)
93 | logging.info('Jobs running...')
94 | results = pool.map(main_parallel, [(filename, postfix, os.path.join(output_dir, os.path.basename(filename)), crop_blank_default_size, pad_size, buckets, downsample_ratio) for filename in filenames])
95 | pool.close()
96 | pool.join()
97 |
98 | if __name__ == '__main__':
99 | main(sys.argv[1:])
100 | logging.info('Jobs finished')
101 |
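Bucketing is first-fit: the padded image is assigned to the first bucket wide and tall enough to hold it (see `pad_group_image` in image_utils.py). A sketch of that selection with the default pad size and a few of the default buckets:

```python
# Sketch: first-fit bucket selection, as in pad_group_image (image_utils.py).
def pick_bucket(w, h, pad_size=(8, 8, 8, 8), buckets=((240, 100), (320, 80), (400, 100))):
    pad_top, pad_left, pad_bottom, pad_right = pad_size
    padded = (w + pad_left + pad_right, h + pad_top + pad_bottom)
    for bucket in buckets:
        if padded[0] <= bucket[0] and padded[1] <= bucket[1]:
            return bucket
    return None  # larger than every bucket: the image is left unpadded upstream

print(pick_bucket(300, 60))  # (320, 80): the padded size (316, 76) first fits there
```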
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/utils/image_utils.py:
--------------------------------------------------------------------------------
1 | import PIL
2 | from PIL import Image
3 | import numpy as np
4 |
5 | def crop_image(img, output_path, default_size=None):
6 | old_im = Image.open(img).convert('L')
7 | img_data = np.asarray(old_im, dtype=np.uint8) # height, width
8 | nnz_inds = np.where(img_data!=255)
9 | if len(nnz_inds[0]) == 0:
10 | if not default_size:
11 | old_im.save(output_path)
12 | return False
13 | else:
14 | assert len(default_size) == 2, default_size
15 | x_min,y_min,x_max,y_max = 0,0,default_size[0],default_size[1]
16 | old_im = old_im.crop((x_min, y_min, x_max+1, y_max+1))
17 | old_im.save(output_path)
18 | return False
19 | y_min = np.min(nnz_inds[0])
20 | y_max = np.max(nnz_inds[0])
21 | x_min = np.min(nnz_inds[1])
22 | x_max = np.max(nnz_inds[1])
23 | old_im = old_im.crop((x_min, y_min, x_max+1, y_max+1))
24 | old_im.save(output_path)
25 | return True
26 |
27 | def pad_group_image(img, output_path, pad_size, buckets):
28 | PAD_TOP, PAD_LEFT, PAD_BOTTOM, PAD_RIGHT = pad_size
29 | old_im = Image.open(img)
30 | old_size = (old_im.size[0]+PAD_LEFT+PAD_RIGHT, old_im.size[1]+PAD_TOP+PAD_BOTTOM)
31 | j = -1
32 | for i in range(len(buckets)):
33 | if old_size[0]<=buckets[i][0] and old_size[1]<=buckets[i][1]:
34 | j = i
35 | break
36 | if j < 0:
37 | new_size = old_size
38 | new_im = Image.new("RGB", new_size, (255,255,255))
39 | new_im.paste(old_im, (PAD_LEFT,PAD_TOP))
40 | new_im.save(output_path)
41 | return False
42 | new_size = buckets[j]
43 | new_im = Image.new("RGB", new_size, (255,255,255))
44 | new_im.paste(old_im, (PAD_LEFT,PAD_TOP))
45 | new_im.save(output_path)
46 | return True
47 |
48 | def downsample_image(img, output_path, ratio):
49 | assert ratio>=1, ratio
50 | if ratio == 1:
51 | return True
52 | old_im = Image.open(img)
53 | old_size = old_im.size
54 | new_size = (int(old_size[0]/ratio), int(old_size[1]/ratio))
55 |
56 | new_im = old_im.resize(new_size, PIL.Image.LANCZOS)
57 | new_im.save(output_path)
58 | return True
59 |
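`crop_image` computes the bounding box of all non-white pixels with `np.where` and hands PIL a `(left, upper, right, lower)` box, where right/lower are exclusive (hence the `+1`). The same bookkeeping on a toy array:

```python
import numpy as np

# Sketch: bounding box of the non-white pixels, as computed in crop_image.
img = np.full((5, 8), 255, dtype=np.uint8)  # an all-white 5x8 grayscale image
img[1:3, 2:6] = 0                           # a dark 2x4 patch

ys, xs = np.where(img != 255)
y_min, y_max, x_min, x_max = ys.min(), ys.max(), xs.min(), xs.max()
print((int(x_min), int(y_min), int(x_max) + 1, int(y_max) + 1))  # (2, 1, 6, 3): the PIL crop box
```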
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/scripts/utils/utils.py:
--------------------------------------------------------------------------------
1 | import subprocess, shlex
2 | from threading import Timer
3 |
4 | def run(cmd, timeout_sec):
5 | proc = subprocess.Popen(cmd, shell=True)
6 | kill_proc = lambda p: p.kill()
7 | timer = Timer(timeout_sec, kill_proc, [proc])
8 | try:
9 | timer.start()
10 | stdout,stderr = proc.communicate()
11 | finally:
12 | timer.cancel()
13 |
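`run` implements kill-on-timeout with a `threading.Timer` around `Popen`. On Python 3, the standard library offers the same behavior directly; a sketch (not a change to the vendored code):

```python
import subprocess

# Sketch: Python 3 equivalent of run(cmd, timeout_sec) via the stdlib timeout.
def run3(cmd, timeout_sec):
    try:
        subprocess.run(cmd, shell=True, timeout=timeout_sec)
    except subprocess.TimeoutExpired:
        pass  # like the Timer version, the process is killed and we move on

run3('sleep 2', 1)  # returns after ~1s instead of 2
```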
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/.#katex.js:
--------------------------------------------------------------------------------
1 | srush@beaker.12118:1471814512
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Khan Academy
4 |
5 | This software also uses portions of the underscore.js project, which is
6 | MIT licensed with the following copyright:
7 |
8 | Copyright (c) 2009-2015 Jeremy Ashkenas, DocumentCloud and Investigative
9 | Reporters & Editors
10 |
11 | Permission is hereby granted, free of charge, to any person obtaining a copy
12 | of this software and associated documentation files (the "Software"), to deal
13 | in the Software without restriction, including without limitation the rights
14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 | copies of the Software, and to permit persons to whom the Software is
16 | furnished to do so, subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be included in all
19 | copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 | SOFTWARE.
28 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/README.md:
--------------------------------------------------------------------------------
1 | # [KaTeX](https://khan.github.io/KaTeX/) [Build Status](https://travis-ci.org/Khan/KaTeX)
2 |
3 | [Join the chat at Gitter](https://gitter.im/Khan/KaTeX?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
4 |
5 | KaTeX is a fast, easy-to-use JavaScript library for TeX math rendering on the web.
6 |
7 | * **Fast:** KaTeX renders its math synchronously and doesn't need to reflow the page. See how it compares to a competitor in [this speed test](http://jsperf.com/katex-vs-mathjax/).
8 | * **Print quality:** KaTeX’s layout is based on Donald Knuth’s TeX, the gold standard for math typesetting.
9 | * **Self contained:** KaTeX has no dependencies and can easily be bundled with your website resources.
10 | * **Server side rendering:** KaTeX produces the same output regardless of browser or environment, so you can pre-render expressions using Node.js and send them as plain HTML.
11 |
12 | KaTeX supports all major browsers, including Chrome, Safari, Firefox, Opera, and IE 8 - IE 11. A list of supported commands can be found on the [wiki](https://github.com/Khan/KaTeX/wiki/Function-Support-in-KaTeX).
13 |
14 | ## Usage
15 |
16 | You can [download KaTeX](https://github.com/khan/katex/releases) and host it on your server or include the `katex.min.js` and `katex.min.css` files on your page directly from a CDN:
17 |
18 | ```html
19 | <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.6.0/katex.min.css">
20 | <script src="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.6.0/katex.min.js"></script>
21 | ```
22 |
23 | #### In-browser rendering
24 |
25 | Call `katex.render` with a TeX expression and a DOM element to render into:
26 |
27 | ```js
28 | katex.render("c = \\pm\\sqrt{a^2 + b^2}", element);
29 | ```
30 |
31 | If KaTeX can't parse the expression, it throws a `katex.ParseError` error.
32 |
33 | #### Server side rendering or rendering to a string
34 |
35 | To generate HTML on the server or to generate an HTML string of the rendered math, you can use `katex.renderToString`:
36 |
37 | ```js
38 | var html = katex.renderToString("c = \\pm\\sqrt{a^2 + b^2}");
39 | // '...'
40 | ```
41 |
42 | Make sure to include the CSS and font files, but there is no need to include the JavaScript. Like `render`, `renderToString` throws if it can't parse the expression.
43 |
44 | #### Rendering options
45 |
46 | You can provide an object of options as the last argument to `katex.render` and `katex.renderToString`. Available options are:
47 |
48 | - `displayMode`: `boolean`. If `true` the math will be rendered in display mode, which will put the math in display style (so `\int` and `\sum` are large, for example), and will center the math on the page on its own line. If `false` the math will be rendered in inline mode. (default: `false`)
49 | - `throwOnError`: `boolean`. If `true`, KaTeX will throw a `ParseError` when it encounters an unsupported command. If `false`, KaTeX will render the unsupported command as text in the color given by `errorColor`. (default: `true`)
50 | - `errorColor`: `string`. A color string given in the format `"#XXX"` or `"#XXXXXX"`. This option determines the color which unsupported commands are rendered in. (default: `#cc0000`)
51 |
52 | For example:
53 |
54 | ```js
55 | katex.render("c = \\pm\\sqrt{a^2 + b^2}", element, { displayMode: true });
56 | ```
57 |
58 | #### Automatic rendering of math on a page
59 |
60 | Math on the page can be automatically rendered using the auto-render extension. See [the Auto-render README](contrib/auto-render/README.md) for more information.
61 |
62 | ## Contributing
63 |
64 | See [CONTRIBUTING.md](CONTRIBUTING.md)
65 |
66 | ## License
67 |
68 | KaTeX is licensed under the [MIT License](http://opensource.org/licenses/MIT).
69 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/cli.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | // Simple CLI for KaTeX.
3 | // Reads TeX from stdin, outputs HTML to stdout.
4 | /* eslint no-console:0 */
5 |
6 | var katex = require("./");
7 | var input = "";
8 |
9 | // Skip the first two args, which are just "node" and "cli.js"
10 | var args = process.argv.slice(2);
11 |
12 | if (args.indexOf("--help") !== -1) {
13 | console.log(process.argv[0] + " " + process.argv[1] +
14 | " [ --help ]" +
15 | " [ --display-mode ]");
16 |
17 | console.log("\n" +
18 | "Options:");
19 | console.log(" --help Display this help message");
20 | console.log(" --display-mode Render in display mode (not inline mode)");
21 | process.exit();
22 | }
23 |
24 | process.stdin.on("data", function(chunk) {
25 | input += chunk.toString();
26 | });
27 |
28 | process.stdin.on("end", function() {
29 | var options = { displayMode: args.indexOf("--display-mode") !== -1 };
30 | var output = katex.renderToString(input, options);
31 | console.log(output);
32 | });
33 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/katex.js:
--------------------------------------------------------------------------------
1 | /* eslint no-console:0 */
2 | /**
3 | * This is the main entry point for KaTeX. Here, we expose functions for
4 | * rendering expressions either to DOM nodes or to markup strings.
5 | *
6 | * We also expose the ParseError class to check if errors thrown from KaTeX are
7 | * errors in the expression, or errors in javascript handling.
8 | */
9 |
10 | var ParseError = require("./src/ParseError");
11 | var Settings = require("./src/Settings");
12 |
13 | var buildTree = require("./src/buildTree");
14 | var parseTree = require("./src/parseTree");
15 | var utils = require("./src/utils");
16 |
17 | /**
18 | * Parse and build an expression, and place that expression in the DOM node
19 | * given.
20 | */
21 | var render = function(expression, baseNode, options) {
22 | utils.clearNode(baseNode);
23 |
24 | var settings = new Settings(options);
25 |
26 | var tree = parseTree(expression, settings);
27 | var node = buildTree(tree, expression, settings).toNode();
28 |
29 | baseNode.appendChild(node);
30 | };
31 |
32 | // KaTeX's styles don't work properly in quirks mode. Print out an error, and
33 | // disable rendering.
34 | if (typeof document !== "undefined") {
35 | if (document.compatMode !== "CSS1Compat") {
36 | typeof console !== "undefined" && console.warn(
37 | "Warning: KaTeX doesn't work in quirks mode. Make sure your " +
38 | "website has a suitable doctype.");
39 |
40 | render = function() {
41 | throw new ParseError("KaTeX doesn't work in quirks mode.");
42 | };
43 | }
44 | }
45 |
46 | /**
47 | * Parse and build an expression, and return the markup for that.
48 | */
49 | var renderToString = function(expression, options) {
50 | var settings = new Settings(options);
51 |
52 | var tree = parseTree(expression, settings);
53 | return buildTree(tree, expression, settings).toMarkup();
54 | };
55 |
56 | /**
57 | * Parse an expression and return the parse tree.
58 | */
59 | var generateParseTree = function(expression, options) {
60 | var settings = new Settings(options);
61 | return parseTree(expression, settings);
62 | };
63 |
64 | module.exports = {
65 | render: render,
66 | renderToString: renderToString,
67 | /**
68 | * NOTE: This method is not currently recommended for public use.
69 | * The internal tree representation is unstable and is very likely
70 | * to change. Use at your own risk.
71 | */
72 | __parse: generateParseTree,
73 | ParseError: ParseError,
74 | };
75 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "_args": [
3 | [
4 | "katex",
5 | "/home/srush/Projects/im2latex"
6 | ]
7 | ],
8 | "_from": "katex@latest",
9 | "_id": "katex@0.6.0",
10 | "_inCache": true,
11 | "_installable": true,
12 | "_location": "/katex",
13 | "_nodeVersion": "4.2.1",
14 | "_npmOperationalInternal": {
15 | "host": "packages-12-west.internal.npmjs.com",
16 | "tmp": "tmp/katex-0.6.0.tgz_1460769444991_0.38667152682319283"
17 | },
18 | "_npmUser": {
19 | "email": "kevinb7@gmail.com",
20 | "name": "kevinbarabash"
21 | },
22 | "_npmVersion": "2.15.2",
23 | "_phantomChildren": {},
24 | "_requested": {
25 | "name": "katex",
26 | "raw": "katex",
27 | "rawSpec": "",
28 | "scope": null,
29 | "spec": "latest",
30 | "type": "tag"
31 | },
32 | "_requiredBy": [
33 | "#USER"
34 | ],
35 | "_resolved": "https://registry.npmjs.org/katex/-/katex-0.6.0.tgz",
36 | "_shasum": "12418e09121c05c92041b6b3b9fb6bab213cb6f3",
37 | "_shrinkwrap": null,
38 | "_spec": "katex",
39 | "_where": "/home/srush/Projects/im2latex",
40 | "bin": {
41 | "katex": "cli.js"
42 | },
43 | "bugs": {
44 | "url": "https://github.com/Khan/KaTeX/issues"
45 | },
46 | "dependencies": {
47 | "match-at": "^0.1.0"
48 | },
49 | "description": "Fast math typesetting for the web.",
50 | "devDependencies": {
51 | "browserify": "^10.2.4",
52 | "clean-css": "~2.2.15",
53 | "eslint": "^1.10.2",
54 | "express": "~3.3.3",
55 | "glob": "^5.0.15",
56 | "jasmine": "^2.3.2",
57 | "jasmine-core": "^2.3.4",
58 | "js-yaml": "^3.3.1",
59 | "jspngopt": "^0.1.0",
60 | "less": "~1.7.5",
61 | "nomnom": "^1.8.1",
62 | "pako": "0.2.7",
63 | "selenium-webdriver": "^2.46.1",
64 | "uglify-js": "~2.4.15"
65 | },
66 | "directories": {},
67 | "dist": {
68 | "shasum": "12418e09121c05c92041b6b3b9fb6bab213cb6f3",
69 | "tarball": "https://registry.npmjs.org/katex/-/katex-0.6.0.tgz"
70 | },
71 | "files": [
72 | "cli.js",
73 | "dist/",
74 | "katex.js",
75 | "src/"
76 | ],
77 | "gitHead": "b94fc6534d5c23f944906a52a592bee4e0090665",
78 | "homepage": "https://github.com/Khan/KaTeX#readme",
79 | "license": "MIT",
80 | "main": "katex.js",
81 | "maintainers": [
82 | {
83 | "name": "kevinbarabash",
84 | "email": "kevinb7@gmail.com"
85 | },
86 | {
87 | "name": "spicyj",
88 | "email": "ben@benalpert.com"
89 | },
90 | {
91 | "name": "xymostech",
92 | "email": "xymostech@gmail.com"
93 | }
94 | ],
95 | "name": "katex",
96 | "optionalDependencies": {},
97 | "readme": "ERROR: No README data found!",
98 | "repository": {
99 | "type": "git",
100 | "url": "git://github.com/Khan/KaTeX.git"
101 | },
102 | "scripts": {
103 | "prepublish": "make dist",
104 | "start": "node server.js",
105 | "test": "make lint test"
106 | },
107 | "version": "0.6.0"
108 | }
109 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/Lexer.js:
--------------------------------------------------------------------------------
1 | /**
2 | * The Lexer class handles tokenizing the input in various ways. Since our
3 | * parser expects us to be able to backtrack, the lexer allows lexing from any
4 | * given starting point.
5 | *
6 | * Its main exposed function is the `lex` function, which takes a position to
7 | * lex from and a type of token to lex. It defers to the appropriate `_innerLex`
8 | * function.
9 | *
10 | * The various `_innerLex` functions perform the actual lexing of different
11 | * kinds.
12 | */
13 |
14 | var matchAt = require("../../match-at");
15 |
16 | var ParseError = require("./ParseError");
17 |
18 | // The main lexer class
19 | function Lexer(input) {
20 | this._input = input;
21 | }
22 |
23 | // The resulting token returned from `lex`.
24 | function Token(text, data, position) {
25 | this.text = text;
26 | this.data = data;
27 | this.position = position;
28 | }
29 |
30 | /* The following tokenRegex
31 | * - matches typical whitespace (but not NBSP etc.) using its first group
32 | * - matches symbol combinations which result in a single output character
33 | * - does not match any control character \x00-\x1f except whitespace
34 | * - does not match a bare backslash
35 | * - matches any ASCII character except those just mentioned
36 | * - does not match the BMP private use area \uE000-\uF8FF
37 | * - does not match bare surrogate code units
38 | * - matches any BMP character except for those just described
39 | * - matches any valid Unicode surrogate pair
40 | * - matches a backslash followed by one or more letters
41 | * - matches a backslash followed by any BMP character, including newline
42 | * Just because the Lexer matches something doesn't mean it's valid input:
43 | * If there is no matching function or symbol definition, the Parser will
44 | * still reject the input.
45 | */
46 | var tokenRegex = new RegExp(
47 | "([ \r\n\t]+)|(" + // whitespace
48 | "---?" + // special combinations
49 | "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
50 | "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair
51 | "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" + // function name
52 | ")"
53 | );
54 |
55 | var whitespaceRegex = /\s*/;
56 |
57 | /**
58 | * This function lexes a single normal token. It takes a position and
59 | * whether it should completely ignore whitespace or not.
60 | */
61 | Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
62 | var input = this._input;
63 | if (pos === input.length) {
64 | return new Token("EOF", null, pos);
65 | }
66 | var match = matchAt(tokenRegex, input, pos);
67 | if (match === null) {
68 | throw new ParseError(
69 | "Unexpected character: '" + input[pos] + "'",
70 | this, pos);
71 | } else if (match[2]) { // matched non-whitespace
72 | return new Token(match[2], null, pos + match[2].length);
73 | } else if (ignoreWhitespace) {
74 | return this._innerLex(pos + match[1].length, true);
75 | } else { // concatenate whitespace to a single space
76 | return new Token(" ", null, pos + match[1].length);
77 | }
78 | };
79 |
80 | // A regex to match a CSS color (like #ffffff or BlueViolet)
81 | var cssColor = /#[a-z0-9]+|[a-z]+/i;
82 |
83 | /**
84 | * This function lexes a CSS color.
85 | */
86 | Lexer.prototype._innerLexColor = function(pos) {
87 | var input = this._input;
88 |
89 | // Ignore whitespace
90 | var whitespace = matchAt(whitespaceRegex, input, pos)[0];
91 | pos += whitespace.length;
92 |
93 | var match;
94 | if ((match = matchAt(cssColor, input, pos))) {
95 | // If we look like a color, return a color
96 | return new Token(match[0], null, pos + match[0].length);
97 | } else {
98 | throw new ParseError("Invalid color", this, pos);
99 | }
100 | };
101 |
102 | // A regex to match a dimension. Dimensions look like
103 | // "1.2em" or ".4pt" or "1 ex"
104 | var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
105 |
106 | /**
107 | * This function lexes a dimension.
108 | */
109 | Lexer.prototype._innerLexSize = function(pos) {
110 | var input = this._input;
111 |
112 | // Ignore whitespace
113 | var whitespace = matchAt(whitespaceRegex, input, pos)[0];
114 | pos += whitespace.length;
115 |
116 | var match;
117 | if ((match = matchAt(sizeRegex, input, pos))) {
118 | var unit = match[3];
119 | // We only currently handle "em" and "ex" units
120 | // if (unit !== "em" && unit !== "ex") {
121 | // throw new ParseError("Invalid unit: '" + unit + "'", this, pos);
122 | // }
123 | return new Token(match[0], {
124 | number: +(match[1] + match[2]),
125 | unit: unit,
126 | }, pos + match[0].length);
127 | }
128 |
129 | throw new ParseError("Invalid size", this, pos);
130 | };
131 |
132 | /**
133 | * This function lexes a string of whitespace.
134 | */
135 | Lexer.prototype._innerLexWhitespace = function(pos) {
136 | var input = this._input;
137 |
138 | var whitespace = matchAt(whitespaceRegex, input, pos)[0];
139 | pos += whitespace.length;
140 |
141 | return new Token(whitespace[0], null, pos);
142 | };
143 |
144 | /**
145 | * This function lexes a single token starting at `pos` and of the given mode.
146 | * Based on the mode, we defer to one of the `_innerLex` functions.
147 | */
148 | Lexer.prototype.lex = function(pos, mode) {
149 | if (mode === "math") {
150 | return this._innerLex(pos, true);
151 | } else if (mode === "text") {
152 | return this._innerLex(pos, false);
153 | } else if (mode === "color") {
154 | return this._innerLexColor(pos);
155 | } else if (mode === "size") {
156 | return this._innerLexSize(pos);
157 | } else if (mode === "whitespace") {
158 | return this._innerLexWhitespace(pos);
159 | }
160 | };
161 |
162 | module.exports = Lexer;
163 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/Options.js:
--------------------------------------------------------------------------------
1 | /**
2 | * This file contains information about the options that the Parser carries
3 | * around with it while parsing. Data is held in an `Options` object, and when
4 | * recursing, a new `Options` object can be created with the `.with*` and
5 | * `.reset` functions.
6 | */
7 |
8 | /**
9 | * This is the main options class. It contains the style, size, color, and font
10 | * of the current parse level. It also contains the style and size of the parent
11 | * parse level, so size changes can be handled efficiently.
12 | *
13 | * Each of the `.with*` and `.reset` functions passes its current style and size
14 | * as the parentStyle and parentSize of the new options class, so parent
15 | * handling is taken care of automatically.
16 | */
17 | function Options(data) {
18 | this.style = data.style;
19 | this.color = data.color;
20 | this.size = data.size;
21 | this.phantom = data.phantom;
22 | this.font = data.font;
23 |
24 | if (data.parentStyle === undefined) {
25 | this.parentStyle = data.style;
26 | } else {
27 | this.parentStyle = data.parentStyle;
28 | }
29 |
30 | if (data.parentSize === undefined) {
31 | this.parentSize = data.size;
32 | } else {
33 | this.parentSize = data.parentSize;
34 | }
35 | }
36 |
37 | /**
38 | * Returns a new options object with the same properties as "this". Properties
39 | * from "extension" will be copied to the new options object.
40 | */
41 | Options.prototype.extend = function(extension) {
42 | var data = {
43 | style: this.style,
44 | size: this.size,
45 | color: this.color,
46 | parentStyle: this.style,
47 | parentSize: this.size,
48 | phantom: this.phantom,
49 | font: this.font,
50 | };
51 |
52 | for (var key in extension) {
53 | if (extension.hasOwnProperty(key)) {
54 | data[key] = extension[key];
55 | }
56 | }
57 |
58 | return new Options(data);
59 | };
60 |
61 | /**
62 | * Create a new options object with the given style.
63 | */
64 | Options.prototype.withStyle = function(style) {
65 | return this.extend({
66 | style: style,
67 | });
68 | };
69 |
70 | /**
71 | * Create a new options object with the given size.
72 | */
73 | Options.prototype.withSize = function(size) {
74 | return this.extend({
75 | size: size,
76 | });
77 | };
78 |
79 | /**
80 | * Create a new options object with the given color.
81 | */
82 | Options.prototype.withColor = function(color) {
83 | return this.extend({
84 | color: color,
85 | });
86 | };
87 |
88 | /**
89 | * Create a new options object with "phantom" set to true.
90 | */
91 | Options.prototype.withPhantom = function() {
92 | return this.extend({
93 | phantom: true,
94 | });
95 | };
96 |
97 | /**
98 | * Create a new options object with the given font.
99 | */
100 | Options.prototype.withFont = function(font) {
101 | return this.extend({
102 | font: font,
103 | });
104 | };
105 |
106 | /**
107 | * Create a new options object with the same style, size, and color. This is
108 | * used so that parent style and size changes are handled correctly.
109 | */
110 | Options.prototype.reset = function() {
111 | return this.extend({});
112 | };
113 |
114 | /**
115 | * A map of color names to CSS colors.
116 | * TODO(emily): Remove this when we have real macros
117 | */
118 | var colorMap = {
119 | "katex-blue": "#6495ed",
120 | "katex-orange": "#ffa500",
121 | "katex-pink": "#ff00af",
122 | "katex-red": "#df0030",
123 | "katex-green": "#28ae7b",
124 | "katex-gray": "gray",
125 | "katex-purple": "#9d38bd",
126 | "katex-blueA": "#c7e9f1",
127 | "katex-blueB": "#9cdceb",
128 | "katex-blueC": "#58c4dd",
129 | "katex-blueD": "#29abca",
130 | "katex-blueE": "#1c758a",
131 | "katex-tealA": "#acead7",
132 | "katex-tealB": "#76ddc0",
133 | "katex-tealC": "#5cd0b3",
134 | "katex-tealD": "#55c1a7",
135 | "katex-tealE": "#49a88f",
136 | "katex-greenA": "#c9e2ae",
137 | "katex-greenB": "#a6cf8c",
138 | "katex-greenC": "#83c167",
139 | "katex-greenD": "#77b05d",
140 | "katex-greenE": "#699c52",
141 | "katex-goldA": "#f7c797",
142 | "katex-goldB": "#f9b775",
143 | "katex-goldC": "#f0ac5f",
144 | "katex-goldD": "#e1a158",
145 | "katex-goldE": "#c78d46",
146 | "katex-redA": "#f7a1a3",
147 | "katex-redB": "#ff8080",
148 | "katex-redC": "#fc6255",
149 | "katex-redD": "#e65a4c",
150 | "katex-redE": "#cf5044",
151 | "katex-maroonA": "#ecabc1",
152 | "katex-maroonB": "#ec92ab",
153 | "katex-maroonC": "#c55f73",
154 | "katex-maroonD": "#a24d61",
155 | "katex-maroonE": "#94424f",
156 | "katex-purpleA": "#caa3e8",
157 | "katex-purpleB": "#b189c6",
158 | "katex-purpleC": "#9a72ac",
159 | "katex-purpleD": "#715582",
160 | "katex-purpleE": "#644172",
161 | "katex-mintA": "#f5f9e8",
162 | "katex-mintB": "#edf2df",
163 | "katex-mintC": "#e0e5cc",
164 | "katex-grayA": "#fdfdfd",
165 | "katex-grayB": "#f7f7f7",
166 | "katex-grayC": "#eeeeee",
167 | "katex-grayD": "#dddddd",
168 | "katex-grayE": "#cccccc",
169 | "katex-grayF": "#aaaaaa",
170 | "katex-grayG": "#999999",
171 | "katex-grayH": "#555555",
172 | "katex-grayI": "#333333",
173 | "katex-kaBlue": "#314453",
174 | "katex-kaGreen": "#639b24",
175 | };
176 |
177 | /**
178 | * Gets the CSS color of the current options object, accounting for the
179 | * `colorMap`.
180 | */
181 | Options.prototype.getColor = function() {
182 | if (this.phantom) {
183 | return "transparent";
184 | } else {
185 | return colorMap[this.color] || this.color;
186 | }
187 | };
188 |
189 | module.exports = Options;
190 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/ParseError.js:
--------------------------------------------------------------------------------
1 | /**
2 | * This is the ParseError class, which is the main error thrown by KaTeX
3 | * functions when something has gone wrong. This is used to distinguish internal
4 | * errors from errors in the expression that the user provided.
5 | */
6 | function ParseError(message, lexer, position) {
7 | var error = "KaTeX parse error: " + message;
8 |
9 | if (lexer !== undefined && position !== undefined) {
10 | // If we have the input and a position, make the error a bit fancier
11 |
12 | // Prepend some information
13 | error += " at position " + position + ": ";
14 |
15 | // Get the input
16 | var input = lexer._input;
17 | // Insert a combining underscore at the correct position
18 | input = input.slice(0, position) + "\u0332" +
19 | input.slice(position);
20 |
21 | // Extract some context from the input and add it to the error
22 | var begin = Math.max(0, position - 15);
23 | var end = position + 15;
24 | error += input.slice(begin, end);
25 | }
26 |
27 | // Some hackery to make ParseError a prototype of Error
28 | // See http://stackoverflow.com/a/8460753
29 | var self = new Error(error);
30 | self.name = "ParseError";
31 | self.__proto__ = ParseError.prototype;
32 |
33 | self.position = position;
34 | return self;
35 | }
36 |
37 | // More hackery
38 | ParseError.prototype.__proto__ = Error.prototype;
39 |
40 | module.exports = ParseError;
41 |
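A hedged usage sketch: ParseError reads only `_input` from the lexer argument, so a plain object can stand in for a real Lexer here.

    var ParseError = require("./ParseError");

    var fakeLexer = { _input: "\\frac{1}{" };  // stand-in: only _input is read

    try {
        throw new ParseError("Expected group after '\\frac'", fakeLexer, 9);
    } catch (e) {
        console.log(e instanceof Error);  // true, via the prototype hackery above
        console.log(e.name);              // "ParseError"
        console.log(e.position);          // 9
        console.log(e.message);           // "KaTeX parse error: ... at position 9: ..."
    }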
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/Settings.js:
--------------------------------------------------------------------------------
1 | /**
2 | * This is a module for storing settings passed into KaTeX. It correctly handles
3 | * default settings.
4 | */
5 |
6 | /**
7 | * Helper function for getting a default value if the value is undefined
8 | */
9 | function get(option, defaultValue) {
10 | return option === undefined ? defaultValue : option;
11 | }
12 |
13 | /**
14 | * The main Settings object
15 | *
 16 |  * The current options stored are:
 17 |  *  - displayMode: whether to typeset in displaystyle rather than textstyle (default false)
 18 |  *  - throwOnError: throw a ParseError on bad input (default true); errorColor: color used for error output (default "#cc0000")
19 | */
20 | function Settings(options) {
21 | // allow null options
22 | options = options || {};
23 | this.displayMode = get(options.displayMode, false);
24 | this.throwOnError = get(options.throwOnError, true);
25 | this.errorColor = get(options.errorColor, "#cc0000");
26 | }
27 |
28 | module.exports = Settings;
29 |
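For illustration, a short sketch of the defaulting behavior:

    var Settings = require("./Settings");

    var defaults = new Settings();       // null/undefined options are allowed
    console.log(defaults.displayMode);   // false
    console.log(defaults.throwOnError);  // true
    console.log(defaults.errorColor);    // "#cc0000"

    var custom = new Settings({ displayMode: true });
    console.log(custom.displayMode);     // true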
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/Style.js:
--------------------------------------------------------------------------------
1 | /**
2 | * This file contains information and classes for the various kinds of styles
3 | * used in TeX. It provides a generic `Style` class, which holds information
4 | * about a specific style. It then provides instances of all the different kinds
5 | * of styles possible, and provides functions to move between them and get
6 | * information about them.
7 | */
8 |
9 | /**
10 | * The main style class. Contains a unique id for the style, a size (which is
11 | * the same for cramped and uncramped version of a style), a cramped flag, and a
12 | * size multiplier, which gives the size difference between a style and
13 | * textstyle.
14 | */
15 | function Style(id, size, multiplier, cramped) {
16 | this.id = id;
17 | this.size = size;
18 | this.cramped = cramped;
19 | this.sizeMultiplier = multiplier;
20 | }
21 |
22 | /**
23 | * Get the style of a superscript given a base in the current style.
24 | */
25 | Style.prototype.sup = function() {
26 | return styles[sup[this.id]];
27 | };
28 |
29 | /**
30 | * Get the style of a subscript given a base in the current style.
31 | */
32 | Style.prototype.sub = function() {
33 | return styles[sub[this.id]];
34 | };
35 |
36 | /**
37 | * Get the style of a fraction numerator given the fraction in the current
38 | * style.
39 | */
40 | Style.prototype.fracNum = function() {
41 | return styles[fracNum[this.id]];
42 | };
43 |
44 | /**
45 | * Get the style of a fraction denominator given the fraction in the current
46 | * style.
47 | */
48 | Style.prototype.fracDen = function() {
49 | return styles[fracDen[this.id]];
50 | };
51 |
52 | /**
53 | * Get the cramped version of a style (in particular, cramping a cramped style
54 | * doesn't change the style).
55 | */
56 | Style.prototype.cramp = function() {
57 | return styles[cramp[this.id]];
58 | };
59 |
60 | /**
 61 |  * HTML class name, like "displaystyle textstyle cramped"
62 | */
63 | Style.prototype.cls = function() {
64 | return sizeNames[this.size] + (this.cramped ? " cramped" : " uncramped");
65 | };
66 |
67 | /**
68 | * HTML Reset class name, like "reset-textstyle"
69 | */
70 | Style.prototype.reset = function() {
71 | return resetNames[this.size];
72 | };
73 |
74 | // IDs of the different styles
75 | var D = 0;
76 | var Dc = 1;
77 | var T = 2;
78 | var Tc = 3;
79 | var S = 4;
80 | var Sc = 5;
81 | var SS = 6;
82 | var SSc = 7;
83 |
84 | // String names for the different sizes
85 | var sizeNames = [
86 | "displaystyle textstyle",
87 | "textstyle",
88 | "scriptstyle",
89 | "scriptscriptstyle",
90 | ];
91 |
92 | // Reset names for the different sizes
93 | var resetNames = [
94 | "reset-textstyle",
95 | "reset-textstyle",
96 | "reset-scriptstyle",
97 | "reset-scriptscriptstyle",
98 | ];
99 |
100 | // Instances of the different styles
101 | var styles = [
102 | new Style(D, 0, 1.0, false),
103 | new Style(Dc, 0, 1.0, true),
104 | new Style(T, 1, 1.0, false),
105 | new Style(Tc, 1, 1.0, true),
106 | new Style(S, 2, 0.7, false),
107 | new Style(Sc, 2, 0.7, true),
108 | new Style(SS, 3, 0.5, false),
109 | new Style(SSc, 3, 0.5, true),
110 | ];
111 |
112 | // Lookup tables for switching from one style to another
113 | var sup = [S, Sc, S, Sc, SS, SSc, SS, SSc];
114 | var sub = [Sc, Sc, Sc, Sc, SSc, SSc, SSc, SSc];
115 | var fracNum = [T, Tc, S, Sc, SS, SSc, SS, SSc];
116 | var fracDen = [Tc, Tc, Sc, Sc, SSc, SSc, SSc, SSc];
117 | var cramp = [Dc, Dc, Tc, Tc, Sc, Sc, SSc, SSc];
118 |
119 | // We only export some of the styles. Also, we don't export the `Style` class so
120 | // no more styles can be generated.
121 | module.exports = {
122 | DISPLAY: styles[D],
123 | TEXT: styles[T],
124 | SCRIPT: styles[S],
125 | SCRIPTSCRIPT: styles[SS],
126 | };
127 |
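A small sketch of the style transitions encoded in the lookup tables above:

    var Style = require("./Style");

    var d = Style.DISPLAY;
    console.log(d.cls());            // "displaystyle textstyle uncramped"
    console.log(d.sup().cls());      // "scriptstyle uncramped"
    console.log(d.sub().cramped);    // true: subscripts are always cramped
    console.log(d.fracNum().cls());  // "textstyle uncramped"
    console.log(d.cramp().reset());  // "reset-textstyle"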
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/buildTree.js:
--------------------------------------------------------------------------------
1 | var buildHTML = require("./buildHTML");
2 | var buildMathML = require("./buildMathML");
3 | var buildCommon = require("./buildCommon");
4 | var Options = require("./Options");
5 | var Settings = require("./Settings");
6 | var Style = require("./Style");
7 |
8 | var makeSpan = buildCommon.makeSpan;
9 |
10 | var buildTree = function(tree, expression, settings) {
11 | settings = settings || new Settings({});
12 |
13 | var startStyle = Style.TEXT;
14 | if (settings.displayMode) {
15 | startStyle = Style.DISPLAY;
16 | }
17 |
18 | // Setup the default options
19 | var options = new Options({
20 | style: startStyle,
21 | size: "size5",
22 | });
23 |
24 | // `buildHTML` sometimes messes with the parse tree (like turning bins ->
25 | // ords), so we build the MathML version first.
26 | var mathMLNode = buildMathML(tree, expression, options);
27 | var htmlNode = buildHTML(tree, options);
28 |
29 | var katexNode = makeSpan(["katex"], [
30 | mathMLNode, htmlNode,
31 | ]);
32 |
33 | if (settings.displayMode) {
34 | return makeSpan(["katex-display"], [katexNode]);
35 | } else {
36 | return katexNode;
37 | }
38 | };
39 |
40 | module.exports = buildTree;
41 |
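A sketch of the pipeline these modules form, mirroring what the top-level katex.js entry point does; the exact call sequence here is an assumption based on the requires above and on parseTree.js, which appears later in this listing.

    var parseTree = require("./parseTree");
    var buildTree = require("./buildTree");
    var Settings = require("./Settings");

    var expr = "x^2";
    var settings = new Settings({ displayMode: true });

    var tree = parseTree(expr, settings);        // TeX string -> parse tree
    var node = buildTree(tree, expr, settings);  // parse tree -> "katex-display" span

    // toMarkup() needs no browser DOM, unlike toNode() (see domTree.js).
    console.log(node.toMarkup());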
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/domTree.js:
--------------------------------------------------------------------------------
1 | /**
2 | * These objects store the data about the DOM nodes we create, as well as some
3 | * extra data. They can then be transformed into real DOM nodes with the
4 | * `toNode` function or HTML markup using `toMarkup`. They are useful for both
5 | * storing extra properties on the nodes, as well as providing a way to easily
6 | * work with the DOM.
7 | *
8 | * Similar functions for working with MathML nodes exist in mathMLTree.js.
9 | */
10 |
11 | var utils = require("./utils");
12 |
13 | /**
14 | * Create an HTML className based on a list of classes. In addition to joining
15 | * with spaces, we also remove null or empty classes.
16 | */
17 | var createClass = function(classes) {
18 | classes = classes.slice();
19 | for (var i = classes.length - 1; i >= 0; i--) {
20 | if (!classes[i]) {
21 | classes.splice(i, 1);
22 | }
23 | }
24 |
25 | return classes.join(" ");
26 | };
27 |
28 | /**
29 | * This node represents a span node, with a className, a list of children, and
30 | * an inline style. It also contains information about its height, depth, and
31 | * maxFontSize.
32 | */
33 | function span(classes, children, height, depth, maxFontSize, style) {
34 | this.classes = classes || [];
35 | this.children = children || [];
36 | this.height = height || 0;
37 | this.depth = depth || 0;
38 | this.maxFontSize = maxFontSize || 0;
39 | this.style = style || {};
40 | this.attributes = {};
41 | }
42 |
43 | /**
44 | * Sets an arbitrary attribute on the span. Warning: use this wisely. Not all
45 | * browsers support attributes the same, and having too many custom attributes
46 | * is probably bad.
47 | */
48 | span.prototype.setAttribute = function(attribute, value) {
49 | this.attributes[attribute] = value;
50 | };
51 |
52 | /**
53 | * Convert the span into an HTML node
54 | */
55 | span.prototype.toNode = function() {
56 | var span = document.createElement("span");
57 |
58 | // Apply the class
59 | span.className = createClass(this.classes);
60 |
61 | // Apply inline styles
62 | for (var style in this.style) {
63 | if (Object.prototype.hasOwnProperty.call(this.style, style)) {
64 | span.style[style] = this.style[style];
65 | }
66 | }
67 |
68 | // Apply attributes
69 | for (var attr in this.attributes) {
70 | if (Object.prototype.hasOwnProperty.call(this.attributes, attr)) {
71 | span.setAttribute(attr, this.attributes[attr]);
72 | }
73 | }
74 |
75 | // Append the children, also as HTML nodes
76 | for (var i = 0; i < this.children.length; i++) {
77 | span.appendChild(this.children[i].toNode());
78 | }
79 |
80 | return span;
81 | };
82 |
83 | /**
84 | * Convert the span into an HTML markup string
85 | */
86 | span.prototype.toMarkup = function() {
 87 |     var markup = "<span";
 88 |
 89 |     // Add the class
 90 |     if (this.classes.length) {
 91 |         markup += " class=\"";
 92 |         markup += utils.escape(createClass(this.classes));
 93 |         markup += "\"";
 94 |     }
 95 |
 96 |     var styles = "";
 97 |
 98 |     // Add the styles, after hyphenation
 99 |     for (var style in this.style) {
100 |         if (Object.prototype.hasOwnProperty.call(this.style, style)) {
101 |             styles += utils.hyphenate(style) + ":" + this.style[style] + ";";
102 |         }
103 |     }
104 |
105 |     if (styles) {
106 |         markup += " style=\"" + utils.escape(styles) + "\"";
107 |     }
108 |
109 |     // Add the attributes
110 |     for (var attr in this.attributes) {
111 |         if (Object.prototype.hasOwnProperty.call(this.attributes, attr)) {
112 |             markup += " " + attr + "=\"";
113 |             markup += utils.escape(this.attributes[attr]);
114 |             markup += "\"";
115 |         }
116 |     }
117 |
118 |     markup += ">";
119 |
120 | // Add the markup of the children, also as markup
121 | for (var i = 0; i < this.children.length; i++) {
122 | markup += this.children[i].toMarkup();
123 | }
124 |
125 |     markup += "</span>";
126 |
127 | return markup;
128 | };
129 |
130 | /**
131 | * This node represents a document fragment, which contains elements, but when
132 | * placed into the DOM doesn't have any representation itself. Thus, it only
133 | * contains children and doesn't have any HTML properties. It also keeps track
134 | * of a height, depth, and maxFontSize.
135 | */
136 | function documentFragment(children, height, depth, maxFontSize) {
137 | this.children = children || [];
138 | this.height = height || 0;
139 | this.depth = depth || 0;
140 | this.maxFontSize = maxFontSize || 0;
141 | }
142 |
143 | /**
144 | * Convert the fragment into a node
145 | */
146 | documentFragment.prototype.toNode = function() {
147 | // Create a fragment
148 | var frag = document.createDocumentFragment();
149 |
150 | // Append the children
151 | for (var i = 0; i < this.children.length; i++) {
152 | frag.appendChild(this.children[i].toNode());
153 | }
154 |
155 | return frag;
156 | };
157 |
158 | /**
159 | * Convert the fragment into HTML markup
160 | */
161 | documentFragment.prototype.toMarkup = function() {
162 | var markup = "";
163 |
164 | // Simply concatenate the markup for the children together
165 | for (var i = 0; i < this.children.length; i++) {
166 | markup += this.children[i].toMarkup();
167 | }
168 |
169 | return markup;
170 | };
171 |
172 | /**
173 | * A symbol node contains information about a single symbol. It either renders
174 | * to a single text node, or a span with a single text node in it, depending on
175 | * whether it has CSS classes, styles, or needs italic correction.
176 | */
177 | function symbolNode(value, height, depth, italic, skew, classes, style) {
178 | this.value = value || "";
179 | this.height = height || 0;
180 | this.depth = depth || 0;
181 | this.italic = italic || 0;
182 | this.skew = skew || 0;
183 | this.classes = classes || [];
184 | this.style = style || {};
185 | this.maxFontSize = 0;
186 | }
187 |
188 | /**
189 | * Creates a text node or span from a symbol node. Note that a span is only
190 | * created if it is needed.
191 | */
192 | symbolNode.prototype.toNode = function() {
193 | var node = document.createTextNode(this.value);
194 | var span = null;
195 |
196 | if (this.italic > 0) {
197 | span = document.createElement("span");
198 | span.style.marginRight = this.italic + "em";
199 | }
200 |
201 | if (this.classes.length > 0) {
202 | span = span || document.createElement("span");
203 | span.className = createClass(this.classes);
204 | }
205 |
206 | for (var style in this.style) {
207 | if (this.style.hasOwnProperty(style)) {
208 | span = span || document.createElement("span");
209 | span.style[style] = this.style[style];
210 | }
211 | }
212 |
213 | if (span) {
214 | span.appendChild(node);
215 | return span;
216 | } else {
217 | return node;
218 | }
219 | };
220 |
221 | /**
222 | * Creates markup for a symbol node.
223 | */
224 | symbolNode.prototype.toMarkup = function() {
225 | // TODO(alpert): More duplication than I'd like from
226 | // span.prototype.toMarkup and symbolNode.prototype.toNode...
227 | var needsSpan = false;
228 |
229 |     var markup = "<span";
230 |
231 |     if (this.classes.length) {
232 |         needsSpan = true;
233 |         markup += " class=\"";
234 |         markup += utils.escape(createClass(this.classes));
235 |         markup += "\"";
236 |     }
237 |
238 |     var styles = "";
239 |
240 |     if (this.italic > 0) {
241 | styles += "margin-right:" + this.italic + "em;";
242 | }
243 | for (var style in this.style) {
244 | if (this.style.hasOwnProperty(style)) {
245 | styles += utils.hyphenate(style) + ":" + this.style[style] + ";";
246 | }
247 | }
248 |
249 | if (styles) {
250 | needsSpan = true;
251 | markup += " style=\"" + utils.escape(styles) + "\"";
252 | }
253 |
254 | var escaped = utils.escape(this.value);
255 | if (needsSpan) {
256 | markup += ">";
257 | markup += escaped;
258 |         markup += "</span>";
259 | return markup;
260 | } else {
261 | return escaped;
262 | }
263 | };
264 |
265 | module.exports = {
266 | span: span,
267 | documentFragment: documentFragment,
268 | symbolNode: symbolNode,
269 | };
270 |
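A small sketch of building and serializing these nodes by hand; toMarkup() runs under plain Node, while toNode() requires a browser document.

    var domTree = require("./domTree");

    // symbolNode(value, height, depth, italic, skew, classes, style)
    var x = new domTree.symbolNode("x", 0.43, 0, 0.05, 0, ["mathit"]);
    var wrapper = new domTree.span(["mord"], [x]);
    wrapper.setAttribute("aria-hidden", "true");

    console.log(wrapper.toMarkup());
    // <span class="mord" aria-hidden="true"><span class="mathit" style="margin-right:0.05em;">x</span></span>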
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/environments.js:
--------------------------------------------------------------------------------
1 | /* eslint no-constant-condition:0 */
2 | var fontMetrics = require("./fontMetrics");
3 | var parseData = require("./parseData");
4 | var ParseError = require("./ParseError");
5 |
6 | var ParseNode = parseData.ParseNode;
7 |
8 | /**
9 | * Parse the body of the environment, with rows delimited by \\ and
10 | * columns delimited by &, and create a nested list in row-major order
11 | * with one group per cell.
12 | */
 13 | var q = 0; // debug flag: set to 1 for matrix environments, read only by the commented-out traces below
14 | function parseArray(parser, result) {
15 | var row = [];
16 | var body = [row];
17 | var rowGaps = [];
18 |
19 | while (true) {
20 |
21 | // if (q == 1) console.error(parser.nextToken.text);
22 | try {
23 | var cell = parser.parseExpression(false, null);
24 | } catch (e) {
25 | // console.error(e);
 26 |             process.exit(1); // was exit(), which is undefined; abort on parse errors (Node-only)
27 | }
28 | // if (q == 1) exit();
29 | row.push(new ParseNode("ordgroup", cell, parser.mode));
30 | var next = parser.nextToken.text;
31 | if (next === "&") {
32 | parser.consume();
 33 |         } else if (next === "\\end" || next === "}") {
34 | break;
35 | } else if (next === "\\\\" || next === "\\cr") {
36 | var cr = parser.parseFunction();
37 | rowGaps.push(cr.value.size);
38 | row = [];
39 | body.push(row);
40 | } else {
41 | // TODO: Clean up the following hack once #385 got merged
42 | var pos = Math.min(parser.pos + 1, parser.lexer._input.length);
43 | throw new ParseError("Expected & or \\\\ or \\end",
44 | parser.lexer, pos);
45 | }
46 | }
47 | result.body = body;
48 | result.rowGaps = rowGaps;
49 | // if (q == 1) exit();
50 | var node = new ParseNode(result.type, result, parser.mode);
51 | return node;
52 | }
53 |
54 | /*
55 | * An environment definition is very similar to a function definition:
56 | * it is declared with a name or a list of names, a set of properties
57 | * and a handler containing the actual implementation.
58 | *
59 | * The properties include:
60 | * - numArgs: The number of arguments after the \begin{name} function.
61 | * - argTypes: (optional) Just like for a function
62 | * - allowedInText: (optional) Whether or not the environment is allowed inside
63 | * text mode (default false) (not enforced yet)
64 | * - numOptionalArgs: (optional) Just like for a function
65 | * A bare number instead of that object indicates the numArgs value.
66 | *
67 | * The handler function will receive two arguments
68 | * - context: information and references provided by the parser
69 | * - args: an array of arguments passed to \begin{name}
70 | * The context contains the following properties:
71 | * - envName: the name of the environment, one of the listed names.
72 | * - parser: the parser object
73 | * - lexer: the lexer object
74 | * - positions: the positions associated with these arguments from args.
75 | * The handler must return a ParseResult.
76 | */
77 |
78 | function defineEnvironment(names, props, handler) {
79 | if (typeof names === "string") {
80 | names = [names];
81 | }
82 | if (typeof props === "number") {
83 | props = { numArgs: props };
84 | }
85 | // Set default values of environments
86 | var data = {
87 | numArgs: props.numArgs || 0,
88 | argTypes: props.argTypes,
89 | greediness: 1,
90 | allowedInText: !!props.allowedInText,
91 | numOptionalArgs: props.numOptionalArgs || 0,
92 | handler: handler,
93 | };
94 | for (var i = 0; i < names.length; ++i) {
95 | module.exports[names[i]] = data;
96 | }
97 | }
98 |
99 | // Arrays are part of LaTeX, defined in lttab.dtx so its documentation
100 | // is part of the source2e.pdf file of LaTeX2e source documentation.
101 | defineEnvironment("array", {
102 | numArgs: 1,
103 | }, function(context, args) {
104 | var colalign = args[0];
105 | colalign = colalign.value.map ? colalign.value : [colalign];
106 | var cols = colalign.map(function(node) {
107 | var ca = node.value;
108 | if ("lcr".indexOf(ca) !== -1) {
109 | return {
110 | type: "align",
111 | align: ca,
112 | };
113 | } else if (ca === "|") {
114 | return {
115 | type: "separator",
116 | separator: "|",
117 | };
118 | }
119 | // throw new ParseError(
120 | // "Unknown column alignment: " + node.value,
121 | // context.lexer, context.positions[1]);
122 | });
123 | var res = {
124 | type: "array",
125 | style: "array",
126 | cols: cols,
127 | hskipBeforeAndAfter: true, // \@preamble in lttab.dtx
128 | };
129 | res = parseArray(context.parser, res);
130 | return res;
131 | });
132 |
133 | defineEnvironment("tabular", {
134 | numArgs: 1,
135 | }, function(context, args) {
136 | var colalign = args[0];
137 | colalign = colalign.value.map ? colalign.value : [colalign];
138 | var cols = colalign.map(function(node) {
139 | var ca = node.value;
140 | if ("lcr".indexOf(ca) !== -1) {
141 | return {
142 | type: "align",
143 | align: ca,
144 | };
145 | } else if (ca === "|") {
146 | return {
147 | type: "separator",
148 | separator: "|",
149 | };
150 | }
151 | // throw new ParseError(
152 | // "Unknown column alignment: " + node.value,
153 | // context.lexer, context.positions[1]);
154 | });
155 | var res = {
156 | type: "array",
157 | style: "tabular",
158 | cols: cols,
159 | hskipBeforeAndAfter: true, // \@preamble in lttab.dtx
160 | };
161 | res = parseArray(context.parser, res);
162 | return res;
163 | });
164 |
165 | // The matrix environments of amsmath build on the array environment
166 | // of LaTeX, which is discussed above.
167 | defineEnvironment([
168 | "matrix",
169 | "pmatrix",
170 | "bmatrix",
171 | "Bmatrix",
172 | "vmatrix",
173 | "Vmatrix",
174 | ], {
175 | }, function(context) {
176 | var delimiters = {
177 | "matrix": null,
178 | "pmatrix": ["(", ")"],
179 | "bmatrix": ["[", "]"],
180 | "Bmatrix": ["\\{", "\\}"],
181 | "vmatrix": ["|", "|"],
182 | "Vmatrix": ["\\Vert", "\\Vert"],
183 | }[context.envName];
184 | var res = {
185 | type: "array",
186 | style: "matrix",
187 | hskipBeforeAndAfter: false, // \hskip -\arraycolsep in amsmath
188 | };
189 | q = 1;
190 | res = parseArray(context.parser, res);
191 |
192 | if (delimiters) {
193 | res = new ParseNode("leftright", {
194 | body: [res],
195 | left: delimiters[0],
196 | right: delimiters[1],
197 | }, context.mode);
198 | }
199 | return res;
200 | });
201 |
202 | // A cases environment (in amsmath.sty) is almost equivalent to
203 | //   \def\arraystretch{1.2}% \left\{\begin{array}{@{}l@{\quad}l@{}} … \end{array}\right.
204 | // The `picture` environment defined next reuses that same layout.
205 | defineEnvironment("picture", {
206 | }, function(context) {
207 | var res = {
208 | type: "array",
209 | style: "picture",
210 | arraystretch: 1.2,
211 | cols: [{
212 | type: "align",
213 | align: "l",
214 | pregap: 0,
215 | postgap: fontMetrics.metrics.quad,
216 | }, {
217 | type: "align",
218 | align: "l",
219 | pregap: 0,
220 | postgap: 0,
221 | }],
222 | };
223 | res = parseArray(context.parser, res);
224 | res = new ParseNode("leftright", {
225 | body: [res],
226 | left: "\\{",
227 | right: ".",
228 | }, context.mode);
229 | return res;
230 | });
231 |
232 | defineEnvironment("cases", {
233 | }, function(context) {
234 | var res = {
235 | type: "array",
236 | style: "cases",
237 | arraystretch: 1.2,
238 | cols: [{
239 | type: "align",
240 | align: "l",
241 | pregap: 0,
242 | postgap: fontMetrics.metrics.quad,
243 | }, {
244 | type: "align",
245 | align: "l",
246 | pregap: 0,
247 | postgap: 0,
248 | }],
249 | };
250 | res = parseArray(context.parser, res);
251 | res = new ParseNode("leftright", {
252 | body: [res],
253 | left: "\\{",
254 | right: ".",
255 | }, context.mode);
256 | return res;
257 | });
258 |
259 | // An aligned environment is like the align* environment
260 | // except it operates within math mode.
261 | // Note that we assume \normallineskiplimit to be zero,
262 | // so that \strut@ is the same as \strut.
263 | defineEnvironment("aligned", {
264 | }, function(context) {
265 | var res = {
266 | type: "array",
267 | style: "aligned",
268 | cols: [],
269 | };
270 | res = parseArray(context.parser, res);
271 | var emptyGroup = new ParseNode("ordgroup", [], context.mode);
272 | var numCols = 0;
273 | res.value.body.forEach(function(row) {
274 | var i;
275 | for (i = 1; i < row.length; i += 2) {
276 | row[i].value.unshift(emptyGroup);
277 | }
278 | if (numCols < row.length) {
279 | numCols = row.length;
280 | }
281 | });
282 | for (var i = 0; i < numCols; ++i) {
283 | var align = "r";
284 | var pregap = 0;
285 | if (i % 2 === 1) {
286 | align = "l";
287 | } else if (i > 0) {
288 | pregap = 2; // one \qquad between columns
289 | }
290 | res.value.cols[i] = {
291 | type: "align",
292 | align: align,
293 | pregap: pregap,
294 | postgap: 0,
295 | };
296 | }
297 | return res;
298 | });
299 |
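A sketch of how these registrations are consumed: defineEnvironment writes each name directly onto module.exports, so lookups are plain property accesses. That Parser.js performs this lookup on \begin{...} is an assumption; Parser.js is not shown in this section.

    var environments = require("./environments");

    var pmatrix = environments["pmatrix"];
    console.log(pmatrix.numArgs);         // 0 (no arguments after \begin{pmatrix})
    console.log(typeof pmatrix.handler);  // "function"

    // Presumably Parser.js calls pmatrix.handler(context, args) when it
    // encounters \begin{pmatrix}, receiving back a ParseNode.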
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/fontMetrics.js:
--------------------------------------------------------------------------------
1 | /* eslint no-unused-vars:0 */
2 |
3 | var Style = require("./Style");
4 |
5 | /**
6 | * This file contains metrics regarding fonts and individual symbols. The sigma
7 | * and xi variables, as well as the metricMap map contain data extracted from
8 | * TeX, TeX font metrics, and the TTF files. These data are then exposed via the
9 | * `metrics` variable and the getCharacterMetrics function.
10 | */
11 |
12 | // These font metrics are extracted from TeX by using
13 | // \font\a=cmmi10
14 | // \showthe\fontdimenX\a
15 | // where X is the corresponding variable number. These correspond to the font
16 | // parameters of the symbol fonts. In TeX, there are actually three sets of
17 | // dimensions, one for each of textstyle, scriptstyle, and scriptscriptstyle,
18 | // but we only use the textstyle ones, and scale certain dimensions accordingly.
19 | // See the TeXbook, page 441.
20 | var sigma1 = 0.025;
21 | var sigma2 = 0;
22 | var sigma3 = 0;
23 | var sigma4 = 0;
24 | var sigma5 = 0.431;
25 | var sigma6 = 1;
26 | var sigma7 = 0;
27 | var sigma8 = 0.677;
28 | var sigma9 = 0.394;
29 | var sigma10 = 0.444;
30 | var sigma11 = 0.686;
31 | var sigma12 = 0.345;
32 | var sigma13 = 0.413;
33 | var sigma14 = 0.363;
34 | var sigma15 = 0.289;
35 | var sigma16 = 0.150;
36 | var sigma17 = 0.247;
37 | var sigma18 = 0.386;
38 | var sigma19 = 0.050;
39 | var sigma20 = 2.390;
40 | var sigma21 = 1.01;
41 | var sigma21Script = 0.81;
42 | var sigma21ScriptScript = 0.71;
43 | var sigma22 = 0.250;
44 |
45 | // These font metrics are extracted from TeX by using
46 | // \font\a=cmex10
47 | // \showthe\fontdimenX\a
48 | // where X is the corresponding variable number. These correspond to the font
49 | // parameters of the extension fonts (family 3). See the TeXbook, page 441.
50 | var xi1 = 0;
51 | var xi2 = 0;
52 | var xi3 = 0;
53 | var xi4 = 0;
54 | var xi5 = 0.431;
55 | var xi6 = 1;
56 | var xi7 = 0;
57 | var xi8 = 0.04;
58 | var xi9 = 0.111;
59 | var xi10 = 0.166;
60 | var xi11 = 0.2;
61 | var xi12 = 0.6;
62 | var xi13 = 0.1;
63 |
64 | // This value determines how large a pt is, for metrics which are defined in
65 | // terms of pts.
66 | // This value is also used in katex.less; if you change it make sure the values
67 | // match.
68 | var ptPerEm = 10.0;
69 |
70 | // The space between adjacent `|` columns in an array definition. From
71 | // `\showthe\doublerulesep` in LaTeX.
72 | var doubleRuleSep = 2.0 / ptPerEm;
73 |
74 | /**
75 | * This is just a mapping from common names to real metrics
76 | */
77 | var metrics = {
78 | xHeight: sigma5,
79 | quad: sigma6,
80 | num1: sigma8,
81 | num2: sigma9,
82 | num3: sigma10,
83 | denom1: sigma11,
84 | denom2: sigma12,
85 | sup1: sigma13,
86 | sup2: sigma14,
87 | sup3: sigma15,
88 | sub1: sigma16,
89 | sub2: sigma17,
90 | supDrop: sigma18,
91 | subDrop: sigma19,
92 | axisHeight: sigma22,
93 | defaultRuleThickness: xi8,
94 | bigOpSpacing1: xi9,
95 | bigOpSpacing2: xi10,
96 | bigOpSpacing3: xi11,
97 | bigOpSpacing4: xi12,
98 | bigOpSpacing5: xi13,
99 | ptPerEm: ptPerEm,
100 | emPerEx: sigma5 / sigma6,
101 | doubleRuleSep: doubleRuleSep,
102 |
103 | // TODO(alpert): Missing parallel structure here. We should probably add
104 | // style-specific metrics for all of these.
105 | delim1: sigma20,
106 | getDelim2: function(style) {
107 | if (style.size === Style.TEXT.size) {
108 | return sigma21;
109 | } else if (style.size === Style.SCRIPT.size) {
110 | return sigma21Script;
111 | } else if (style.size === Style.SCRIPTSCRIPT.size) {
112 | return sigma21ScriptScript;
113 | }
114 | throw new Error("Unexpected style size: " + style.size);
115 | },
116 | };
117 |
118 | // This map contains a mapping from font name and character code to character
119 | // metrics, including height, depth, italic correction, and skew (kern from the
120 | // character to the corresponding \skewchar)
121 | // This map is generated via `make metrics`. It should not be changed manually.
122 | var metricMap = require("./fontMetricsData");
123 |
124 | /**
125 | * This function is a convenience function for looking up information in the
126 | * metricMap table. It takes a character as a string, and a style.
127 | *
128 | * Note: the `width` property may be undefined if fontMetricsData.js wasn't
129 |  * built using `make extended_metrics`.
130 | */
131 | var getCharacterMetrics = function(character, style) {
132 | var metrics = metricMap[style][character.charCodeAt(0)];
133 | if (metrics) {
134 | return {
135 | depth: metrics[0],
136 | height: metrics[1],
137 | italic: metrics[2],
138 | skew: metrics[3],
139 | width: metrics[4],
140 | };
141 | }
142 | };
143 |
144 | module.exports = {
145 | metrics: metrics,
146 | getCharacterMetrics: getCharacterMetrics,
147 | };
148 |
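A short lookup sketch; the font name "Math-Italic" is assumed to be one of the keys generated into fontMetricsData.js by `make metrics`.

    var fontMetrics = require("./fontMetrics");

    // Named metrics derived from the sigma/xi values above:
    console.log(fontMetrics.metrics.xHeight);  // 0.431
    console.log(fontMetrics.metrics.quad);     // 1

    // Character metrics, keyed by font name and character code:
    var m = fontMetrics.getCharacterMetrics("x", "Math-Italic");
    if (m) {
        console.log(m.height, m.depth, m.italic, m.skew);
    }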
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/mathMLTree.js:
--------------------------------------------------------------------------------
1 | /**
2 | * These objects store data about MathML nodes. This is the MathML equivalent
3 | * of the types in domTree.js. Since MathML handles its own rendering, and
4 | * since we're mainly using MathML to improve accessibility, we don't manage
5 | * any of the styling state that the plain DOM nodes do.
6 | *
  7 |  * The `toNode` and `toMarkup` functions work similarly to how they do in
8 | * domTree.js, creating namespaced DOM nodes and HTML text markup respectively.
9 | */
10 |
11 | var utils = require("./utils");
12 |
13 | /**
14 | * This node represents a general purpose MathML node of any type. The
15 | * constructor requires the type of node to create (for example, `"mo"` or
 16 |  * `"mspace"`, corresponding to `<mo>` and `<mspace>` tags).
17 | */
18 | function MathNode(type, children) {
19 | this.type = type;
20 | this.attributes = {};
21 | this.children = children || [];
22 | }
23 |
24 | /**
25 | * Sets an attribute on a MathML node. MathML depends on attributes to convey a
26 | * semantic content, so this is used heavily.
27 | */
28 | MathNode.prototype.setAttribute = function(name, value) {
29 | this.attributes[name] = value;
30 | };
31 |
32 | /**
33 | * Converts the math node into a MathML-namespaced DOM element.
34 | */
35 | MathNode.prototype.toNode = function() {
36 | var node = document.createElementNS(
37 | "http://www.w3.org/1998/Math/MathML", this.type);
38 |
39 | for (var attr in this.attributes) {
40 | if (Object.prototype.hasOwnProperty.call(this.attributes, attr)) {
41 | node.setAttribute(attr, this.attributes[attr]);
42 | }
43 | }
44 |
45 | for (var i = 0; i < this.children.length; i++) {
46 | node.appendChild(this.children[i].toNode());
47 | }
48 |
49 | return node;
50 | };
51 |
52 | /**
53 | * Converts the math node into an HTML markup string.
54 | */
55 | MathNode.prototype.toMarkup = function() {
56 | var markup = "<" + this.type;
57 |
58 | // Add the attributes
59 | for (var attr in this.attributes) {
60 | if (Object.prototype.hasOwnProperty.call(this.attributes, attr)) {
61 | markup += " " + attr + "=\"";
62 | markup += utils.escape(this.attributes[attr]);
63 | markup += "\"";
64 | }
65 | }
66 |
67 | markup += ">";
68 |
69 | for (var i = 0; i < this.children.length; i++) {
70 | markup += this.children[i].toMarkup();
71 | }
72 |
 73 |     markup += "</" + this.type + ">";
74 |
75 | return markup;
76 | };
77 |
78 | /**
79 | * This node represents a piece of text.
80 | */
81 | function TextNode(text) {
82 | this.text = text;
83 | }
84 |
85 | /**
86 | * Converts the text node into a DOM text node.
87 | */
88 | TextNode.prototype.toNode = function() {
89 | return document.createTextNode(this.text);
90 | };
91 |
92 | /**
93 | * Converts the text node into HTML markup (which is just the text itself).
94 | */
95 | TextNode.prototype.toMarkup = function() {
96 | return utils.escape(this.text);
97 | };
98 |
99 | module.exports = {
100 | MathNode: MathNode,
101 | TextNode: TextNode,
102 | };
103 |
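A minimal sketch of producing MathML markup without a DOM:

    var mathMLTree = require("./mathMLTree");

    var mo = new mathMLTree.MathNode("mo", [new mathMLTree.TextNode("+")]);
    mo.setAttribute("separator", "true");

    console.log(mo.toMarkup());  // <mo separator="true">+</mo>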
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/parseData.js:
--------------------------------------------------------------------------------
1 | /**
2 | * The resulting parse tree nodes of the parse tree.
3 | */
4 | function ParseNode(type, value, mode) {
5 | this.type = type;
6 | this.value = value;
7 | this.mode = mode;
8 | }
9 |
10 | module.exports = {
11 | ParseNode: ParseNode,
12 | };
13 |
14 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/parseTree.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Provides a single function for parsing an expression using a Parser
3 | * TODO(emily): Remove this
4 | */
5 |
6 | var Parser = require("./Parser");
7 |
8 | /**
9 | * Parses an expression using a Parser, then returns the parsed result.
10 | */
11 | var parseTree = function(toParse, settings) {
12 | var parser = new Parser(toParse, settings);
13 |
14 | return parser.parse();
15 | };
16 |
17 | module.exports = parseTree;
18 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/katex/src/utils.js:
--------------------------------------------------------------------------------
1 | /**
2 | * This file contains a list of utility functions which are useful in other
3 | * files.
4 | */
5 |
6 | /**
7 | * Provide an `indexOf` function which works in IE8, but defers to native if
8 | * possible.
9 | */
10 | var nativeIndexOf = Array.prototype.indexOf;
11 | var indexOf = function(list, elem) {
12 | if (list == null) {
13 | return -1;
14 | }
15 | if (nativeIndexOf && list.indexOf === nativeIndexOf) {
16 | return list.indexOf(elem);
17 | }
18 | var i = 0;
19 | var l = list.length;
20 | for (; i < l; i++) {
21 | if (list[i] === elem) {
22 | return i;
23 | }
24 | }
25 | return -1;
26 | };
27 |
28 | /**
29 | * Return whether an element is contained in a list
30 | */
31 | var contains = function(list, elem) {
32 | return indexOf(list, elem) !== -1;
33 | };
34 |
35 | /**
36 | * Provide a default value if a setting is undefined
37 | */
38 | var deflt = function(setting, defaultIfUndefined) {
39 | return setting === undefined ? defaultIfUndefined : setting;
40 | };
41 |
42 | // hyphenate and escape adapted from Facebook's React under Apache 2 license
43 |
44 | var uppercase = /([A-Z])/g;
45 | var hyphenate = function(str) {
46 | return str.replace(uppercase, "-$1").toLowerCase();
47 | };
48 |
49 | var ESCAPE_LOOKUP = {
 50 |     "&": "&amp;",
 51 |     ">": "&gt;",
 52 |     "<": "&lt;",
 53 |     "\"": "&quot;",
 54 |     "'": "&#x27;",
55 | };
56 |
57 | var ESCAPE_REGEX = /[&><"']/g;
58 |
59 | function escaper(match) {
60 | return ESCAPE_LOOKUP[match];
61 | }
62 |
63 | /**
64 | * Escapes text to prevent scripting attacks.
65 | *
66 | * @param {*} text Text value to escape.
67 | * @return {string} An escaped string.
68 | */
69 | function escape(text) {
70 | return ("" + text).replace(ESCAPE_REGEX, escaper);
71 | }
72 |
73 | /**
74 | * A function to set the text content of a DOM element in all supported
75 | * browsers. Note that we don't define this if there is no document.
76 | */
77 | var setTextContent;
78 | if (typeof document !== "undefined") {
79 | var testNode = document.createElement("span");
80 | if ("textContent" in testNode) {
81 | setTextContent = function(node, text) {
82 | node.textContent = text;
83 | };
84 | } else {
85 | setTextContent = function(node, text) {
86 | node.innerText = text;
87 | };
88 | }
89 | }
90 |
91 | /**
92 | * A function to clear a node.
93 | */
94 | function clearNode(node) {
95 | setTextContent(node, "");
96 | }
97 |
98 | module.exports = {
99 | contains: contains,
100 | deflt: deflt,
101 | escape: escape,
102 | hyphenate: hyphenate,
103 | indexOf: indexOf,
104 | setTextContent: setTextContent,
105 | clearNode: clearNode,
106 | };
107 |
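A few quick examples of the helpers above:

    var utils = require("./utils");

    console.log(utils.hyphenate("marginRight"));     // "margin-right"
    console.log(utils.escape("a<b & \"c\""));        // "a&lt;b &amp; &quot;c&quot;"
    console.log(utils.contains(["mo", "mi"], "mi")); // true
    console.log(utils.deflt(undefined, 42));         // 42
    console.log(utils.indexOf(null, "x"));           // -1 (null-safe)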
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/match-at/README.md:
--------------------------------------------------------------------------------
  1 | # match-at
2 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/match-at/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "match-at",
3 | "version": "0.1.0",
4 | "description": "Relocatable regular expressions.",
5 | "repository": {
6 | "type": "git",
7 | "url": "https://github.com/spicyj/match-at"
8 | },
9 | "main": "lib/matchAt.js",
10 | "files": [
11 | "lib/"
12 | ],
13 | "devDependencies": {
14 | "babel": "^4.7.16",
15 | "jest-cli": "^0.4.0",
16 | "react-tools": "^0.13.1"
17 | },
18 | "jest": {
19 | "scriptPreprocessor": "/jestSupport/preprocessor.js",
20 | "unmockedModulePathPatterns": [
21 | ""
22 | ]
23 | },
24 | "scripts": {
25 | "prepublish": "babel -d lib/ src/",
26 | "test": "jest"
27 | },
28 | "gitHead": "4197daff69720734c72ba3321ed68a41c0527fb2",
29 | "bugs": {
30 | "url": "https://github.com/spicyj/match-at/issues"
31 | },
32 | "homepage": "https://github.com/spicyj/match-at",
33 | "_id": "match-at@0.1.0",
34 | "_shasum": "f561e7709ff9a105b85cc62c6b8ee7c15bf24f31",
35 | "_from": "match-at@",
36 | "_npmVersion": "2.2.0",
37 | "_nodeVersion": "0.10.35",
38 | "_npmUser": {
39 | "name": "spicyj",
40 | "email": "ben@benalpert.com"
41 | },
42 | "maintainers": [
43 | {
44 | "name": "spicyj",
45 | "email": "ben@benalpert.com"
46 | }
47 | ],
48 | "dist": {
49 | "shasum": "f561e7709ff9a105b85cc62c6b8ee7c15bf24f31",
50 | "tarball": "https://registry.npmjs.org/match-at/-/match-at-0.1.0.tgz"
51 | },
52 | "directories": {},
53 | "_resolved": "https://registry.npmjs.org/match-at/-/match-at-0.1.0.tgz"
54 | }
55 |
--------------------------------------------------------------------------------
/thirdparty/harvardnlp_im2markup/third_party/multi-bleu.perl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env perl
2 | #
3 | # This file is part of moses. Its use is licensed under the GNU Lesser General
4 | # Public License version 2.1 or, at your option, any later version.
5 |
6 | # $Id$
7 | use warnings;
8 | use strict;
9 |
10 | my $lowercase = 0;
11 | if ($ARGV[0] eq "-lc") {
12 | $lowercase = 1;
13 | shift;
14 | }
15 |
16 | my $stem = $ARGV[0];
17 | if (!defined $stem) {
18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n";
20 | exit(1);
21 | }
22 |
23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
24 |
25 | my @REF;
26 | my $ref=0;
27 | while(-e "$stem$ref") {
28 | &add_to_ref("$stem$ref",\@REF);
29 | $ref++;
30 | }
31 | &add_to_ref($stem,\@REF) if -e $stem;
32 | die("ERROR: could not find reference file $stem") unless scalar @REF;
33 |
34 | # add additional references explicitly specified on the command line
35 | shift;
36 | foreach my $stem (@ARGV) {
37 | &add_to_ref($stem,\@REF) if -e $stem;
38 | }
39 |
40 |
41 |
42 | sub add_to_ref {
43 | my ($file,$REF) = @_;
44 | my $s=0;
45 | if ($file =~ /.gz$/) {
46 | open(REF,"gzip -dc $file|") or die "Can't read $file";
47 | } else {
48 | open(REF,$file) or die "Can't read $file";
49 | }
 50 |     while(<REF>) {
51 | chop;
52 | push @{$$REF[$s++]}, $_;
53 | }
54 | close(REF);
55 | }
56 |
57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference);
58 | my $s=0;
 59 | while(<STDIN>) {
60 | chop;
61 | $_ = lc if $lowercase;
62 | my @WORD = split;
63 | my %REF_NGRAM = ();
64 | my $length_translation_this_sentence = scalar(@WORD);
65 | my ($closest_diff,$closest_length) = (9999,9999);
66 | foreach my $reference (@{$REF[$s]}) {
67 | # print "$s $_ <=> $reference\n";
68 | $reference = lc($reference) if $lowercase;
69 | my @WORD = split(' ',$reference);
70 | my $length = scalar(@WORD);
71 | my $diff = abs($length_translation_this_sentence-$length);
72 | if ($diff < $closest_diff) {
73 | $closest_diff = $diff;
74 | $closest_length = $length;
75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
76 | } elsif ($diff == $closest_diff) {
77 | $closest_length = $length if $length < $closest_length;
78 | # from two references with the same closeness to me
79 | # take the *shorter* into account, not the "first" one.
80 | }
81 | for(my $n=1;$n<=4;$n++) {
82 | my %REF_NGRAM_N = ();
83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) {
84 | my $ngram = "$n";
85 | for(my $w=0;$w<$n;$w++) {
86 | $ngram .= " ".$WORD[$start+$w];
87 | }
88 | $REF_NGRAM_N{$ngram}++;
89 | }
90 | foreach my $ngram (keys %REF_NGRAM_N) {
91 | if (!defined($REF_NGRAM{$ngram}) ||
92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
 94 |                 # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
95 | }
96 | }
97 | }
98 | }
99 | $length_translation += $length_translation_this_sentence;
100 | $length_reference += $closest_length;
101 | for(my $n=1;$n<=4;$n++) {
102 | my %T_NGRAM = ();
103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) {
104 | my $ngram = "$n";
105 | for(my $w=0;$w<$n;$w++) {
106 | $ngram .= " ".$WORD[$start+$w];
107 | }
108 | $T_NGRAM{$ngram}++;
109 | }
110 | foreach my $ngram (keys %T_NGRAM) {
111 | $ngram =~ /^(\d+) /;
112 | my $n = $1;
113 | # my $corr = 0;
114 |             # print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
115 | $TOTAL[$n] += $T_NGRAM{$ngram};
116 | if (defined($REF_NGRAM{$ngram})) {
117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
118 | $CORRECT[$n] += $T_NGRAM{$ngram};
119 | # $corr = $T_NGRAM{$ngram};
120 |                     # print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
121 | }
122 | else {
123 | $CORRECT[$n] += $REF_NGRAM{$ngram};
124 | # $corr = $REF_NGRAM{$ngram};
125 |                     # print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
126 | }
127 | }
128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
130 | }
131 | }
132 | $s++;
133 | }
134 | my $brevity_penalty = 1;
135 | my $bleu = 0;
136 |
137 | my @bleu=();
138 |
139 | for(my $n=1;$n<=4;$n++) {
140 | if (defined ($TOTAL[$n])){
141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
143 | }else{
144 | $bleu[$n]=0;
145 | }
146 | }
147 |
148 | if ($length_reference==0){
149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
150 | exit(1);
151 | }
152 |
153 | if ($length_translation<$length_reference) {
154 | $brevity_penalty = exp(1-$length_reference/$length_translation);
155 | }
156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
157 | my_log( $bleu[2] ) +
158 | my_log( $bleu[3] ) +
159 | my_log( $bleu[4] ) ) / 4) ;
160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
161 | 100*$bleu,
162 | 100*$bleu[1],
163 | 100*$bleu[2],
164 | 100*$bleu[3],
165 | 100*$bleu[4],
166 | $brevity_penalty,
167 | $length_translation / $length_reference,
168 | $length_translation,
169 | $length_reference;
170 |
171 | sub my_log {
172 | return -9999999999 unless $_[0];
173 | return log($_[0]);
174 | }
175 |
176 |
--------------------------------------------------------------------------------