├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── bin ├── kill.bash ├── kll ├── runChrome ├── runCode ├── runFilemerge.bash ├── runJupyter.bash ├── runJupyterLab.bash ├── runSpyder.bash ├── runTensorboard.bash ├── runTensorboard2.bash ├── runi2l.bash └── start_ssh_agent ├── docs └── index.html ├── papers ├── 4878-understanding-dropout.pdf ├── ADAM Optimizer.pdf ├── BLEU.pdf ├── BLEU_SmoothingTechniques.pdf ├── CTC_paper.pdf ├── ConvNet.numbers ├── Dropout.pdf ├── Google_NMT_System.pdf ├── Grammar As A Foreign Language.pdf ├── Image-to-Markup Generation with Coarse-to-Fine Attention.pdf ├── LatexCommands.pdf ├── Learning to combine foveal glimpses with a third-order Boltzmann machine.pdf ├── MULTIPLE OBJECT RECOGNITION WITH VISUAL ATTENTION.pdf ├── Neural Machine Translation by Jointly Learning to Align and Translate.pdf ├── Recurrent Models of Visual Attention.pdf ├── Recurrent Neural Network Regularization.pdf ├── Show, Attend and Tell- Neural Image Caption Generation with Visual Attention slides.pdf ├── Show, Attend and Tell- Neural Image Caption Generation with Visual Attention.pdf ├── VERY DEEP CONVOLUTIONAL NETWORKS FOR LARGE-SCALE IMAGE RECOGNITION.pdf ├── Visualizing and understanding convolutional networks slides.pdf ├── Visualizing and understanding convolutional networks.pdf ├── What You Get Is What You See- A Visual Markup Decompiler.pdf ├── amsldoc.pdf ├── candidate_sampling.pdf ├── dropout_hinton.pdf ├── glorot10a.pdf └── symbols-letter.pdf ├── src ├── README.md ├── commons │ ├── data_commons.py │ ├── data_reader.py │ ├── dl_commons.py │ ├── dl_commons_tests.py │ ├── pub_commons.py │ ├── test_tf_commons.py │ ├── tf_commons.py │ └── viz_commons.py ├── convnet.py ├── model │ ├── CALSTM.py │ ├── Im2LatexModel.py │ ├── hyper_params.py │ ├── tf_dynamic_decode.py │ └── tf_tutorial_code.py ├── postprocessing │ └── evaluate_images.ipynb ├── preprocessing │ ├── README.md │ ├── preprocessing_step_0.ipynb │ ├── preprocessing_step_1.ipynb │ ├── 
preprocessing_step_2_tokenizer.ipynb │ ├── preprocessing_step_3_filter.ipynb │ ├── preprocessing_step_4_binning.ipynb │ └── preprocessing_step_5_padding.ipynb ├── run.py ├── tools │ ├── bulk_disp_alpha.py │ ├── diff_params.ipynb │ ├── disp.ipynb │ ├── disp_alpha.ipynb │ ├── eval_runs.ipynb │ ├── predict.ipynb │ ├── prune_logs.ipynb │ ├── publishing.ipynb │ ├── sample_preds.ipynb │ ├── sample_strs.ipynb │ └── visualize.ipynb └── train_multi_gpu.py └── thirdparty ├── data ├── im2latex_formulas_downloaded.lst └── im2latex_formulas_downloaded.norm.lst └── harvardnlp_im2markup ├── LICENSE ├── Readme.md ├── scripts ├── evaluation │ ├── LevSeq.py │ ├── distance │ │ ├── __init__.py │ │ ├── _fastcomp.py │ │ ├── _iterators.py │ │ ├── _lcsubstrings.py │ │ ├── _levenshtein.py │ │ ├── _pyimports.py │ │ └── _simpledists.py │ ├── evaluate_bleu.py │ ├── evaluate_image.py │ ├── evaluate_text_edit_distance.py │ ├── render_html.py │ └── render_latex.py ├── preprocessing │ ├── generate_latex_vocab.py │ ├── preprocess_filter.py │ ├── preprocess_formulas.py │ ├── preprocess_images.py │ └── preprocess_latex.js └── utils │ ├── image_utils.py │ └── utils.py └── third_party ├── katex ├── .#katex.js ├── LICENSE.txt ├── README.md ├── cli.js ├── katex.js ├── package.json └── src │ ├── Lexer.js │ ├── Options.js │ ├── ParseError.js │ ├── Parser.js │ ├── Settings.js │ ├── Style.js │ ├── buildCommon.js │ ├── buildHTML.js │ ├── buildMathML.js │ ├── buildTree.js │ ├── delimiter.js │ ├── domTree.js │ ├── environments.js │ ├── fontMetrics.js │ ├── fontMetricsData.js │ ├── functions.js │ ├── mathMLTree.js │ ├── parseData.js │ ├── parseTree.js │ ├── symbols.js │ └── utils.js ├── match-at ├── README.md └── package.json └── multi-bleu.perl /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/.gitattributes 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | data 104 | data/ 105 | ShowAndTellSrc/ 106 | tb_metrics_*/ 107 | tb_metrics/ 108 | logdir/ 109 | *.out 110 | .vscode/ 111 | *.py~ 112 | *.tgz 113 | bin/conda 114 | bin/activate 115 
bin/deactivate
.DS_Store
._.DS_Store
.idea/
zpool_3TB
scratch.*
gallery/
gallery
# --------------------------------------------------------------------------------
# /bin/kill.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# List processes matching $1 (full command line), kill them, then list survivors.
pgrep -fl $1
pkill -fa $1
pgrep -fl $1
# --------------------------------------------------------------------------------
# /bin/kll  (symlink; file content is the link target)
# --------------------------------------------------------------------------------
kill.bash
# --------------------------------------------------------------------------------
# /bin/runChrome
# --------------------------------------------------------------------------------
#!/bin/bash
# Launch Chrome in the background with GPU compositing disabled.
google-chrome --disable-gpu &
# --------------------------------------------------------------------------------
# /bin/runCode
# --------------------------------------------------------------------------------
#!/bin/bash
# Launch VS Code with GPU compositing disabled.
code --disable-gpu
# --------------------------------------------------------------------------------
# /bin/runFilemerge.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# Open Apple FileMerge (macOS diff tool) bundled with Xcode.
open /Applications/Xcode.app/Contents/Applications/FileMerge.app
# --------------------------------------------------------------------------------
# /bin/runJupyter.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# Start a Jupyter notebook server on all interfaces, log to ~/logs, follow the log.
LOGFILE=~/logs/jupyter.out
cd ~
nohup jupyter notebook --ip=* --port 50001 > $LOGFILE 2>&1 &
tail -f $LOGFILE
# --------------------------------------------------------------------------------
# /bin/runJupyterLab.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# Start a JupyterLab server on all interfaces, log to ~/logs, follow the log.
LOGFILE=~/logs/jupyter_lab.out
cd ~
nohup jupyter lab --ip=* --port 50003 > $LOGFILE 2>&1 &
tail -f $LOGFILE
# --------------------------------------------------------------------------------
# /bin/runSpyder.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# Start Spyder IDE in the background, log to ~/logs, follow the log.
LOGFILE=~/logs/spyder.out
cd ~
nohup spyder > "$LOGFILE" 2>&1 &
tail -f "$LOGFILE"
# --------------------------------------------------------------------------------
# /bin/runTensorboard.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# Start TensorBoard on the predictions logdir (port 50002), follow the log.
LOGFILE=~/logs/tensorboard.out
cd ~
nohup tensorboard --logdir ~/predictions/logdir --purge_orphaned_data --port 50002 > "$LOGFILE" 2>&1 &
tail -f "$LOGFILE"
# --------------------------------------------------------------------------------
# /bin/runTensorboard2.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# Second TensorBoard instance pointed at the dev metrics dir (port 50003).
LOGFILE=~/logs/tensorboard2.out
cd ~
nohup tensorboard --logdir ~/im2latex/src/tb_metrics_dev --purge_orphaned_data --port 50003 > "$LOGFILE" 2>&1 &
tail -f "$LOGFILE"
# --------------------------------------------------------------------------------
# /bin/runi2l.bash
# --------------------------------------------------------------------------------
#!/bin/bash
# Launch a training run of run.py in the background and follow its log.
LOGFILE=~/logs/run.out
cd ~
# FIX: was `rm LOGFILE` — that deleted a file literally named "LOGFILE" in $HOME
# instead of the old log; -f keeps the script quiet when no old log exists.
rm -f "$LOGFILE"
## ./run.py -e -1 -b 40 -p -i 0 --r-lambda 0.00005 -k 1.0 -w 10 --squash-input-seq --logdir ./tb_metrics_dev
## ./run.py -e -1 -b 40 -p -i 0 --r-lambda 0.00005 -k 1.0 -w 10 --squash-input-seq --logdir ./tb_metrics_dev --logdir-tag test_3.1LSTM_2init_3out_3attConv_1beta
nohup ./run.py -e -1 -b 64 -w 10 -k 1.0 -p -i 0 >"$LOGFILE" 2>&1 &
tail -f "$LOGFILE"
# --------------------------------------------------------------------------------
# /bin/start_ssh_agent
# --------------------------------------------------------------------------------
#!/bin/bash
# Start ssh-agent and export its environment variables into this shell.
eval "$(ssh-agent -s)"
# --------------------------------------------------------------------------------
/docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /papers/4878-understanding-dropout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/4878-understanding-dropout.pdf -------------------------------------------------------------------------------- /papers/ADAM Optimizer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/ADAM Optimizer.pdf -------------------------------------------------------------------------------- /papers/BLEU.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/BLEU.pdf -------------------------------------------------------------------------------- /papers/BLEU_SmoothingTechniques.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/BLEU_SmoothingTechniques.pdf -------------------------------------------------------------------------------- /papers/CTC_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/CTC_paper.pdf -------------------------------------------------------------------------------- /papers/ConvNet.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/ConvNet.numbers 
-------------------------------------------------------------------------------- /papers/Dropout.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Dropout.pdf -------------------------------------------------------------------------------- /papers/Google_NMT_System.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Google_NMT_System.pdf -------------------------------------------------------------------------------- /papers/Grammar As A Foreign Language.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Grammar As A Foreign Language.pdf -------------------------------------------------------------------------------- /papers/Image-to-Markup Generation with Coarse-to-Fine Attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Image-to-Markup Generation with Coarse-to-Fine Attention.pdf -------------------------------------------------------------------------------- /papers/LatexCommands.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/LatexCommands.pdf -------------------------------------------------------------------------------- /papers/Learning to combine foveal glimpses with a third-order Boltzmann machine.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Learning to 
combine foveal glimpses with a third-order Boltzmann machine.pdf -------------------------------------------------------------------------------- /papers/MULTIPLE OBJECT RECOGNITION WITH VISUAL ATTENTION.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/MULTIPLE OBJECT RECOGNITION WITH VISUAL ATTENTION.pdf -------------------------------------------------------------------------------- /papers/Neural Machine Translation by Jointly Learning to Align and Translate.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Neural Machine Translation by Jointly Learning to Align and Translate.pdf -------------------------------------------------------------------------------- /papers/Recurrent Models of Visual Attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Recurrent Models of Visual Attention.pdf -------------------------------------------------------------------------------- /papers/Recurrent Neural Network Regularization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Recurrent Neural Network Regularization.pdf -------------------------------------------------------------------------------- /papers/Show, Attend and Tell- Neural Image Caption Generation with Visual Attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Show, Attend and Tell- Neural Image Caption Generation with Visual Attention.pdf 
-------------------------------------------------------------------------------- /papers/VERY DEEP CONVOLUTIONAL NETWORKS FOR LARGE-SCALE IMAGE RECOGNITION.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/VERY DEEP CONVOLUTIONAL NETWORKS FOR LARGE-SCALE IMAGE RECOGNITION.pdf -------------------------------------------------------------------------------- /papers/Visualizing and understanding convolutional networks slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Visualizing and understanding convolutional networks slides.pdf -------------------------------------------------------------------------------- /papers/Visualizing and understanding convolutional networks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/Visualizing and understanding convolutional networks.pdf -------------------------------------------------------------------------------- /papers/What You Get Is What You See- A Visual Markup Decompiler.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/What You Get Is What You See- A Visual Markup Decompiler.pdf -------------------------------------------------------------------------------- /papers/amsldoc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/amsldoc.pdf -------------------------------------------------------------------------------- /papers/candidate_sampling.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/candidate_sampling.pdf -------------------------------------------------------------------------------- /papers/dropout_hinton.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/dropout_hinton.pdf -------------------------------------------------------------------------------- /papers/glorot10a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/glorot10a.pdf -------------------------------------------------------------------------------- /papers/symbols-letter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/papers/symbols-letter.pdf -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | # Notes about the Framework 2 | 1. Input is streamed, not loaded at once into memory 3 | 2. Use 'n' GPUs 4 | 3. Snapshots are taken every 'n' epochs or based on other dynamic conditions (e.g. best observed validation accuracy) 5 | 4. Snapshot taken when model training is interrupted 6 | 5. Metrics viewed in tensorboard 7 | 6. All hyperparameters are saved alongside model weights 8 | 7. Very flexible class for specifying hyperparameters (includes model architecture as well as training parameters) 9 | 8. ... 
# --------------------------------------------------------------------------------
# /src/commons/data_commons.py
# --------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Copyright 2017 Sumeet S Singh

This file is part of im2latex solution by Sumeet S Singh.

This program is free software: you can redistribute it and/or modify
it under the terms of the Affero GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Affero GNU General Public License for more details.

You should have received a copy of the Affero GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

Created on Mon Jul 17 19:58:00 2017

@author: Sumeet S Singh
"""
import os
import time
import logging
# from six.moves import cPickle as pickle
import dill as pickle
import numpy as np
import pandas as pd  # FIX: pd was used by readlines_to_df/readlines_to_sr but never imported
import h5py

# Module-level state populated by initialize(): id -> word mapping and the
# vectorized (numpy ufunc) version of it. `logger` defaults to the logging
# module itself until makeLogger(set_global=True) installs a real logger.
dict_id2word = None
i2w_ufunc = None
logger = logging


def setLogLevel(logger_, level):
    """Set logger_'s threshold from an integer 1..5 (1=CRITICAL ... 5=DEBUG)."""
    logging_levels = (logging.CRITICAL, logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG)
    logger_.setLevel(logging_levels[level - 1])


def makeFormatter():
    """Return the log-record formatter used by all loggers in this project."""
    return logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')


def makeLogger(logging_level=3, name='default', set_global=False):
    """
    Create a stream logger with the project formatter.
    If set_global is True, also install it as this module's global `logger`.
    """
    global logger
    logger_ = logging.Logger(name)
    ch = logging.StreamHandler()
    ch.setFormatter(makeFormatter())
    logger_.addHandler(ch)
    setLogLevel(logger_, logging_level)
    if set_global:
        logger = logger_
    return logger_


def initialize(training_data_dir, params):
    """
    Load the id->word dictionary from <training_data_dir>/data_props.pkl (only
    once; subsequent calls are no-ops) and build `i2w_ufunc`, a numpy ufunc
    mapping arrays of token-ids to arrays of words.

    Special ids added to the dictionary:
      - params.CTCBlankTokenID (if set and >= vocab size) maps to u'<>'
      - -1 maps to u'<-1>' (beamsearch emits -1 after EOS)
    Returns the ufunc.
    """
    global i2w_ufunc, dict_id2word
    # if logger is None:
    #     logger = params.logger
    if i2w_ufunc is None:
        data_props = load(training_data_dir, 'data_props.pkl')
        dict_id2word = data_props['id2word']
        K = len(dict_id2word.keys())
        CTCBlankTokenID = params.CTCBlankTokenID
        if (CTCBlankTokenID is not None) and (CTCBlankTokenID >= K):
            dict_id2word[CTCBlankTokenID] = u'<>'  ## CTC Blank Token
        dict_id2word[-1] = u'<-1>'  ## Catch -1s that beamsearch emits after EOS.

        def i2w(id):
            try:
                return dict_id2word[id]
            except KeyError as e:
                # Unknown id: log it and emit a placeholder token instead of crashing.
                logger.critical('i2w: KeyError: %s', e)
                return '<%d>' % (id,)

        i2w_ufunc = np.frompyfunc(i2w, 1, 1)
    return i2w_ufunc


def seq2str(arr, label, separator=None):
    """
    Converts a matrix of id-sequences - shaped (B,T) - to an array of strings shaped (B,).
    Uses the module-level id->word dictionary (via i2w_ufunc) to map ids to words;
    the dictionary must map ids of the array's dtype to strings.
    Each output string is `label` + " " + the joined words of one row.
    """
    assert i2w_ufunc is not None, "i2w_ufunc is None. Please call initialize first in order to setup i2w_ufunc."
    str_arr = i2w_ufunc(arr)  # (B, T)
    if separator is None:
        func1d = lambda vec: label + u" " + u"".join(vec)
    else:
        func1d = lambda vec: label + u" " + unicode(separator).join(vec)
    return [func1d(vec) for vec in str_arr]


def join(*paths):
    """Thin wrapper over os.path.join."""
    return os.path.join(*paths)


def dump(ar, *paths):
    """Pickle `ar` to join(*paths); refuses to overwrite an existing file."""
    path = join(*paths)
    assert not os.path.exists(path), 'A file already exists at path %s' % path
    with open(path, 'wb') as f:
        pickle.dump(ar, f, pickle.HIGHEST_PROTOCOL)


def load(*paths):
    """Unpickle and return the object stored at join(*paths)."""
    with open(join(*paths), 'rb') as f:
        return pickle.load(f)


class Storer(object):
    """
    Context-manager wrapper around a newly-created HDF5 file used to store
    batches of (ragged) numpy arrays, padded to a common shape.
    """

    def __init__(self, args, prefix, step):
        self._path = os.path.join(args.storedir, '%s_%d.h5' % (prefix, step))
        # mode "w-" fails if the file already exists: never clobber results.
        self._h5 = h5py.File(self._path, mode="w-", swmr=False)

    def __enter__(self):
        return self

    def __exit__(self, *err):
        self.close()

    def flush(self):
        self._h5.flush()

    def close(self):
        self._h5.close()

    def write(self, key, ar, dtype=None, batch_axis=0, doUnwrap=True):
        """
        WARNING: ar must either be an numpy.ndarray (not numpy scalar) or a python list/tuple of numpy.ndarray.
        Nothing else will work.
        :param key: name of the HDF5 dataset to create
        :param ar: one ndarray, or a list/tuple of ndarrays to stack along batch_axis
        :param dtype: dataset dtype; np.unicode_ gets h5py variable-length string handling
        :param batch_axis: axis along which arrays are concatenated
        :param doUnwrap: if False, a list/tuple `ar` is treated as a single element
        :return: None
        """
        if (isinstance(ar, tuple) or isinstance(ar, list)) and doUnwrap:
            return self._write(key, ar, dtype, batch_axis)
        else:
            return self._write(key, [ar], dtype, batch_axis)

    def _write(self, key, np_ar_list, dtype, batch_axis):
        """
        WARNING: np_ar_list must be a python list/tuple of numpy.ndarray. Nothing else will work.

        Stacks the tensors in the list along axis=batch_axis and writes them to disk.
        Dimensions along axis=batch_axis are summed up (since we're stacking along that dimension).
        Other dimensions are padded to the maximum size
        with a dtype-suitable value (np.nan for float, -2 for integer)
        """
        ## Assuming all arrays have same rank, find the max dims
        shapes = [ar.shape for ar in np_ar_list]
        dims = list(zip(*shapes))  # list() so dims is indexable on py3 too (no-op on py2)
        max_shape = [max(d) for d in dims]
        ## We'll concatenate all arrays along axis=batch_axis
        max_shape[batch_axis] = sum(dims[batch_axis])
        if dtype == np.unicode_:
            dt = h5py.special_dtype(vlen=unicode)
            dataset = self._h5.create_dataset(key, max_shape, dtype=dt)
        else:
            dataset = self._h5.create_dataset(key, max_shape, dtype=dtype,
                                              fillvalue=-2 if np.issubdtype(dtype, np.integer) else np.nan)

        def make_slice(row, shape, batch_axis):
            """
            Create a slice to place shape into the receiving dataset starting at rownum along axis=batch_axis,
            and starting at 0 along all other axes
            """
            s = [slice(0, d) for d in shape]
            s[batch_axis] = slice(row, row + shape[batch_axis])
            return tuple(s), row + shape[batch_axis]

        row = 0
        for ar in np_ar_list:
            s, row = make_slice(row, ar.shape, batch_axis)
            # logger.info('row=%d, slice=%s', row, s)
            dataset[s] = ar


def makeLogfileName(logdir, name):
    """
    Return a free path under `logdir` based on `name`: the bare name if unused,
    otherwise name_2 .. name_100 (first free suffix). Raises after 100 attempts.
    """
    prefix, ext = os.path.splitext(os.path.basename(name))
    filenames = os.listdir(logdir)
    if not (prefix + ext) in filenames:
        return os.path.join(logdir, prefix + ext)
    else:
        for i in range(2, 101):  # range behaves identically to xrange here on py2
            if '%s_%d%s' % (prefix, i, ext) not in filenames:
                return os.path.join(logdir, '%s_%d%s' % (prefix, i, ext))

        raise Exception('logfile number limit (100) reached.')


def exists(*paths):
    """True iff a file/dir exists at os.path.join(*paths)."""
    return os.path.exists(os.path.join(*paths))


def makeLogDir(root, dirname):
    """Create (and return) a uniquely-named directory under `root` based on `dirname`."""
    dirpath = makeLogfileName(root, dirname)
    os.makedirs(dirpath)
    return dirpath


def makeTBDir(tb_logdir, logdir_tag=None):
    """Create (and return) a timestamped TensorBoard run directory, optionally tagged."""
    if logdir_tag is None:
        dirpath = os.path.join(tb_logdir, time.strftime('%Y-%m-%d %H-%M-%S %Z'))
    else:
        dirpath = os.path.join(tb_logdir, time.strftime('%Y-%m-%d %H-%M-%S %Z') + ' ' + logdir_tag)

    os.makedirs(dirpath)
    return dirpath


def readlines_to_df(path, colname):
    """Read the non-blank lines of a text file into a one-column DataFrame named `colname`."""
    # return pd.read_csv(output_file, sep='\t', header=None, names=['formula'], index_col=False, dtype=str, skipinitialspace=True, skip_blank_lines=True)
    rows = []
    n = 0
    with open(path, 'r') as f:
        print('opened file %s' % path)
        for line in f:
            n += 1
            line = line.strip()  # remove \n
            if len(line) > 0:
                rows.append(line.encode('utf-8'))
    print('processed %d lines resulting in %d rows' % (n, len(rows)))
    return pd.DataFrame({colname: rows}, dtype=np.str_)


def readlines_to_sr(path):
    """Read the non-blank lines of a text file into a pandas Series of strings."""
    rows = []
    n = 0
    with open(path, 'r') as f:
        print('opened file %s' % path)
        for line in f:
            n += 1
            line = line.strip()  # remove \n
            if len(line) > 0:
                rows.append(line.encode('utf-8'))
    print('processed %d lines resulting in %d rows' % (n, len(rows)))
    return pd.Series(rows, dtype=np.str_)


def sr_to_lines(sr, path):
    """Write a Series of strings to `path`, one stripped value per line (no embedded newlines)."""
    # df.to_csv(path, header=False, index=False, columns=['formula'], encoding='utf-8', quoting=csv.QUOTE_NONE, escapechar=None, sep='\t')
    assert sr.dtype == np.str_ or sr.dtype == np.object_
    with open(path, 'w') as f:
        for s in sr:
            assert '\n' not in s
            f.write(s.strip())
            f.write('\n')
# --------------------------------------------------------------------------------
# /src/commons/dl_commons_tests.py  (file continues on the next line of this archive)
# --------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Copyright 2017 - 2018 Sumeet S Singh

This file is part of im2latex solution by Sumeet S Singh.

This program is free software: you can redistribute it and/or modify
it under the terms of the Affero GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Affero GNU General Public License for more details.

You should have received a copy of the Affero GNU General Public License
along with this program. If not, see .

@author: Sumeet S Singh

Works on python 2.7
"""

import unittest
import dl_commons as dlc
from dl_commons import PD, LambdaVal, integer, integerOrNone, instanceof, equalto
#import tf_commons as tfc


class Props(dlc.Params):
    """Minimal Params subclass: two integer params 'm' and 'D' with static defaults."""
    proto = (
        PD('m', '',
           integer(),
           64),
        PD('D', '',
           integer(),
           512)
    )

    def __init__(self, initVals=None):
        # FIX: replaced the mutable default argument (initVals={}) with a None
        # sentinel. Behavior is unchanged -- an empty dict is still supplied when
        # the caller omits initVals -- but the dict is no longer a single object
        # shared across every call.
        dlc.Params.__init__(self, self.proto, initVals if initVals is not None else {})


class Props2(dlc.Params):
    """Params whose proto derives 'i', 'm2', 'D2' dynamically from a GLOBAL store."""

    def makeProto(self, GLOBAL):
        # 'i' is computed lazily from GLOBAL at access time via LambdaVal;
        # 'm2'/'D2' mirror GLOBAL's 'm'/'D' via equalto().
        return Props.proto + (
            PD('i', '',
               integer(),
               LambdaVal(lambda _, __: GLOBAL.m + GLOBAL.D)
               ),
            PD('m2', '',
               integer(),
               equalto('m', GLOBAL)),
            PD('D2', '',
               integer(),
               equalto('D', GLOBAL)),
            PD('j', '',
               integerOrNone(),
               None
               ),
            PD('k', '',
               integerOrNone(),
               1
               ),
        )

    def __init__(self, initVals=None):
        # Same mutable-default fix as Props. As in the original, the very same
        # object serves both as GLOBAL (for makeProto) and as initVals.
        vals = initVals if initVals is not None else {}
        dlc.Params.__init__(self, self.makeProto(vals), vals)


class Props3(dlc.Params):
    """Second derivation layer: chains 'i', 'm3', 'D3' off a Props2 instance."""

    def makeProto(self, GLOBAL):
        return Props.proto + (
            PD('i', '',
               integer(),
               equalto('i', GLOBAL)
               ),
            PD('m3', '',
               integer(),
               equalto('m2', GLOBAL)),
            PD('D3', '',
               integer(),
               equalto('D2', GLOBAL)),
            PD('j', '',
               integerOrNone(),
               2
               ),
            PD('k', '',
               integerOrNone(),
               2
               ),
            PD('l', '',
               integerOrNone(),
               2
               ),
        )

    def __init__(self, initVals=None):
        # Mutable-default fix; see Props.__init__.
        vals = initVals if initVals is not None else {}
        dlc.Params.__init__(self, self.makeProto(vals), vals)


class TestCaseBase(unittest.TestCase):
    """Shared helpers that wrap dict/constructor access in callables so they can
    be handed to assertRaises."""

    @staticmethod
    def dictSet(d, name, val):
        # d[name] = val as a callable, for use with assertRaises.
        d[name] = val

    @staticmethod
    def dictGet(d, name):
        # d[name] as a callable, for use with assertRaises.
        return d[name]

    @staticmethod
    def instantiate(cls, *args):
        # cls(*args) as a callable, for use with assertRaises.
        cls(*args)


class PropertiesTest(TestCaseBase):
    """Exercises dlc.Properties / dlc.Params / dlc.HyperParams access rules."""

    def __init__(self, *args):
        unittest.TestCase.__init__(self, *args)

    def test_good_props(self):
        """Open Properties accept new keys; frozen ones keep their snapshot values."""
        props = {
            'model_name': 'im2latex',
            'num_layers': None,
            'unset': None
        }
        # Renamed local from 'open' to avoid shadowing the builtin.
        open_props = dlc.Properties(props)
        # Sealing itself must not raise; 'sealed' is exercised in test_bad_props.
        sealed = dlc.Properties(open_props).seal()
        props['num_layers'] = 10
        frozen = dlc.Properties(props).freeze()

        open_props.layer_type = 'MLP'  # create new property
        self.assertEqual(open_props.layer_type, 'MLP')
        self.assertEqual(open_props['layer_type'], 'MLP')
        open_props['layer_type'] = 'CNN'
        self.assertEqual(open_props.layer_type, 'CNN')
        self.assertEqual(open_props['layer_type'], 'CNN')

        self.assertEqual(frozen.model_name, 'im2latex')
        self.assertEqual(frozen.unset, None)
        self.assertEqual(frozen['unset'], None)
        self.assertEqual(frozen['num_layers'], 10)
        self.assertEqual(frozen.num_layers, 10)

    def test_bad_props(self):
        """Sealed/frozen Properties reject new keys and (frozen) any mutation."""
        props = {
            'model_name': 'im2latex',
            'num_layers': None,
            'unset': None
        }
        open_props = dlc.Properties(props)
        sealed = dlc.Properties(open_props).seal()
        props['num_layers'] = 10
        frozen = dlc.Properties(props).freeze()

        self.assertRaises(dlc.AccessDeniedError, setattr, sealed, "x", "MyNeuralNetwork")
        self.assertRaises(dlc.AccessDeniedError, self.dictSet, sealed, "x", "MyNeuralNetwork")
        self.assertRaises(dlc.AccessDeniedError, setattr, frozen, "name", "MyNeuralNetwork")
        self.assertRaises(dlc.AccessDeniedError, self.dictSet, frozen, "name", "MyNeuralNetwork")

        self.assertRaises(KeyError, getattr, sealed, "x")
        self.assertRaises(KeyError, self.dictGet, sealed, "x")

    def test_good_params(self):
        """Sealed Params allow setting declared keys; freeze snapshots values."""
        sealed = dlc.Params((
            dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
            dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN'], 'LSTM'),
            dlc.ParamDesc('num_layers', 'Number of layers to create', range(1, 11)),
            dlc.ParamDesc('unset', 'Unset property', range(1, 11))
        )
        ).seal()
        frozen = dlc.Params(sealed, {'num_layers': 10}).freeze()
        sealed.layer_type = 'MLP'
        self.assertEqual(sealed.layer_type, 'MLP')
        self.assertEqual(sealed['layer_type'], 'MLP')
        sealed['layer_type'] = 'CNN'
        self.assertEqual(sealed.layer_type, 'CNN')
        self.assertEqual(sealed['layer_type'], 'CNN')

        self.assertEqual(frozen.model_name, 'im2latex')
        self.assertEqual(frozen.layer_type, 'LSTM')
        self.assertEqual(frozen['num_layers'], 10)
        self.assertEqual(frozen.num_layers, 10)

    def test_bad_params(self):
        """Params reject undeclared keys (KeyError) and out-of-domain values (ValueError)."""
        proto = (
            dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
            dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN']),
            dlc.ParamDesc('num_layers', 'Number of layers to create', range(1, 11)),
            dlc.ParamDesc('unset', 'Unset property', range(1, 11))
        )
        sealed = dlc.Params(proto).seal()
        frozen = dlc.Params(proto, {'num_layers': 10}).freeze()
        self.assertRaises(KeyError, setattr, sealed, "x", "MyNeuralNetwork")
        self.assertRaises(KeyError, self.dictSet, sealed, "x", "MyNeuralNetwork")
        self.assertRaises(KeyError, setattr, frozen, "name", "MyNeuralNetwork")
        self.assertRaises(KeyError, self.dictSet, frozen, "name", "MyNeuralNetwork")

        self.assertRaises(ValueError, setattr, sealed, "layer_type", "SVM")
        self.assertRaises(ValueError, self.dictSet, sealed, "layer_type", "SVM")

        self.assertRaises(KeyError, getattr, sealed, "x")
        self.assertRaises(KeyError, self.dictGet, sealed, "x")

    def test_good_hyperparams(self):
        """HyperParams: values with a default become one-shot (OneValError on reset)."""
        sealed = dlc.HyperParams((
            dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
            dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN'], 'MLP'),
            dlc.ParamDesc('num_layers', 'Number of layers to create', range(1, 11)),
            dlc.ParamDesc('unset', 'Unset property', range(1, 11)),
            dlc.ParamDesc('none', 'None property', (None,), None)
        )
        ).seal()
        frozen = dlc.HyperParams(sealed, {'num_layers': 10}).freeze()
        self.assertRaises(dlc.OneValError, setattr, sealed, "model_name", "xyz")
        self.assertRaises(dlc.OneValError, setattr, sealed, "layer_type", "xyz")
        self.assertEqual(sealed.layer_type, 'MLP')
        self.assertEqual(sealed['layer_type'], 'MLP')

        self.assertEqual(frozen.model_name, 'im2latex')
        self.assertEqual(frozen['num_layers'], 10)
        self.assertEqual(frozen.num_layers, 10)
        self.assertEqual(frozen.none, None)
        self.assertEqual(frozen['none'], None)
        self.assertEqual(sealed.none, None)
        self.assertEqual(sealed['none'], None)

    def test_bad_hyperparams(self):
        """HyperParams reject undeclared keys; unset declared keys raise KeyError on read."""
        sealed = dlc.HyperParams((
            dlc.ParamDesc('model_name', 'Name of Model', None, 'im2latex'),
            dlc.ParamDesc('layer_type', 'Type of layers to be created', ['CNN', 'MLP', 'LSTM', 'RNN']),
            dlc.ParamDesc('num_layers', 'Number of layers to create', range(1, 11)),
            dlc.ParamDesc('unset', 'Unset property', range(1, 11)),
            dlc.ParamDesc('none', 'None property', (None,), None)
        )).seal()
        frozen = dlc.HyperParams(sealed, {'num_layers': 10}).freeze()
        self.assertRaises(KeyError, setattr, sealed, "x", "MyNeuralNetwork")
        self.assertRaises(KeyError, self.dictSet, sealed, "x", "MyNeuralNetwork")
        self.assertRaises(KeyError, setattr, frozen, "name", "MyNeuralNetwork")
        self.assertRaises(KeyError, self.dictSet, frozen, "name", "MyNeuralNetwork")

        self.assertRaises(ValueError, setattr, sealed, "layer_type", "SVM")
        self.assertRaises(ValueError, self.dictSet, sealed, "layer_type", "SVM")

        self.assertRaises(KeyError, getattr, sealed, "x")
        self.assertRaises(KeyError, self.dictGet, sealed, "x")
        # 'layer_type' was declared without a default and never set, so reads fail.
        self.assertRaises(KeyError, getattr, frozen, 'layer_type')
        self.assertRaises(KeyError, getattr, sealed, 'layer_type')

    def test_lambda_vals(self):
        """Derived values (LambdaVal / equalto) track the upstream store live:
        changing p.m propagates through p2 into p3."""
        p = Props()
        p2 = Props2(p)
        p3 = Props3(p2)
        self.assertEqual(p.m, 64)
        self.assertEqual(p.D, 512)
        self.assertEqual(p2.m, 64)
        self.assertEqual(p2.D, 512)
        self.assertEqual(p2.i, 512 + 64)
        self.assertEqual(p2.m2, 64)
        self.assertEqual(p2.D2, 512)
        self.assertEqual(p3.m, 64)
        self.assertEqual(p3.D, 512)
        self.assertEqual(p3.i, 512 + 64)
        self.assertEqual(p3.m3, 64)
        self.assertEqual(p3.D3, 512)

        p.m = 128
        self.assertEqual(p.m, 128)
        self.assertEqual(p.D, 512)
        # p2's own static copies of m/D are unchanged...
        self.assertEqual(p2.m, 64)
        self.assertEqual(p2.D, 512)
        # ...but the derived values see the new p.m.
        self.assertEqual(p2.i, 512 + 128)
        self.assertEqual(p2.m2, 128)
        self.assertEqual(p2.D2, 512)
        self.assertEqual(p3.m, 64)
        self.assertEqual(p3.D, 512)
        self.assertEqual(p3.i, 512 + 128)
        self.assertEqual(p3.m3, 128)
        self.assertEqual(p3.D3, 512)
        self.assertEqual(p3.j, None)
        self.assertEqual(p3.k, 1)
        self.assertEqual(p3.l, 2)
# Script-style entry point: build a suite from PropertiesTest and run it with
# verbose output. Note this runs at import time -- there is no
# `if __name__ == '__main__'` guard in this file.
unittest.TextTestRunner(verbosity=2).run(unittest.TestLoader().loadTestsFromTestCase(PropertiesTest))
--------------------------------------------------------------------------------
/src/commons/pub_commons.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Copyright 2017 - 2018 Sumeet S Singh

This file is part of im2latex solution by Sumeet S Singh.

This program is free software: you can redistribute it and/or modify
it under the terms of the Affero GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Affero GNU General Public License for more details.

You should have received a copy of the Affero GNU General Public License
along with this program. If not, see .

@author: Sumeet S Singh

Works on python 2.7
"""
import os
import pandas as pd
import data_commons as dtc
from viz_commons import VisualizeStep, VisualizeDir

# Pandas display settings tuned for publishing full, untruncated LaTeX strings
# in notebook output (max_colwidth = -1 means "no truncation" in this pandas era).
pd.options.display.max_rows = 150
pd.options.display.max_columns = None
pd.options.display.max_colwidth = -1
pd.options.display.width = None
pd.options.display.max_seq_items = None
pd.options.display.expand_frame_repr = True
# pd.options.display.colheader_justify = 'right'
# display.pprint_nest_depth = 1


def verbatim(s):
    # Strip the surrounding '$' math delimiters and mangle '\begin' so the raw
    # LaTeX source is shown as text instead of being rendered.
    s = s.strip('$')
    if r'\begin' in s:
        s = s.replace(r'\begin', r'\begIn')  # Needed to fool MathJax into not rendering the LaTeX
    return s
    # return r'\begin{verbatim}\n%s\n\end{verbatim}\n' % (s,) if r'\begin' in s else s


def get_strs(dir):
    # Load target ('y') and predicted strings for the LAST recorded 'test' step
    # of the given storage directory, keyed by image_name.
    vd = VisualizeDir(dir)
    last_step = vd.get_steps()[1][-1]
    vs = VisualizeStep(vd, 'test', last_step)
    df_strs = vs.strs('y', 'predicted_ids', mingle=False, trim=True, wrap_strs=True, keys=['image_name'])
    # Add a column with the '_basic' suffix stripped from image names.
    df_strs['image_name_trunc'] = df_strs.image_name.str.replace('_basic.png', '.png')
    return df_strs


def DISP_ALPHA(storedir, graph, step, normalized_dataset=True,
               sample_num=0, invert_alpha=True, words=None, gamma=1, cmap='gist_gray', image=None, show_image=True):
    # Display the attention (alpha) visualization for one sample of one step,
    # followed by a table of target/predicted strings and their edit distance.
    # NOTE(review): `display` is not imported here -- this function assumes it
    # runs inside an IPython/Jupyter session where `display` is a builtin.
    dtc.makeLogger(3, set_global=True)
    # Note: Good cmap values are: gist_gray, gist_yarg, gist_heat
    # Good values of gamma_correction are 1 and 2.2/2.3
    vs = VisualizeStep(VisualizeDir(storedir, normalized_dataset=normalized_dataset), graph, step)
    df_strs = vs.strs('y', 'predicted_ids', mingle=False, trim=True, wrap_strs=True, keys=['image_name'])
    if image:
        # Select by image name (normalized to the '_basic.png' form);
        # sample_num must be 0 in this mode.
        if not image.endswith('_basic.png'):
            image = image.replace('.png', '_basic.png')
        df_strs = df_strs[df_strs.image_name.isin([image])]
        assert sample_num == 0
    else:
        # Select a single row by positional sample number.
        df_strs = df_strs.iloc[sample_num:sample_num+1]

    vs.alpha(sample_num, invert_alpha=invert_alpha, words=words, gamma_correction=gamma,
             cmap=cmap, index=df_strs.index, show_image=show_image)

    # df_ = pd.DataFrame(data={
    #     '$\mathbf{\hat{y}}$': [df_strs.predicted_ids.iloc[0], df_strs.predicted_ids.iloc[0].strip('$')],
    #     '$\mathbf{\hat{y}}$_len': [df_strs.predicted_ids_len.iloc[0]]*2,
    #     '$\mathbf{y}$': [df_strs.y.iloc[0], df_strs.y.iloc[0].strip('$')] ,
    #     '$\mathbf{y}$_len': [df_strs.y_len.iloc[0]]*2
    # })

    # Summary table: rendered and verbatim forms of target/prediction plus
    # sequence lengths and the edit distance.
    df_ = pd.DataFrame(data={
        'length': [df_strs.y_len.iloc[0], df_strs.predicted_ids_len.iloc[0]]*2 + [''],
        'value': [df_strs.y.iloc[0], df_strs.predicted_ids.iloc[0]] +
                 [verbatim(df_strs.y.iloc[0]), verbatim(df_strs.predicted_ids.iloc[0])] +
                 [df_strs.ed.iloc[0]],
    },
        index=['$\mathbf{y}$', '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq', 'edit distance'])

    display(df_[['value', 'length']])


def rmtails(s, *tails):
    # Successively remove everything from the LAST occurrence of each tail onward.
    for t in tails:
        s = s.rsplit(t, 1)[0]
    return s


rmtail = rmtails


def rmheads(s, *heads):
    # Successively remove everything up to and including the FIRST occurrence of
    # each head. NOTE(review): raises IndexError if a head is absent from s.
    for h in heads:
        s = s.split(h, 1)[1]
    return s


rmhead = rmheads


def get_unmatched_images(rendered_dir, strip=False):
    # Partition the filenames listed in unmatched_filenames.txt into those whose
    # rendered prediction image exists ('unmatched') and those that are missing
    # from images_pred ('missing'). With strip=True the '.png' suffix is removed.
    with open(os.path.join(rendered_dir, 'unmatched_filenames.txt'), 'r') as f:
        unmatched = [];
        missing = []
        for fname in f:
            fname = os.path.basename(fname.strip())
            path = os.path.join(rendered_dir, 'images_pred', fname)
            if not os.path.exists(path):
                if strip:
                    missing.append(fname.rsplit('.png', 1)[0])
                else:
                    missing.append(fname)
            else:
                if strip:
                    unmatched.append(fname.rsplit('.png', 1)[0])
                else:
                    unmatched.append(fname)

    return unmatched, missing


def strip_image_name(df, col='image_name'):
    """Changes name of images from xx_basic.png to xxx.png"""
    # Mutates df in place and also returns it for chaining.
    df[col] = df[col].str.replace('_basic.png', '.png')
    return df


def disp_matched_strs(dir):
    # Load the pickled "matched predictions" sample and present it with
    # MathJax-renderable column headers.
    df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'df_strs_matched_100.pkl'))
    df_out = pd.DataFrame({
        'edit_distance': df.ed,
        '$\mathbf{y}$_len': df.y_len,
        '$\mathbf{y}$': df.y,
        '$\mathbf{\hat{y}}$_len': df.predicted_ids_len,
        '$\mathbf{\hat{y}}$': df.predicted_ids
    }).reset_index(drop=True)[
        ['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len', '$\mathbf{\hat{y}}$']]
    return df_out


def disp_matched_strs2(dir):
    # Same as disp_matched_strs but with additional verbatim (_seq) columns.
    df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'df_strs_matched_100.pkl'))
    df_out = pd.DataFrame({
        'edit_distance': df.ed,
        '$\mathbf{y}$_len': df.y_len,
        '$\mathbf{y}$': df.y,
        '$\mathbf{\hat{y}}$_len': df.predicted_ids_len,
        '$\mathbf{\hat{y}}$': df.predicted_ids,
        '$\mathbf{y}$_seq': df.y.apply(verbatim, convert_dtype=False),
        '$\mathbf{\hat{y}}$_seq': df.predicted_ids.apply(verbatim, convert_dtype=False)
    }).reset_index(drop=True)[['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len',
                               '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq']]

    return df_out


def disp_unmatched(dir):
    # Present the "unmatched predictions" sample; note the source pickle uses a
    # different column schema (target_len / pred_len / '$\hat{y}$' / *_seq).
    df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'unmatched_preds_sample.pkl'))
    df_out = pd.DataFrame({
        'edit_distance': df.ed,
        '$\mathbf{y}$_len': df.target_len,
        '$\mathbf{y}$': df.y,
        '$\mathbf{\hat{y}}$_len': df.pred_len,
        '$\mathbf{\hat{y}}$': df['$\hat{y}$'],
        '$\mathbf{y}$_seq': df.target_seq.apply(verbatim, convert_dtype=False),
        '$\mathbf{\hat{y}}$_seq': df.pred_seq.apply(verbatim, convert_dtype=False)
    }).reset_index(drop=True)[['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len',
                               '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq']]

    return df_out


def disp_rand_sample(dir):
    # Present the random sample pickle with the same layout as disp_matched_strs2.
    df = pd.read_pickle(os.path.join(dir, 'gallery_data', 'rand_sample_100.pkl'))
    df_out = pd.DataFrame({
        'edit_distance': df.ed,
        '$\mathbf{y}$_len': df.y_len,
        '$\mathbf{y}$': df.y,
        '$\mathbf{\hat{y}}$_len': df.predicted_ids_len,
        '$\mathbf{\hat{y}}$': df.predicted_ids,
        '$\mathbf{y}$_seq': df.y.apply(verbatim, convert_dtype=False),
        '$\mathbf{\hat{y}}$_seq': df.predicted_ids.apply(verbatim, convert_dtype=False)
    }).reset_index(drop=True)[['edit_distance', '$\mathbf{y}$_len', '$\mathbf{y}$', '$\mathbf{\hat{y}}$_len',
                               '$\mathbf{\hat{y}}$', '$\mathbf{y}$_seq', '$\mathbf{\hat{y}}$_seq']]

    return df_out

--------------------------------------------------------------------------------
/src/commons/test_tf_commons.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Copyright 2017 Sumeet S Singh

This file is part of im2latex solution by Sumeet S Singh.

This program is free software: you can redistribute it and/or modify
it under the terms of the Affero GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Affero GNU General Public License for more details.

You should have received a copy of the Affero GNU General Public License
along with this program. If not, see .

Created on Sun Jul 9 11:44:46 2017
Tested on python 2.7

@author: Sumeet S Singh
"""
import tensorflow as tf
import tf_commons as tfc
from tf_commons import K
import numpy as np

def flatten(h, l):
    # Collapse the (B, k, T) hypothesis tensor to (B*k, T) and the (B, k)
    # length tensor to (B*k,), so 3D and 2D edit-distance ops can be compared.
    B, k, T = K.int_shape(h)
    return tf.reshape(h, (B*k, -1)), tf.reshape(l, (B*k,))

# Script-style self-test of tfc.squash_3d / edit_distance3D / edit_distance2D /
# seqlens. Builds the whole TF1 graph on CPU, then evaluates and asserts below.
with tf.device('/cpu:*'):
    ############### Tensors with ED == 0
    # h1: clean (3, 2, 3) sequences; l1: their lengths.
    h1 = tf.constant([[[1,2,3],[4,5,6]],
                      [[7,8,9],[10,11,12]],
                      [[13,14,15],[16,17,18]] ])
    l1 = tf.constant([[3, 3],
                      [3, 3],
                      [3, 3]])
    print 'Shapes: h1:%s, l1:%s'%(K.int_shape(h1), K.int_shape(l1))
    # h2: same sequences padded/interspersed with tokens 100/101 (the blank and
    # eos tokens passed to squash/edit-distance below).
    h2 = tf.constant([[[1,2,3,0,0,0,0],[4,5,6,0,0,0,0]],
                      [[7,8,100,9,101,0,0],[100,10,100,11,12,0,0]],
                      [[13,14,15,100,100,100,0],[101,16,17,18,0,0,0]] ])
    l2 = tf.constant([[3, 3],
                      [5, 5],
                      [6, 4]])
    print 'Shapes: h2:%s, l2:%s'%(K.int_shape(h2), K.int_shape(l2))
    h1_s, l1_s = tfc.squash_3d(3, 2, h1, l1, 100)
    print 'Shapes: h1_s:%s, l1_s:%s'%(K.int_shape(h1_s), K.int_shape(l1_s))
    h2_s, l2_s = tfc.squash_3d(3, 2, h2, l2, 100)
    print 'Shapes: h2_s:%s, l2_s:%s'%(K.int_shape(h2_s), K.int_shape(l2_s))
    # Edit distance between the padded and clean tensors should be 0 everywhere.
    ed1 = tfc.edit_distance3D(3, 2, h2, l2, h1, l1, 100, 101)
    mean1 = tf.reduce_mean(ed1)
    acc1 = tf.reduce_mean(tf.to_float(tf.equal(ed1, 0)))
    ed1_s = tfc.edit_distance3D(3, 2, h2_s, l2_s, h1_s, l1_s, 100, 101)
    mean1_s = tf.reduce_mean(ed1_s)
    acc1_s = tf.reduce_mean(tf.to_float(tf.equal(ed1_s, 0)))

    # Flattened variants to cross-check edit_distance2D against edit_distance3D.
    _h1, _l1 = flatten(h1, l1)
    _h1_s, _l1_s = flatten(h1_s, l1_s)
    _h2, _l2 = flatten(h2, l2)
    _h2_s, _l2_s = flatten(h2_s, l2_s)

    _ed1 = tfc.edit_distance2D(6, _h2, _l2, _h1, _l1, 100, 101)
    _mean1 = tf.reduce_mean(_ed1)
    _acc1 = tf.reduce_mean(tf.to_float(tf.equal(_ed1, 0)))
    _ed1_s = tfc.edit_distance2D(6, _h2_s, _l2_s, _h1_s, _l1_s, 100, 101)
    _mean1_s = tf.reduce_mean(_ed1_s)
    _acc1_s = tf.reduce_mean(tf.to_float(tf.equal(_ed1_s, 0)))

    ######################## Tensor with ED > 0
    # h2_2 perturbs h2 (token 99 inserted / substituted) so total ED becomes 1
    # after squashing -- asserted below as sum == 1, accuracy == 1/2.
    h2_2 = tf.constant([[[1,2,3,99,0,0,0],[4,5,6,0,0,0,0]],
                        [[7,8,100,99,0,0,0],[100,10,100,11,12,0,0]],
                        [[13,100,15,100,100,100,0],[100,16,17,18,0,0,0]] ])
    l2_2 = tf.constant([[4, 3],
                        [4, 5],
                        [6, 4]])
    h2_2_s, l2_2_s = tfc.squash_3d(3, 2, h2_2, l2_2, 100)
    _h2_2, _l2_2 = flatten(h2_2, l2_2)
    _h2_2_s, _l2_2_s = flatten(h2_2_s, l2_2_s)

    ed2 = tfc.edit_distance3D(3, 2, h2_2, l2_2, h1, l1, 100, 101)
    acc2 = tf.reduce_mean(tf.to_float(tf.equal(ed2, 0)))
    sum2 = tf.reduce_sum(ed2)
    ed2_s = tfc.edit_distance3D(3, 2, h2_2_s, l2_2_s, h1_s, l1_s, 100, 101)
    acc2_s = tf.reduce_mean(tf.to_float(tf.equal(ed2_s, 0)))
    sum2_s = tf.reduce_sum(ed2_s)
    print 'Shape of ed1=%s'%(K.int_shape(ed1),)
    print 'Shape of ed2_s=%s'%(K.int_shape(ed2_s),)
    _ed2 = tfc.edit_distance2D(6, _h2_2, _l2_2, _h1, _l1, 100, 101)
    _ed2_s = tfc.edit_distance2D(6, _h2_2_s, _l2_2_s, _h1_s, _l1_s, 100, 101)
    _sum2 = tf.reduce_sum(_ed2)
    _sum2_s = tf.reduce_sum(_ed2_s)
    _acc2 = tf.reduce_mean(tf.to_float(tf.equal(_ed2, 0)))
    _acc2_s = tf.reduce_mean(tf.to_float(tf.equal(_ed2_s, 0)))

    # tf.reduce_mean(tf.to_float(tf.equal(top1_ed, 0)))

    ## Test seqlens
    # Row i holds the token i repeated 11 times with a 0 (eos) at position i;
    # a final all-11 row has no eos at all.
    b = []
    for i in range(11):
        b.append([i]*11)
        b[i][i] = 0
    b.append([11]*11)
    b = np.asarray(b)

    tf_b = tf.constant(b)
    tf_lens1 = tfc.seqlens(tf.constant(b))
    tf_lens2 = tfc.seqlens(tf.constant(b.reshape((3,4,11)) ) )
    tf_lens1_2 = tfc.seqlens(tf.constant(b), include_eos_token=False)
    tf_lens2_2 = tfc.seqlens(tf.constant(b.reshape((3,4,11))) ,include_eos_token=False)
    # Expected lengths with / without counting the eos token. The eos-less last
    # row caps at the full width of 11.
    len_1 = np.arange(1,13); len_1[11] = 11
    len_2 = np.arange(12)

with tf.Session():
    # ED == 0 cases: mean edit distance 0, accuracy 1, for 3D/2D and
    # squashed/unsquashed variants alike.
    print 'ed1 = \n%s'%ed1.eval()
    assert mean1.eval() == 0.
    assert acc1.eval() == 1
    print '_ed1 = \n%s'%_ed1.eval()
    assert _mean1.eval() == 0.
    assert _acc1.eval() == 1
    print 'ed1_s = \n%s'%ed1_s.eval()
    assert mean1_s.eval() == 0.
    assert acc1_s.eval() == 1
    print '_ed1_s = \n%s'%_ed1_s.eval()
    assert _mean1_s.eval() == 0.
    assert _acc1_s.eval() == 1


    # ED > 0 cases: total edit distance 1, exactly half the sequences exact.
    print 'ed2 = \n%s'%ed2.eval()
    assert sum2.eval() == 1.
    assert acc2.eval() == 1./2.
    print '_ed2 = \n%s'%_ed2.eval()
    assert _sum2.eval() == 1.
    assert _acc2.eval() == 1./2.
    print 'ed2_s = \n%s'%ed2_s.eval()
    assert sum2_s.eval() == 1.
    assert acc2_s.eval() == 1./2.
    print '_ed2_s = \n%s'%_ed2_s.eval()
    assert _sum2_s.eval() == 1.
    assert _acc2_s.eval() == 1./2.

    # seqlens: 2D and 3D inputs, with and without the eos token counted.
    print tf_lens1.eval()
    print tf_lens1_2.eval()
    assert np.all(tf_lens1.eval() == len_1 )
    assert np.all(tf_lens1_2.eval() == len_2)
    print tf_lens2.eval()
    print tf_lens2_2.eval()
    assert np.all(tf_lens2.eval() == len_1.reshape(3,4))
    assert np.all(tf_lens2_2.eval() == len_2.reshape(3,4))
    print "Success !"

--------------------------------------------------------------------------------
/src/convnet.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Copyright 2017 Sumeet S Singh

This file is part of im2latex solution by Sumeet S Singh.

This program is free software: you can redistribute it and/or modify
it under the terms of the Affero GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 16 | Affero GNU General Public License for more details. 17 | 18 | You should have received a copy of the Affero GNU General Public License 19 | along with this program. If not, see . 20 | Created on Mon Jul 24 12:28:55 2017 21 | 22 | @author: Sumeet S Singh 23 | """ 24 | 25 | import os 26 | import argparse as arg 27 | import time 28 | from six.moves import cPickle as pickle 29 | import pandas as pd 30 | import tensorflow as tf 31 | from tensorflow.contrib.keras import backend as K 32 | from hyper_params import make_hyper 33 | import data_commons as dtc 34 | import dl_commons as dlc 35 | import tf_commons as tfc 36 | from data_reader import BatchImageIterator2, ImagenetProcessor 37 | from Im2LatexModel import build_vgg_context 38 | 39 | def get_df(params): 40 | # image_features_folder = params.vgg16_folder 41 | # raw_data_folder = params.raw_data_folder 42 | # image_features_folder = params.vgg16_folder 43 | 44 | # Join the two data-frames 45 | df_train = pd.read_pickle(os.path.join(params.raw_data_folder, 'df_train.pkl')) 46 | df_test = pd.read_pickle(os.path.join(params.raw_data_folder, 'df_test.pkl')) 47 | df = df_train.append(df_test) 48 | 49 | # Remove images that have been processed already. 
But round-up to batch-size 50 | image_list = [os.path.splitext(s)[0]+'.png' for s in filter(lambda s: s.endswith('.pkl'), os.listdir(params.vgg16_folder))] 51 | df = df[~df.image.isin(image_list)] 52 | if (df.shape[0] % params.B) != 0: 53 | shortfall = params.B - (df.shape[0] % params.B) 54 | df_remove = df[df.image.isin(image_list)] 55 | df = df.append(df_remove.sample(n=shortfall), verify_integrity=True) 56 | 57 | dtc.logger.info('Working with %d images', df.shape[0]) 58 | 59 | # Set all bin_len to max to ensure only one bin 60 | df.bin_len = df.bin_len.max() 61 | return df 62 | 63 | def run_convnet(params): 64 | HYPER = make_hyper(params) 65 | image_folder = params.image_folder 66 | raw_data_folder = params.raw_data_folder 67 | image_features_folder = params.vgg16_folder 68 | logger = HYPER.logger 69 | 70 | df = get_df(params) 71 | if df.shape[0] == 0: 72 | logger.info('No images remaining to process. All done.') 73 | else: 74 | logger.info('Processing %d images.', df.shape[0]) 75 | 76 | logger.info('\n#################### Args: ####################\n%s', params.pformat()) 77 | logger.info('##################################################################\n') 78 | logger.info( '\n######################### HYPER Params: #########################\n%s', HYPER.pformat()) 79 | logger.info('##################################################################\n') 80 | 81 | b_it = BatchImageIterator2( 82 | raw_data_folder, 83 | image_folder, 84 | HYPER, 85 | image_processor=ImagenetProcessor(HYPER), 86 | df=df, 87 | num_steps=params.num_steps, 88 | num_epochs=params.num_epochs) 89 | 90 | graph = tf.Graph() 91 | with graph.as_default(): 92 | 93 | config=tf.ConfigProto(log_device_placement=False) 94 | config.gpu_options.allow_growth = True 95 | tf_session = tf.Session(config=config) 96 | with tf_session.as_default(): 97 | K.set_session(tf_session) 98 | 99 | tf_im = tf.placeholder(dtype=HYPER.dtype, shape=((HYPER.B,)+HYPER.image_shape), name='image') 100 | with 
tf.device('/gpu:1'): # change this to gpu:0 if you only have one gpu 101 | tf_a_batch = build_vgg_context(HYPER, tf_im) 102 | tf_a_list = tf.unstack(tf_a_batch, axis=0) 103 | 104 | t_n = tfc.printVars('Trainable Variables', tf.trainable_variables()) 105 | g_n = tfc.printVars('Global Variables', tf.global_variables()) 106 | l_n = tfc.printVars('Local Variables', tf.local_variables()) 107 | assert t_n == g_n 108 | assert g_n == l_n 109 | 110 | print '\nUninitialized params' 111 | print tf_session.run(tf.report_uninitialized_variables()) 112 | 113 | print 'Flushing graph to disk' 114 | tf_sw = tf.summary.FileWriter(tfc.makeTBDir(HYPER.tb), graph=graph) 115 | tf_sw.flush() 116 | 117 | print '\n' 118 | start_time = time.clock() 119 | for step, b in enumerate(b_it, start=1): 120 | # if b.epoch > 1 or (params.num_steps >= 0 and step > params.num_steps): 121 | # break 122 | feed_dict = {tf_im: b.im, K.learning_phase(): 0} 123 | a_list = tf_session.run(tf_a_list, feed_dict = feed_dict) 124 | assert len(a_list) == len(b.image_name) 125 | for i, a in enumerate(a_list): 126 | ## print 'Writing %s, shape=%s'%(b.image_name[i], a.shape) 127 | with open(os.path.join(image_features_folder, os.path.splitext(b.image_name[i])[0] + '.pkl'), 128 | 'wb') as f: 129 | pickle.dump(a, f, pickle.HIGHEST_PROTOCOL) 130 | if step % 10 == 0: 131 | print('Elapsed time for %d steps = %d seconds'%(step, time.clock()-start_time)) 132 | print('Elapsed time for %d steps = %d seconds'%(step, time.clock()-start_time)) 133 | print 'done' 134 | 135 | def main(): 136 | _data_folder = '../data/dataset3' 137 | 138 | parser = arg.ArgumentParser(description='train model') 139 | parser.add_argument("--num-steps", "-n", dest="num_steps", type=int, 140 | help="Number of training steps to run. Defaults to -1 if unspecified, i.e. run to completion", 141 | default=-1) 142 | parser.add_argument("--num-epochs", "-e", dest="num_epochs", type=int, 143 | help="Number of training steps to run. 
Defaults to 1 if unspecified.", 144 | default=1) 145 | parser.add_argument("--batch-size", "-b", dest="batch_size", type=int, 146 | help="Batchsize. If unspecified, defaults to the default value in hyper_params", 147 | default=None) 148 | parser.add_argument("--print-steps", "-s", dest="print_steps", type=int, 149 | help="Number of training steps after which to log results. Defaults to 10 if unspecified", 150 | default=100) 151 | parser.add_argument("--data-folder", "-d", dest="data_folder", type=str, 152 | help="Data folder. If unspecified, defaults to " + _data_folder, 153 | default=_data_folder) 154 | parser.add_argument("--raw-data-folder", dest="raw_data_folder", type=str, 155 | help="Raw data folder. If unspecified, defaults to data_folder/training", 156 | default=None) 157 | parser.add_argument("--vgg16-folder", dest="vgg16_folder", type=str, 158 | help="vgg16 data folder. If unspecified, defaults to data_folder/vgg16_features", 159 | default=None) 160 | parser.add_argument("--image-folder", dest="image_folder", type=str, 161 | help="image folder. If unspecified, defaults to data_folder/formula_images", 162 | default=None) 163 | parser.add_argument("--partial-batch", "-p", dest="partial_batch", action='store_true', 164 | help="Sets assert_whole_batch hyper param to False. 
Default hyper_param value will be used if unspecified") 165 | parser.add_argument("--logging-level", "-l", dest="logging_level", type=int, 166 | help="Logging verbosity level from 1 to 5 in increasing order of verbosity.", 167 | default=4) 168 | 169 | 170 | args = parser.parse_args() 171 | data_folder = args.data_folder 172 | params = dlc.Properties({'num_steps': args.num_steps, 173 | 'print_steps':args.print_steps, 174 | 'num_epochs': args.num_epochs, 175 | 'logger': dtc.makeLogger(args.logging_level, set_global=True), 176 | 'build_image_context': 1, 177 | 'weights_regularizer': None, 178 | 'num_gpus': 1, 179 | 'tb': tfc.TensorboardParams({'tb_logdir': 'tb_metrics_convnet'}).freeze() 180 | }) 181 | if args.image_folder: 182 | params.image_folder = args.image_folder 183 | else: 184 | params.image_folder = os.path.join(data_folder,'formula_images') 185 | 186 | if args.raw_data_folder: 187 | params.raw_data_folder = args.raw_data_folder 188 | else: 189 | params.raw_data_folder = os.path.join(data_folder, 'training') 190 | params.raw_data_dir = params.raw_data_folder 191 | 192 | if args.vgg16_folder: 193 | params.vgg16_folder = args.vgg16_folder 194 | else: 195 | params.vgg16_folder = os.path.join(data_folder, 'vgg16_features') 196 | 197 | if args.batch_size is not None: 198 | params.B = args.batch_size 199 | if args.partial_batch: 200 | params.assert_whole_batch = False 201 | 202 | data_props = dtc.load(params.raw_data_folder, 'data_props.pkl') 203 | params.image_shape = (data_props['padded_image_dim']['height'], data_props['padded_image_dim']['width'], 3) 204 | run_convnet(params) 205 | 206 | main() 207 | -------------------------------------------------------------------------------- /src/model/tf_dynamic_decode.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Seq2seq layer operations for use in neural networks.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import abc 22 | import six 23 | 24 | from tensorflow.python.framework import constant_op 25 | from tensorflow.python.framework import dtypes 26 | from tensorflow.python.framework import ops 27 | from tensorflow.python.framework import tensor_shape 28 | from tensorflow.python.framework import tensor_util 29 | from tensorflow.python.ops import array_ops 30 | from tensorflow.python.ops import control_flow_ops 31 | from tensorflow.python.ops import math_ops 32 | from tensorflow.python.ops import rnn 33 | from tensorflow.python.ops import tensor_array_ops 34 | from tensorflow.python.ops import variable_scope 35 | from tensorflow.python.util import nest 36 | from tensorflow.contrib.seq2seq import Decoder 37 | import tf_commons as tfc 38 | import data_commons as dtc 39 | 40 | __all__ = ["dynamic_decode"] 41 | 42 | 43 | _transpose_batch_time = rnn._transpose_batch_time # pylint: disable=protected-access 44 | 45 | 46 | 47 | 48 | 49 | def _create_zero_outputs(size, dtype, batch_size): 50 | """Create a zero outputs Tensor structure.""" 51 | def _t(s): 52 | return (s if isinstance(s, ops.Tensor) else 
constant_op.constant( 53 | tensor_shape.TensorShape(s).as_list(), 54 | dtype=dtypes.int32, 55 | name="zero_suffix_shape")) 56 | 57 | def _create(s, d): 58 | return array_ops.zeros( 59 | array_ops.concat( 60 | ([batch_size], _t(s)), axis=0), dtype=d) 61 | 62 | return nest.map_structure(_create, size, dtype) 63 | 64 | 65 | def dynamic_decode(decoder, 66 | output_time_major=False, 67 | impute_finished=False, 68 | maximum_iterations=None, 69 | parallel_iterations=32, 70 | swap_memory=False, 71 | scope=None): 72 | """Perform dynamic decoding with `decoder`. 73 | 74 | Calls initialize() once and step() repeatedly on the Decoder object. 75 | 76 | Args: 77 | decoder: A `Decoder` instance. 78 | output_time_major: Python boolean. Default: `False` (batch major). If 79 | `True`, outputs are returned as time major tensors (this mode is faster). 80 | Otherwise, outputs are returned as batch major tensors (this adds extra 81 | time to the computation). 82 | impute_finished: Python boolean. If `True`, then states for batch 83 | entries which are marked as finished get copied through and the 84 | corresponding outputs get zeroed out. This causes some slowdown at 85 | each time step, but ensures that the final state and outputs have 86 | the correct values and that backprop ignores time steps that were 87 | marked as finished. 88 | maximum_iterations: `int32` scalar, maximum allowed number of decoding 89 | steps. Default is `None` (decode until the decoder is fully done). 90 | parallel_iterations: Argument passed to `tf.while_loop`. 91 | swap_memory: Argument passed to `tf.while_loop`. 92 | scope: Optional variable scope to use. 93 | 94 | Returns: 95 | `(final_outputs, final_state, final_sequence_lengths)`. 96 | 97 | Raises: 98 | TypeError: if `decoder` is not an instance of `Decoder`. 99 | ValueError: if `maximum_iterations` is provided but is not a scalar. 
100 | """ 101 | if not isinstance(decoder, Decoder): 102 | raise TypeError("Expected decoder to be type Decoder, but saw: %s" % 103 | type(decoder)) 104 | 105 | with variable_scope.variable_scope(scope, "decoder") as varscope: 106 | # Properly cache variable values inside the while_loop 107 | if varscope.caching_device is None: 108 | varscope.set_caching_device(lambda op: op.device) 109 | 110 | if maximum_iterations is not None: 111 | maximum_iterations = ops.convert_to_tensor( 112 | maximum_iterations, dtype=dtypes.int32, name="maximum_iterations") 113 | if maximum_iterations.get_shape().ndims != 0: 114 | raise ValueError("maximum_iterations must be a scalar") 115 | 116 | initial_finished, initial_inputs, initial_state = decoder.initialize() 117 | 118 | zero_outputs = _create_zero_outputs(decoder.output_size, 119 | decoder.output_dtype, 120 | decoder.batch_size) 121 | 122 | if maximum_iterations is not None: 123 | initial_finished = math_ops.logical_or( 124 | initial_finished, 0 >= maximum_iterations) 125 | initial_sequence_lengths = array_ops.zeros_like( 126 | initial_finished, dtype=dtypes.int32) 127 | initial_time = constant_op.constant(0, dtype=dtypes.int32) 128 | 129 | def _shape(batch_size, from_shape): 130 | if not isinstance(from_shape, tensor_shape.TensorShape): 131 | return tensor_shape.TensorShape(None) 132 | else: 133 | batch_size = tensor_util.constant_value( 134 | ops.convert_to_tensor( 135 | batch_size, name="batch_size")) 136 | return tensor_shape.TensorShape([batch_size]).concatenate(from_shape) 137 | 138 | def _create_ta(s, d): 139 | return tensor_array_ops.TensorArray( 140 | dtype=d, 141 | size=0, 142 | dynamic_size=True, 143 | element_shape=_shape(decoder.batch_size, s)) 144 | 145 | def _create_states_ta(t): 146 | return tensor_array_ops.TensorArray( 147 | dtype=t.dtype, 148 | size=0, 149 | dynamic_size=True, 150 | clear_after_read=True, 151 | element_shape=t.shape 152 | ) 153 | 154 | initial_outputs_ta = nest.map_structure(_create_ta, 
decoder.output_size, 155 | decoder.output_dtype) 156 | # dtc.logger.info('decoder.initial_state.shape=%s', tfc.nested_tf_shape(initial_state)) 157 | initial_states_ta = nest.map_structure(_create_states_ta, initial_state) 158 | # dtc.logger.info('initial_states_ta=%s', initial_states_ta) 159 | 160 | def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs, 161 | finished, unused_sequence_lengths, unused_states_ta): 162 | return math_ops.logical_not(math_ops.reduce_all(finished)) 163 | 164 | def body(time, outputs_ta, state, inputs, finished, sequence_lengths, states_ta): 165 | """Internal while_loop body. 166 | 167 | Args: 168 | time: scalar int32 tensor. 169 | outputs_ta: structure of TensorArray. 170 | state: (structure of) state tensors and TensorArrays. 171 | inputs: (structure of) input tensors. 172 | finished: bool tensor (keeping track of what's finished). 173 | sequence_lengths: int32 tensor (keeping track of time of finish). 174 | states_ta: structure of TensorArray for storing states. 175 | 176 | Returns: 177 | `(time + 1, outputs_ta, next_state, next_inputs, next_finished, 178 | next_sequence_lengths, states_ta)`. 
179 | ``` 180 | """ 181 | (next_outputs, decoder_state, next_inputs, 182 | decoder_finished) = decoder.step(time, inputs, state) 183 | next_finished = math_ops.logical_or(decoder_finished, finished) 184 | if maximum_iterations is not None: 185 | next_finished = math_ops.logical_or( 186 | next_finished, time + 1 >= maximum_iterations) 187 | next_sequence_lengths = array_ops.where( 188 | math_ops.logical_and(math_ops.logical_not(finished), next_finished), 189 | array_ops.fill(array_ops.shape(sequence_lengths), time + 1), 190 | sequence_lengths) 191 | 192 | nest.assert_same_structure(state, decoder_state) 193 | nest.assert_same_structure(states_ta, decoder_state) 194 | nest.assert_same_structure(outputs_ta, next_outputs) 195 | nest.assert_same_structure(inputs, next_inputs) 196 | 197 | # Zero out output values past finish 198 | if impute_finished: 199 | emit = nest.map_structure( 200 | lambda out, zero: array_ops.where(finished, zero, out), 201 | next_outputs, 202 | zero_outputs) 203 | else: 204 | emit = next_outputs 205 | 206 | # Copy through states past finish 207 | def _maybe_copy_state(new, cur): 208 | # TensorArrays and scalar states get passed through. 
209 | if isinstance(cur, tensor_array_ops.TensorArray): 210 | pass_through = True 211 | else: 212 | new.set_shape(cur.shape) 213 | pass_through = (new.shape.ndims == 0) 214 | return new if pass_through else array_ops.where(finished, cur, new) 215 | 216 | if impute_finished: 217 | next_state = nest.map_structure( 218 | _maybe_copy_state, decoder_state, state) 219 | else: 220 | next_state = decoder_state 221 | 222 | outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out), 223 | outputs_ta, emit) 224 | states_ta = nest.map_structure(lambda ta, st: ta.write(time, st), 225 | states_ta, next_state) 226 | 227 | return (time + 1, outputs_ta, next_state, next_inputs, next_finished, 228 | next_sequence_lengths, states_ta) 229 | 230 | res = control_flow_ops.while_loop( 231 | condition, 232 | body, 233 | loop_vars=[ 234 | initial_time, initial_outputs_ta, initial_state, initial_inputs, 235 | initial_finished, initial_sequence_lengths, initial_states_ta 236 | ], 237 | parallel_iterations=parallel_iterations, 238 | swap_memory=swap_memory) 239 | 240 | final_outputs_ta = res[1] 241 | final_state = res[2] 242 | final_sequence_lengths = res[5] 243 | final_states_ta = res[6] 244 | 245 | final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta) 246 | final_states = nest.map_structure(lambda ta: ta.stack(), final_states_ta) 247 | 248 | try: 249 | final_outputs, final_state = decoder.finalize( 250 | final_outputs, final_state, final_sequence_lengths) 251 | except NotImplementedError: 252 | pass 253 | 254 | if not output_time_major: 255 | final_outputs = nest.map_structure(_transpose_batch_time, final_outputs) 256 | final_states = nest.map_structure(_transpose_batch_time, final_states) 257 | 258 | return final_outputs, final_state, final_sequence_lengths, final_states 259 | -------------------------------------------------------------------------------- /src/model/tf_tutorial_code.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | import tensorflow as tf 16 | 17 | 18 | def average_gradients(tower_grads): 19 | """Calculate the average gradient for each shared variable across all towers. 20 | Note that this function provides a synchronization point across all towers. 21 | Args: 22 | tower_grads: List of lists of (gradient, variable) tuples. The outer list 23 | is over individual gradients. The inner list is over the gradient 24 | calculation for each tower. 25 | Returns: 26 | List of pairs of (gradient, variable) where the gradient has been averaged 27 | across all towers. 28 | """ 29 | average_grads = [] 30 | for grad_and_vars in zip(*tower_grads): 31 | # Note that each grad_and_vars looks like the following: 32 | # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN)) 33 | grads = [] 34 | i = 0 35 | for g, v in grad_and_vars: 36 | i += 1 37 | # Add 0 dimension to the gradients to represent the tower. 38 | expanded_g = tf.expand_dims(g, 0) 39 | 40 | # Append on a 'tower' dimension which we will average over below. 41 | grads.append(expanded_g) 42 | 43 | # Average over the 'tower' dimension. 
44 | grad = tf.concat(axis=0, values=grads) 45 | grad = tf.reduce_mean(grad, 0) 46 | 47 | # Keep in mind that the Variables are redundant because they are shared 48 | # across towers. So .. we will just return the first tower's pointer to 49 | # the Variable. 50 | v = grad_and_vars[0][1] 51 | grad_and_var = (grad, v) 52 | average_grads.append(grad_and_var) 53 | return average_grads 54 | -------------------------------------------------------------------------------- /src/preprocessing/README.md: -------------------------------------------------------------------------------- 1 | # Preprocessing Notes 2 | 1. Download latex files from KDD Cup 2003 dataset. 3 | 2. Parase the files and extract suitable latex math equations (formulas from now on) 4 | 3. Normalize the extracted formulas by parsing them using katex parser and then regenerating latex from the parse-tree. 5 | 4. Render formulas to jpeg images (using pdflatex) 6 | 5. Create a pandas dataframe mapping images to normalized formula 7 | 6. Clean formula text 8 | 7. Tokenize formulas and extract vocabulary. Clean vocabulary and bad formulas. 9 | 8. Analyze data and remove very large images and latex sequences (in order to limit image size) 10 | 9. ... 11 | -------------------------------------------------------------------------------- /src/tools/bulk_disp_alpha.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | """ 3 | This file executes disp_alpha.ipynb in bulk, once for each of the images below. It then exports the notebook into 4 | HTML in the current folder. Run it in a folder such as gallery/I2L-NOPOOL along with the companion disp_alpha.ipynb. 
5 | """ 6 | import sys, os 7 | from multiprocessing import Pool 8 | 9 | def do(image_name): 10 | def rmtail(s, t): 11 | return s.rsplit(t, 1)[0] 12 | 13 | # import nbformat 14 | # from nbconvert.preprocessors import ExecutePreprocessor 15 | 16 | os.environ['image_name'] = repr(image_name) 17 | os.putenv('image_name', image_name) 18 | 19 | # with open('disp_alpha.ipynb') as f: 20 | # nb = nbformat.read(f, as_version=4) 21 | # ep = ExecutePreprocessor(timeout=600) # kernel_name='python2') 22 | # ep.preprocess(nb) 23 | 24 | command = 'jupyter nbconvert --to HTML --output alpha_%s_gray --execute --ExecutePreprocessor.timeout=300 disp_alpha.ipynb'%rmtail(image_name, '.png') 25 | print('Executing command: %s'%command) 26 | os.system(command) 27 | 28 | if __name__ == '__main__': 29 | if 'image_name' in os.environ: 30 | print('Processing image %s'%os.environ['image_name']) 31 | exit(0) 32 | 33 | p = Pool(2) 34 | I2L_NOPOOL_50_MATCHED = [ 35 | u'6de537f98f51a70.png', 36 | u'e151d0cb6a1f4b8.png', 37 | u'125e0edbdc14c16.png', 38 | u'c1a595cf0e1b410.png', 39 | u'976c67c09595d48.png', 40 | u'48e151a0a2e1d66.png', 41 | u'eb4edff43972a77.png', 42 | u'f535e2d3ffd72a9.png', 43 | u'fbf3c74e173ede6.png', 44 | u'b727765af13988d.png', 45 | u'c236ef8f2d69db4.png', 46 | u'17806d8a43ed4d7.png', 47 | u'7bf25eec600c770.png', 48 | u'd67b0016af15368.png', 49 | u'beac5a98ad0bba3.png', 50 | u'6589b8b41dec5f5.png', 51 | u'c53968dbdf5e491.png', 52 | u'f7df71e09e679fa.png', 53 | u'88085cbe4db62f4.png', 54 | u'a4069d6109fdb32.png', 55 | u'7e7c82bcbbab14d.png', 56 | u'4cab7f4e7119975.png', 57 | u'ee3f8d415a17042.png', 58 | u'09c406611c97ca6.png', 59 | u'acc6b030ec1db54.png', 60 | u'5bde325cdc5c9fb.png', 61 | u'fc51f4f92be6b9e.png', 62 | u'831233abfc981bb.png', 63 | u'2af02fe9dda544b.png', 64 | u'dc311ef87140544.png', 65 | u'62d52e5875f15f2.png', 66 | u'082d6f67587ff53.png', 67 | u'be5020af1c11fb0.png', 68 | u'0ebe66af564fdea.png', 69 | u'8f17277609baf0d.png', 70 | u'cd7ee25bb44ee96.png', 71 | 
u'bde00b1efb71c8f.png', 72 | u'1637deef28fa753.png', 73 | u'ba84027cf12d913.png', 74 | u'ca7098dc8853675.png', 75 | u'5be77b312bfa0c1.png', 76 | u'9afabb69abb8665.png', 77 | u'e75a0c252c98431.png', 78 | u'05a32153f52b845.png', 79 | u'c450aeeee50eacb.png', 80 | u'8f249bcfcbd0d4a.png', 81 | u'c9908dd9001ae2a.png', 82 | u'8a7278fd1af0571.png', 83 | u'780ce6e35d2dfb2.png', 84 | u'04237c2640a6ef2.png'] 85 | 86 | I2L_STRIPS_50_MATCHED = [ 87 | u'4ef63353075e5b6.png', 88 | u'6de537f98f51a70.png', 89 | u'9dc9caeac24960d.png', 90 | u'125e0edbdc14c16.png', 91 | u'48e151a0a2e1d66.png', 92 | u'd4b25f217be4cca.png', 93 | u'7e7c82bcbbab14d.png', 94 | u'c236ef8f2d69db4.png', 95 | u'fbf3c74e173ede6.png', 96 | u'e8bd11a6b2feacf.png', 97 | u'b727765af13988d.png', 98 | u'7bf25eec600c770.png', 99 | u'21b2c45e268829b.png', 100 | u'f7df71e09e679fa.png', 101 | u'd67b0016af15368.png', 102 | u'6201fd941a8d4da.png', 103 | u'6f3d3d2ed89345d.png', 104 | u'beac5a98ad0bba3.png', 105 | u'136ca940c9932d4.png', 106 | u'cda328a07cba902.png', 107 | u'7fec9f1799b13ec.png', 108 | u'6147055797ca25d.png', 109 | u'938f5c3d05f5cf4.png', 110 | u'eebbeeddab4c0af.png', 111 | u'f16ea5d12d68b60.png', 112 | u'0734f11afe9aa90.png', 113 | u'186678817078727.png', 114 | u'2590ff270553f09.png', 115 | u'ee3f8d415a17042.png', 116 | u'4cab7f4e7119975.png', 117 | u'e99ef7e83d7b337.png', 118 | u'5bde325cdc5c9fb.png', 119 | u'a4d4967273292d2.png', 120 | u'23b08d245124d3c.png', 121 | u'a535502c45b16f6.png', 122 | u'8b27d32b2738fce.png', 123 | u'62d52e5875f15f2.png', 124 | u'acc6b030ec1db54.png', 125 | u'db4e9e9fba352e8.png', 126 | u'93cdbab1859dd05.png', 127 | u'dc311ef87140544.png', 128 | u'be5020af1c11fb0.png', 129 | u'831233abfc981bb.png', 130 | u'f8cbaf91c3c404f.png', 131 | u'c6d77ca7ad58ced.png', 132 | u'ca7098dc8853675.png', 133 | u'bde00b1efb71c8f.png', 134 | u'1637deef28fa753.png', 135 | u'e75a0c252c98431.png', 136 | u'05a32153f52b845.png'] 137 | 138 | p.map(do, set(I2L_NOPOOL_50_MATCHED+I2L_STRIPS_50_MATCHED)) 139 
| -------------------------------------------------------------------------------- /src/tools/disp_alpha.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Note: Copy this notebook into the gallery folder before running it." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 7, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import sys, os\n", 19 | "sys.path.extend(['../src/commons'])\n", 20 | "from pub_commons import DISP_ALPHA\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true, 29 | "scrolled": false, 30 | "slideshow": { 31 | "slide_type": "-" 32 | } 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "DISP_ALPHA(storedir='I2L-NOPOOL', graph='test', step=167526, cmap='gist_gray', image='bde00b1efb71c8f.png')" 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 2", 43 | "language": "python", 44 | "name": "python2" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 2 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython2", 56 | "version": "2.7.14" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /src/tools/sample_preds.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### This notebook is used for extracting and formatting data for publishing. Copy it into a gallery folder such as gallery/I2L-STRIPS before running it." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import sys\n", 19 | "sys.path.extend(['../../src/commons'])" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "from IPython.core.display import display, HTML\n", 31 | "display(HTML(\"\"))" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import os\n", 44 | "import re\n", 45 | "import codecs\n", 46 | "from IPython.display import display, Math, Latex\n", 47 | "from IPython.display import Image as ipImage\n", 48 | "from six.moves import cPickle as pickle\n", 49 | "import string\n", 50 | "from PIL import Image\n", 51 | "import numpy as np\n", 52 | "import h5py\n", 53 | "import matplotlib as mpl\n", 54 | "from matplotlib import pyplot as plt\n", 55 | "from mpl_toolkits.axes_grid1 import ImageGrid\n", 56 | "# Config the matplotlib backend as plotting inline in IPython\n", 57 | "%matplotlib inline" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "pd.options.display.max_rows = 120\n", 69 | "pd.options.display.max_colwidth = 600\n", 70 | "pd.options.display.expand_frame_repr = False\n", 71 | "pd.options.display.colheader_justify = 'left'" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "import data_commons as dtc\n", 83 | "import dl_commons as dlc\n", 84 | "import viz_commons as viz\n", 85 | "from viz_commons import VisualizeDir, DiffParams, VisualizeStep" 86 | ] 87 | }, 88 | { 89 | "cell_type": 
"markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Load results of test run" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "storedir = '.'\n", 104 | "clobber = True\n", 105 | "dump = True\n", 106 | "evaldir = os.path.join(storedir, 'eval_images')\n", 107 | "rendered_dir = os.path.join(evaldir, 'rendered_images')\n", 108 | "dumpdir = os.path.join(storedir, 'gallery_data')\n", 109 | "\n", 110 | "def chkclobber(path):\n", 111 | " assert clobber or (not os.path.exists(path)), \"Can't overwrite file %s when clobber==False\"%path\n", 112 | " return path\n", 113 | "\n", 114 | "def dump(df_, df_sample_, fname):\n", 115 | " if dump:\n", 116 | " with open(chkclobber(os.path.join(dumpdir, '%s_sample_table.txt'%fname)), 'w') as f:\n", 117 | " for row in df_sample_[['y','$\\hat{y}$']].itertuples(index=False):\n", 118 | " f.write(row[0] + ' & ' + row[1] + '\\n')\n", 119 | " df_.to_pickle(chkclobber(os.path.join(dumpdir, '%s_preds.pkl'%fname)))\n", 120 | " df_sample_.to_pickle(chkclobber(os.path.join(dumpdir, '%s_preds_sample.pkl'%fname)))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "vd = VisualizeDir(os.path.expanduser(storedir))\n", 132 | "last_step = vd.get_steps()[1][-1]\n", 133 | "print('last_step = %d' % last_step)\n", 134 | "vs = VisualizeStep(vd, 'test', last_step)\n", 135 | "df_preds = pd.read_pickle(os.path.join(evaldir, 'predictions_test_%d.pkl'%last_step))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": true, 143 | "scrolled": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "df_preds" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": true 155 
| }, 156 | "outputs": [], 157 | "source": [ 158 | "df_strs = vs.strs( 'y', 'predicted_ids', sortkey=None, mingle=False, trim=True, wrap_strs=True, keys=['image_name', 'ed'])\n", 159 | "df_strs.columns" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "### View and save the unmatched images" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true, 174 | "scrolled": true 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "with open(os.path.join(rendered_dir, 'unmatched_filenames.txt'), 'r') as f:\n", 179 | " unmatched = []; missing = []\n", 180 | " for fname in f:\n", 181 | " fname = os.path.basename(fname.strip())\n", 182 | " path = os.path.join(rendered_dir, 'images_pred', fname)\n", 183 | " if not os.path.exists(path):\n", 184 | " missing.append(fname)\n", 185 | " else:\n", 186 | " unmatched.append(fname)\n", 187 | "num_missing = len(missing)\n", 188 | "total = len(df_preds)\n", 189 | "print('%d(%.3f%%) files missing out of %d'%(num_missing, (num_missing*100.)/(total*1.0), total))\n", 190 | "df_bad = df_preds.loc[unmatched]\n", 191 | "\n", 192 | "def wrap_math(df_):\n", 193 | " \"\"\"Wrap the latex formulas with $ symbols.\"\"\"\n", 194 | " targets=[]; preds=[]; # image=[];\n", 195 | " for row in df_[['target_seq', 'pred_seq']].itertuples(index=True):\n", 196 | "# image.append(row[0])\n", 197 | " targets.append('$%s$'%row[1])\n", 198 | " preds.append('$%s$'%row[2])\n", 199 | " _df = df_.drop(['iloc'], axis=1).reset_index(drop=False).copy(deep=True)\n", 200 | " _df = _df.assign(y=targets, pred=preds)\n", 201 | " return _df.rename(columns={'pred':'$\\hat{y}$'})\n", 202 | "\n", 203 | "df_bad_sample_ = wrap_math(df_bad.sample(115))\n", 204 | "df_bad_sample_[['$\\hat{y}$', 'y']]" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | 
"source": [ 215 | "df_bad_sample_.columns" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "#### Filter MathJax Errors\n", 223 | "Though they rendered fine with pdflatex, MathJax has difficulty rendering some images. Therefore we will remove them for visualization purposes so that one may leverage pandas to generate pretty formatted formulas." 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": true, 231 | "scrolled": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "df_bad_sample = df_bad_sample_.drop([1,3,44,45,86,89,94,102,107,110,114]).iloc[:100].reset_index(drop=True)\n", 236 | "df_bad_sample[['$\\hat{y}$', 'y']]" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "dump(df_bad, df_bad_sample, 'unmatched')" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "df_matched = df_preds[~df_preds.index.isin(unmatched + missing)]\n", 259 | "df_matched.shape" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "df_matched[df_matched.ed==0.0].shape" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true, 278 | "scrolled": true 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "df_txt_matched = df_matched[df_matched.ed==0.0].sort_values(by='pred_len', ascending=False)\n", 283 | "df_txt_matched_sample_ = wrap_math(df_txt_matched[:100])\n", 284 | "df_txt_matched_sample_[['$\\hat{y}$', 'y']]" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | 
"metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "dump(df_txt_matched, df_txt_matched_sample_, 'txt_matched')" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "df_img_matched.columns" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true, 314 | "scrolled": true 315 | }, 316 | "outputs": [], 317 | "source": [ 318 | "df_img_matched = df_matched[df_matched.ed!=0.0].sort_values(by='pred_len', ascending=False)\n", 319 | "df_img_matched_sample_ = wrap_math(df_img_matched[:110])\n", 320 | "df_img_matched_sample_[['$\\hat{y}$', 'pred_len', 'y', 'target_len']]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": true, 328 | "scrolled": true 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "df_img_matched_sample = df_img_matched_sample_.drop([29, 60, 89, 104]).reset_index(drop=True).iloc[:100]\n", 333 | "df_img_matched_sample[['$\\hat{y}$', 'pred_len', 'y', 'target_len']]" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "dump(df_img_matched, df_img_matched_sample, 'img_matched')" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "# End" 352 | ] 353 | } 354 | ], 355 | "metadata": { 356 | "kernelspec": { 357 | "display_name": "Python 2", 358 | "language": "python", 359 | "name": "python2" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": 
"2.7.14" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 2 376 | } 377 | -------------------------------------------------------------------------------- /thirdparty/data/im2latex_formulas_downloaded.lst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/untrix/im2latex/3dd4572d2d3a52d5b5cf2ed302a06ee788015f09/thirdparty/data/im2latex_formulas_downloaded.lst -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Harvard NLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/Readme.md: -------------------------------------------------------------------------------- 1 | All code in this directory is taken from https://github.com/harvardnlp/im2markup under MIT license. The code is used for dataset preprocessing only. 2 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/evaluation/LevSeq.py: -------------------------------------------------------------------------------- 1 | from Levenshtein import * 2 | from warnings import warn 3 | 4 | class StringMatcher: 5 | """A SequenceMatcher-like class built on the top of Levenshtein""" 6 | 7 | def _reset_cache(self): 8 | self._ratio = self._distance = None 9 | self._opcodes = self._editops = self._matching_blocks = None 10 | 11 | def __init__(self, isjunk=None, seq1='', seq2=''): 12 | if isjunk: 13 | warn("isjunk not NOT implemented, it will be ignored") 14 | self._str1, self._str2 = seq1, seq2 15 | self._reset_cache() 16 | 17 | def set_seqs(self, seq1, seq2): 18 | self._str1, self._str2 = seq1, seq2 19 | self._reset_cache() 20 | 21 | def set_seq1(self, seq1): 22 | self._str1 = seq1 23 | self._reset_cache() 24 | 25 | def set_seq2(self, seq2): 26 | self._str2 = seq2 27 | self._reset_cache() 28 | 29 | def get_opcodes(self): 30 | if not self._opcodes: 31 | if self._editops: 32 | self._opcodes = opcodes(self._editops, self._str1, self._str2) 33 | else: 34 | self._opcodes = opcodes(self._str1, self._str2) 35 | return self._opcodes 36 | 37 | def get_editops(self): 38 | if not self._editops: 39 | if self._opcodes: 40 | self._editops = editops(self._opcodes, self._str1, self._str2) 41 | else: 42 | self._editops = editops(self._str1, self._str2) 43 | return self._editops 44 | 45 | def get_matching_blocks(self): 46 | if not self._matching_blocks: 47 | self._matching_blocks = matching_blocks(self.get_opcodes(), 48 | 
self._str1, self._str2) 49 | return self._matching_blocks 50 | 51 | def ratio(self): 52 | if not self._ratio: 53 | self._ratio = ratio(self._str1, self._str2) 54 | return self._ratio 55 | 56 | def quick_ratio(self): 57 | # This is usually quick enough :o) 58 | if not self._ratio: 59 | self._ratio = ratio(self._str1, self._str2) 60 | return self._ratio 61 | 62 | def real_quick_ratio(self): 63 | len1, len2 = len(self._str1), len(self._str2) 64 | return 2.0 * min(len1, len2) / (len1 + len2) 65 | 66 | def distance(self): 67 | if not self._distance: 68 | self._distance = distance(self._str1, self._str2) 69 | return self._distance 70 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/__init__.py: -------------------------------------------------------------------------------- 1 | "Utilities for comparing sequences" 2 | 3 | __all__ = ["hamming", "levenshtein", "nlevenshtein", "jaccard", "sorensen", 4 | "fast_comp", "lcsubstrings", "ilevenshtein", "ifast_comp"] 5 | 6 | try: 7 | from .cdistance import * 8 | except ImportError: 9 | from ._pyimports import * 10 | 11 | from ._pyimports import jaccard, sorensen 12 | 13 | def quick_levenshtein(str1, str2): 14 | return fast_comp(str1, str2, transpositions=False) 15 | 16 | def iquick_levenshtein(str1, strs): 17 | return ifast_comp(str1, str2, transpositions=False) 18 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_fastcomp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | def fast_comp(seq1, seq2, transpositions=False): 4 | """Compute the distance between the two sequences `seq1` and `seq2` up to a 5 | maximum of 2 included, and return it. If the edit distance between the two 6 | sequences is higher than that, -1 is returned. 
7 | 8 | If `transpositions` is `True`, transpositions will be taken into account for 9 | the computation of the distance. This can make a difference, e.g.: 10 | 11 | >>> fast_comp("abc", "bac", transpositions=False) 12 | 2 13 | >>> fast_comp("abc", "bac", transpositions=True) 14 | 1 15 | 16 | This is faster than `levenshtein` by an order of magnitude, but on the 17 | other hand is of limited use. 18 | 19 | The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`. 20 | I've added transpositions support to the original code. 21 | """ 22 | replace, insert, delete = "r", "i", "d" 23 | 24 | L1, L2 = len(seq1), len(seq2) 25 | if L1 < L2: 26 | L1, L2 = L2, L1 27 | seq1, seq2 = seq2, seq1 28 | 29 | ldiff = L1 - L2 30 | if ldiff == 0: 31 | models = (insert+delete, delete+insert, replace+replace) 32 | elif ldiff == 1: 33 | models = (delete+replace, replace+delete) 34 | elif ldiff == 2: 35 | models = (delete+delete,) 36 | else: 37 | return -1 38 | 39 | res = 3 40 | for model in models: 41 | i = j = c = 0 42 | while (i < L1) and (j < L2): 43 | if seq1[i] != seq2[j]: 44 | c = c+1 45 | if 2 < c: 46 | break 47 | 48 | if transpositions and ldiff != 2 \ 49 | and i < L1 - 1 and j < L2 - 1 \ 50 | and seq1[i+1] == seq2[j] and seq1[i] == seq2[j+1]: 51 | i, j = i+2, j+2 52 | else: 53 | cmd = model[c-1] 54 | if cmd == delete: 55 | i = i+1 56 | elif cmd == insert: 57 | j = j+1 58 | else: 59 | assert cmd == replace 60 | i,j = i+1, j+1 61 | else: 62 | i,j = i+1, j+1 63 | 64 | if 2 < c: 65 | continue 66 | elif i < L1: 67 | if L1-i <= model[c:].count(delete): 68 | c = c + (L1-i) 69 | else: 70 | continue 71 | elif j < L2: 72 | if L2-j <= model[c:].count(insert): 73 | c = c + (L2-j) 74 | else: 75 | continue 76 | 77 | if c < res: 78 | res = c 79 | 80 | if res == 3: 81 | res = -1 82 | return res 83 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/evaluation/distance/_iterators.py: 
def ilevenshtein(seq1, seqs, max_dist=-1):
    """Yield ``(distance, sequence)`` pairs for each sequence in `seqs`.

    `seq1`: the reference sequence.
    `seqs`: a series of candidate sequences (can be a generator).
    `max_dist`: when provided and > 0, candidates whose distance from the
    reference exceeds this bound are skipped (`levenshtein` signals such
    candidates by returning -1).

    In the C implementation the candidates are expected to be of the same
    kind as the reference sequence; the same holds for `ifast_comp`.
    """
    for candidate in seqs:
        d = levenshtein(seq1, candidate, max_dist=max_dist)
        if d == -1:
            continue
        yield d, candidate
def ifast_comp(seq1, seqs, transpositions=False):
    """Yield ``(distance, sequence)`` pairs for each sequence in `seqs` whose
    distance from `seq1` is at most 2; farther sequences are dropped
    (`fast_comp` returns -1 for them).

    `seq1`: the reference sequence.
    `seqs`: a series of candidate sequences (can be a generator).
    `transpositions` has the same meaning as in `fast_comp`.

    Call ``sorted()`` on the iterator to get results in a significant order:

        >>> g = ifast_comp("foo", ["fo", "bar", "foob", "foo", "foobaz"])
        >>> sorted(g)
        [(0, 'foo'), (1, 'fo'), (1, 'foob')]
    """
    for candidate in seqs:
        d = fast_comp(seq1, candidate, transpositions)
        if d == -1:
            continue
        yield d, candidate
def lcsubstrings(seq1, seq2, positions=False):
    """Find the longest common substring(s) of `seq1` and `seq2`.

    When `positions` is falsy (default) the substrings themselves are
    returned as a set. When truthy, only their start positions are returned
    together with their length, as the tuple

        (length, ((start in seq1, start in seq2), ...))

    Example:

        >>> lcsubstrings("sedentar", "dentist")
        {'dent'}
        >>> lcsubstrings("sedentar", "dentist", positions=True)
        (4, ((2, 0),))

    Classic dynamic program over a single rolling column: cell (i, j) holds
    the length of the common suffix of ``seq1[:i+1]`` and ``seq2[:j+1]``.
    """
    n1, n2 = len(seq1), len(seq2)
    best = []      # (i, j) end positions of the current longest matches
    best_len = 0   # length of the longest common substring seen so far
    diag = 0       # DP value of the upper-left (diagonal) neighbour

    # Keep seq1 as the longer sequence so the DP column stays short.
    if n1 < n2:
        seq1, seq2 = seq2, seq1
        n1, n2 = n2, n1

    col = array('L', range(n2))

    for i in range(n1):
        for j in range(n2):
            above = col[j]
            if seq1[i] != seq2[j]:
                col[j] = 0
            else:
                col[j] = 1 if (i == 0 or j == 0) else diag + 1
                if col[j] > best_len:
                    best_len = col[j]
                    best = [(i, j)]
                elif col[j] == best_len:
                    best.append((i, j))
            diag = above

    if positions:
        return (best_len, tuple((i - best_len + 1, j - best_len + 1)
                                for i, j in best))
    return set(seq1[i - best_len + 1:i + 1] for i, _ in best)
def levenshtein(seq1, seq2, max_dist=-1, normalized=False):
    """Absolute Levenshtein distance between `seq1` and `seq2`.

    The distance is the minimum number of deletions, insertions and
    substitutions needed to transform one sequence into the other.

    `max_dist`: when negative (default) the full distance is always
    computed; otherwise the computation aborts and -1 is returned as soon
    as the distance is known to exceed `max_dist`:

        >>> levenshtein("abc", "abcd", max_dist=1)   # dist = 1
        1
        >>> levenshtein("abc", "abcde", max_dist=1)  # dist = 2
        -1

    `normalized` is kept for backward compatibility only; it delegates to
    `nlevenshtein(..., method=1)`, which should be called directly instead.
    """
    if normalized:
        return nlevenshtein(seq1, seq2, method=1)

    if seq1 == seq2:
        return 0

    n1, n2 = len(seq1), len(seq2)
    # The distance is at least the length difference.
    if max_dist >= 0 and abs(n1 - n2) > max_dist:
        return -1
    if n1 == 0:
        return n2
    if n2 == 0:
        return n1
    # Keep seq2 as the shorter sequence so the DP row stays small.
    if n1 < n2:
        n1, n2 = n2, n1
        seq1, seq2 = seq2, seq1

    row = array('L', range(n2 + 1))

    for i in range(1, n1 + 1):
        row[0] = i
        diag = i - 1
        for j in range(1, n2 + 1):
            above = row[j]
            cost = int(seq1[i - 1] != seq2[j - 1])
            row[j] = min(above + 1, row[j - 1] + 1, diag + cost)
            diag = above
        # When every cell exceeds the cap, the final distance will too.
        if max_dist >= 0 and min(row) > max_dist:
            return -1

    if max_dist >= 0 and row[n2] > max_dist:
        # Stay consistent with the early-exit contract, even though the
        # exact distance happens to be known at this point.
        return -1
    return row[n2]
def nlevenshtein(seq1, seq2, method=1):
    """Normalized Levenshtein distance between `seq1` and `seq2`.

    The result is a float in [0, 1]: 0.0 for equal sequences, 1.0 when
    either sequence is empty, otherwise edit distance / factor, where the
    factor is selected by `method`:

        1: the length of the shortest alignment between the sequences
           (that is, the length of the longest sequence) -- cheap.
        2: the length of the longest alignment -- more costly, but accounts
           better for parallelisms of symbols between the sequences (see
           Heeringa, "Measuring Dialect Pronunciation Differences using
           Levenshtein Distance", 2004, p. 130 sq,
           http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf).

    Raises ValueError for any other `method` value.
    """
    if seq1 == seq2:
        return 0.0
    n1, n2 = len(seq1), len(seq2)
    if n1 == 0 or n2 == 0:
        return 1.0
    if n1 < n2:  # minimize the array sizes
        n1, n2 = n2, n1
        seq1, seq2 = seq2, seq1

    if method == 1:
        return levenshtein(seq1, seq2) / float(n1)
    if method != 2:
        raise ValueError("expected either 1 or 2 for `method` parameter")

    # Method 2: run the edit-distance DP while also tracking, per cell, the
    # length of the (longest) alignment realizing that minimal distance.
    dist = array('L', range(n2 + 1))
    alen = array('L', range(n2 + 1))

    for i in range(1, n1 + 1):
        dist[0] = alen[0] = i
        diag_d = diag_l = i - 1
        for j in range(1, n2 + 1):
            # -- distance --
            up_d = dist[j]
            ins = dist[j - 1] + 1
            dele = dist[j] + 1
            sub = diag_d + (seq1[i - 1] != seq2[j - 1])
            dist[j] = min(ins, dele, sub)
            diag_d = up_d
            # -- alignment length: longest among the moves that achieve the
            # minimal distance (non-achieving moves contribute 0) --
            up_l = alen[j]
            l_ins = alen[j - 1] + 1 if ins == dist[j] else 0
            l_del = alen[j] + 1 if dele == dist[j] else 0
            l_sub = diag_l + 1 if sub == dist[j] else 0
            alen[j] = max(l_del, l_ins, l_sub)
            diag_l = up_l

    return dist[n2] / float(alen[n2])
def hamming(seq1, seq2, normalized=False):
    """Hamming distance between two equal-length ordered sequences.

    Counts the positions at which the sequences differ. A ValueError is
    raised when the lengths differ.

    With `normalized` falsy the raw integer count (0..len) is returned;
    with it truthy, a float in [0, 1] computed as

        0.0                      if len(seq1) == 0
        hamming_dist / len(seq1) otherwise
    """
    n = len(seq1)
    if len(seq2) != n:
        raise ValueError("expected two strings of the same length")
    if n == 0:
        # Two empty sequences are trivially equal.
        return 0.0 if normalized else 0
    mismatches = sum(a != b for a, b in zip(seq1, seq2))
    if not normalized:
        return mismatches
    return mismatches / float(n)
def jaccard(seq1, seq2):
    """Jaccard distance between `seq1` and `seq2`.

    The sequences should contain hashable items. Returns a float in
    [0, 1] -- 0 means the item sets are equal, 1 totally different --
    computed as 1 - |intersection| / |union|.
    """
    set1, set2 = set(seq1), set(seq2)
    common = len(set1 & set2)
    return 1 - common / float(len(set1 | set2))


def sorensen(seq1, seq2):
    """Sorensen distance between `seq1` and `seq2`.

    The sequences should contain hashable items. Returns a float in
    [0, 1] -- 0 means the item sets are equal, 1 totally different --
    computed as 1 - 2*|intersection| / (|set1| + |set2|).
    """
    set1, set2 = set(seq1), set(seq2)
    common = len(set1 & set2)
    return 1 - (2 * common / float(len(set1) + len(set2)))
def process_args(args):
    """Parse command-line arguments for the BLEU evaluation script."""
    parser = argparse.ArgumentParser(description='Evaluate BLEU score')
    parser.add_argument('--result-path', dest='result_path',
                        type=str, required=True,
                        help=('Result file containing per line. This should be set to the output file of the model.'
                        ))
    parser.add_argument('--data-path', dest='data_path',
                        type=str, required=True,
                        help=('Input file which contains the samples to be evaluated. The format is per line.'
                        ))
    parser.add_argument('--label-path', dest='label_path',
                        type=str, required=True,
                        help=('Gold label file which contains a tokenized formula per line.'
                        ))
    parser.add_argument('--log-path', dest="log_path",
                        type=str, default='log.txt',
                        help=('Log file path, default=log.txt'
                        ))
    parameters = parser.parse_args(args)
    return parameters

def main(args):
    """Evaluate BLEU of model predictions against gold formulas.

    Reads gold formulas (--label-path) and the image->formula-index mapping
    (--data-path), collects per-image predictions from --result-path (valid
    lines have 5 tab-separated fields), writes aligned temporary prediction
    and gold files, then shells out to multi-bleu.perl and logs its output.
    """
    script_path = os.path.realpath(__file__)
    script_dir = os.path.dirname(script_path)
    app_dir = os.path.join(script_dir, '../..')  # kept for parity; unused below

    parameters = process_args(args)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
        filename=parameters.log_path)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    logging.info('Script being executed: %s'%__file__)

    label_path = parameters.label_path
    data_path = parameters.data_path
    result_path = parameters.result_path
    assert os.path.exists(label_path), 'Label file %s not found'%label_path
    assert os.path.exists(data_path), 'Data file %s not found'%data_path
    assert os.path.exists(result_path), 'Result file %s not found'%result_path

    # Map image path -> gold formula via the line index in the data file.
    labels_tmp = {}
    labels = {}
    with open(label_path) as flabel:
        with open(data_path) as fdata:
            line_idx = 0
            for line in flabel:
                labels_tmp[line_idx] = line.strip()
                line_idx += 1
            for line in fdata:
                img_path, idx = line.strip().split()
                labels[img_path] = labels_tmp[int(idx)]

    # Collect predictions; malformed lines (not 5 fields) are skipped.
    results = {}
    with open(result_path) as fin:
        for line_idx,line in enumerate(fin):
            if line_idx % 1000 == 0:
                print (line_idx)
            items = line.strip().split('\t')
            if len(items) == 5:
                img_path, label_gold, label_pred, score_pred, score_gold = items
                if not img_path in labels:
                    logging.warning('%s in result file while not in the gold file!'%img_path)
                results[img_path] = label_pred+'\n'

    # Write aligned prediction/gold files. Fix: use context managers so the
    # buffers are always flushed and closed before multi-bleu.perl reads the
    # files (the original left both handles open if an exception occurred
    # between open() and close()).
    with open('.tmp.pred.txt', 'w') as fpred, open('.tmp.gold.txt', 'w') as fgold:
        for img_path in labels:
            fpred.write(results.setdefault(img_path, '\n'))
            fgold.write(labels[img_path]+'\n')
    # NOTE(review): shell=True is safe with these fixed file names, but would
    # be a command-injection risk if the paths ever became user-controlled.
    metric = subprocess.check_output('perl third_party/multi-bleu.perl %s < %s'%('.tmp.gold.txt', '.tmp.pred.txt'), shell=True)
    #os.remove('.tmp.pred.txt')
    #os.remove('.tmp.gold.txt')
    logging.info(metric)
def main(args):
    """Compute token-level edit-distance accuracy over a result file.

    Each valid line of --result-path has 5 tab-separated fields
    (img_path, gold label, predicted label, pred score, gold score).
    Accuracy is 1 - total_edit_distance / total_ref, where ref is the
    larger token count of each gold/pred pair.
    """
    parameters = process_args(args)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
        filename=parameters.log_path)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    logging.info('Script being executed: %s'%__file__)

    result_file = parameters.result_path
    total_ref = 0
    total_edit_distance = 0
    with open(result_file) as fin:
        for idx,line in enumerate(fin):
            if idx % 100 == 0:
                print (idx)
            items = line.strip().split('\t')
            if len(items) == 5:
                img_path, label_gold, label_pred, score_pred, score_gold = items
                tokens_pred = label_pred.strip().split(' ')
                tokens_gold = label_gold.strip().split(' ')
                ref = max(len(tokens_gold), len(tokens_pred))
                edit_distance = distance.levenshtein(tokens_gold, tokens_pred)
                total_ref += ref
                total_edit_distance += edit_distance
    # Fix: guard against ZeroDivisionError when the result file contains no
    # valid 5-field lines (total_ref stays 0).
    if total_ref > 0:
        logging.info('Edit Distance Accuracy: %f'%(1.-float(total_edit_distance)/total_ref))
    else:
        logging.warning('No valid result lines found in %s'%result_file)
def process_args(args):
    """Parse command-line arguments for the HTML rendering script."""
    parser = argparse.ArgumentParser(description='Render HTML files for comparison. Note that we render both the predicted results, and the original HTMLs.')

    # Required inputs/outputs.
    parser.add_argument('--result-path', dest='result_path', type=str,
                        required=True,
                        help=('Result file containing per line. This should be set to the output file of the model.'))
    parser.add_argument('--output-dir', dest='output_dir', type=str,
                        required=True,
                        help=('Output directory to put the rendered images. A subfolder with name "images_gold" will be created for the rendered gold images, and a subfolder with name "images_pred" will be created for the rendered predictions.'))

    # --replace / --no-replace toggle a single boolean, defaulting to False.
    parser.add_argument('--replace', dest='replace', action='store_true',
                        help=('Replace flag, if set to false, will ignore the already existing images.'))
    parser.add_argument('--no-replace', dest='replace', action='store_false')
    parser.set_defaults(replace=False)

    parser.add_argument('--num-threads', dest='num_threads', type=int,
                        default=4,
                        help=('Number of threads, default=4.'))
    parser.add_argument('--log-path', dest="log_path", type=str,
                        default='log.txt',
                        help=('Log file path, default=log.txt'))
    return parser.parse_args(args)
def main(args):
    """Render predicted and gold HTML snippets to PNG images.

    Parses the result file (valid lines have 5 tab-separated fields) and
    dispatches one rendering job per prediction and per gold label to a
    thread pool running `main_parallel`.
    """
    opts = process_args(args)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
        filename=opts.log_path)

    # Mirror the log to stderr as well.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s'))
    logging.getLogger('').addHandler(console)

    logging.info('Script being executed: %s'%__file__)

    result_path = opts.result_path
    output_dir = opts.output_dir
    assert os.path.exists(result_path), result_path

    pred_dir = os.path.join(output_dir, 'images_pred')
    gold_dir = os.path.join(output_dir, 'images_gold')
    for target_dir in (pred_dir, gold_dir):
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

    jobs = []
    with open(result_path) as fin:
        for line in fin:
            fields = line.strip().split('\t')
            if len(fields) != 5:
                continue
            img_path, label_gold, label_pred, score_pred, score_gold = fields
            # Strip a fixed 9-character suffix from the image path
            # (presumably '-full.png' -- TODO confirm against the data files).
            img_idx = img_path[:-9]
            jobs.append((label_pred, img_idx, pred_dir, opts.replace))
            jobs.append((label_gold, img_idx, gold_dir, opts.replace))

    logging.info('Creating pool with %d threads'%opts.num_threads)
    pool = ThreadPool(opts.num_threads)
    logging.info('Jobs running...')
    pool.map(main_parallel, jobs)
    pool.close()
    pool.join()
def main_parallel(l):
    """Render one HTML label to PNG via webkit2png.

    `l` is a tuple (label html, image index, output directory, replace
    flag). When `replace` is false and the rendered PNG already exists,
    the job is skipped.
    """
    label, img_idx, dirname, replace = l
    target = '%s/%s-full.png'%(dirname, img_idx)
    if not replace and os.path.exists(target):
        return
    html_name = '%s_%s.html'%(dirname, img_idx)
    with open(html_name, 'w') as fout:
        fout.write(label)
    # W and H are the module-level clip dimensions.
    os.system('webkit2png --clipwidth=1 --clipheight=1 -Fs 1 -W %d -H %d %s -o %s/%s'%(W,H,html_name,dirname,img_idx))
    os.remove(html_name)
def process_args(args):
    """Parse command-line arguments for the LaTeX rendering script."""
    parser = argparse.ArgumentParser(description='Render latex formulas for comparison. Note that we need to render both the predicted results, and the original formulas, since we need to make sure the same environment of rendering is used.')

    # Required inputs/outputs.
    parser.add_argument('--result-path', dest='result_path', type=str,
                        required=True,
                        help=('Result file containing per line. This should be set to the output file of the model.'))
    parser.add_argument('--data-path', dest='data_path', type=str,
                        required=True,
                        help=('Input file which contains the samples to be evaluated. The format is per line.'))
    parser.add_argument('--label-path', dest='label_path', type=str,
                        required=True,
                        help=('Gold label file which contains a formula per line. Note that this does not necessarily need to be tokenized, and for comparing against the gold standard, the original (un-preprocessed) label file shall be used.'))
    parser.add_argument('--output-dir', dest='output_dir', type=str,
                        required=True,
                        help=('Output directory to put the rendered images. A subfolder with name "images_gold" will be created for the rendered gold images, and a subfolder with name "images_pred" will be created for the rendered predictions.'))

    # --replace / --no-replace toggle a single boolean, defaulting to False.
    parser.add_argument('--replace', dest='replace', action='store_true',
                        help=('Replace flag, if set to false, will ignore the already existing images.'))
    parser.add_argument('--no-replace', dest='replace', action='store_false')
    parser.set_defaults(replace=False)

    parser.add_argument('--num-threads', dest='num_threads', type=int,
                        default=4,
                        help=('Number of threads, default=4.'))
    parser.add_argument('--log-path', dest="log_path", type=str,
                        default='log.txt',
                        help=('Log file path, default=log.txt'))
    return parser.parse_args(args)
def main(args):
    """Render gold and predicted LaTeX formulas to images.

    Builds one rendering job per gold formula (indexed through the data
    file) and one per prediction (from the result file), then runs
    `main_parallel` over them in a thread pool.
    """
    parameters = process_args(args)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
        filename=parameters.log_path)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    logging.info('Script being executed: %s'%__file__)

    result_path = parameters.result_path
    data_path = parameters.data_path
    label_path = parameters.label_path
    output_dir = parameters.output_dir
    assert os.path.exists(label_path), label_path
    assert os.path.exists(result_path), result_path
    assert os.path.exists(data_path), data_path

    pred_dir = os.path.join(output_dir, 'images_pred')
    gold_dir = os.path.join(output_dir, 'images_gold')
    for dirname in [pred_dir, gold_dir]:
        if not os.path.exists(dirname):
            os.makedirs(dirname)

    # Fix: read the formulas with a context manager instead of leaking the
    # file handle from a bare open(...).readlines().
    with open(label_path) as flabel:
        formulas = flabel.readlines()

    lines = []
    with open(data_path) as fin:
        for line in fin:
            img_path, line_idx = line.strip().split()
            lines.append((img_path, formulas[int(line_idx)], os.path.join(gold_dir, img_path), parameters.replace))
    with open(result_path) as fin:
        for line in fin:
            img_path, label_gold, label_pred, _, _ = line.strip().split('\t')
            lines.append((img_path, label_pred, os.path.join(pred_dir, img_path), parameters.replace))

    logging.info('Creating pool with %d threads'%parameters.num_threads)
    pool = ThreadPool(parameters.num_threads)
    logging.info('Jobs running...')
    pool.map(main_parallel, lines)
    pool.close()
    pool.join()
def output_err(output_path, i, reason, img):
    """Log a rendering failure for image `img` with a free-form reason."""
    logging.info('ERROR: %s %s\n'%(img,reason))

def main_parallel(line):
    """Render one LaTeX formula to a cropped PNG at `output_path`.

    `line` is a tuple (img_path, formula, output_path, replace flag).
    The formula is normalized (\\pmatrix/\\matrix wrappers, comment
    stripping, hspace/vspace despacing), compiled with pdflatex using the
    module-level `template`, converted to PNG, and cropped.
    """
    img_path, l, output_path, replace = line
    # Derive a filesystem-safe prefix for the intermediate tex/log/aux files.
    pre_name = output_path.replace('/', '_').replace('.','_')
    l = l.strip()
    l = l.replace(r'\pmatrix', r'\mypmatrix')
    l = l.replace(r'\matrix', r'\mymatrix')
    # remove leading comments
    l = l.strip('%')
    if len(l) == 0:
        l = '\\hspace{1cm}'
    # \hspace {1 . 5 cm} -> \hspace {1.5cm}
    for space in ["hspace", "vspace"]:
        match = re.finditer(space + " {(.*?)}", l)
        # NOTE(review): finditer returns an iterator, so this `if` is always
        # true; harmless, because with no matches the rebuild reproduces `l`.
        if match:
            new_l = ""
            last = 0
            for m in match:
                new_l = new_l + l[last:m.start(1)] + m.group(1).replace(" ", "")
                last = m.end(1)
            new_l = new_l + l[last:]
            l = new_l
    if replace or (not os.path.exists(output_path)):
        tex_filename = pre_name+'.tex'
        log_filename = pre_name+'.log'
        aux_filename = pre_name+'.aux'
        with open(tex_filename, "w") as w:
            # Fix: was the Python 2 statement `print >> w, (template%l)`,
            # which is a syntax error under Python 3 (the rest of this repo
            # uses Python 3 prints). Write the same bytes explicitly.
            w.write((template % l) + '\n')
        run("pdflatex -interaction=nonstopmode %s >/dev/null"%tex_filename, TIMEOUT)
        os.remove(tex_filename)
        os.remove(log_filename)
        os.remove(aux_filename)
        pdf_filename = tex_filename[:-4]+'.pdf'
        png_filename = tex_filename[:-4]+'.png'
        if not os.path.exists(pdf_filename):
            output_err(output_path, 0, 'cannot compile', img_path)
        else:
            os.system("convert -density 200 -quality 100 %s %s"%(pdf_filename, png_filename))
            os.remove(pdf_filename)
            if os.path.exists(png_filename):
                crop_image(png_filename, output_path)
                os.remove(png_filename)
def process_args(args):
    """Parse command-line arguments for vocabulary generation."""
    parser = argparse.ArgumentParser(description='Generate vocabulary file.')

    parser.add_argument('--data-path', dest='data_path',
                        type=str, required=True,
                        help=('Input file containing per line. This should be the file used for training.'
                        ))
    parser.add_argument('--label-path', dest='label_path',
                        type=str, required=True,
                        help=('Input file containing a tokenized formula per line.'
                        ))
    parser.add_argument('--output-file', dest='output_file',
                        type=str, required=True,
                        help=('Output file for putting vocabulary.'
                        ))
    parser.add_argument('--unk-threshold', dest='unk_threshold',
                        type=int, default=1,
                        help=('If the number of occurences of a token is less than (including) the threshold, then it will be excluded from the generated vocabulary.'
                        ))
    parser.add_argument('--log-path', dest="log_path",
                        type=str, default='log.txt',
                        help=('Log file path, default=log.txt'
                        ))
    parameters = parser.parse_args(args)
    return parameters

def main(args):
    """Build a token vocabulary from the formulas referenced by a data file.

    Each data line is '<img> <formula line index>'; the referenced formulas
    are whitespace-tokenized, token occurrences are counted, and tokens
    occurring more often than --unk-threshold are written (sorted, one per
    line) to --output-file. The rest are logged as UNK tokens.
    """
    parameters = process_args(args)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s',
        filename=parameters.log_path)

    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    logging.info('Script being executed: %s'%__file__)

    label_path = parameters.label_path
    assert os.path.exists(label_path), label_path
    data_path = parameters.data_path
    assert os.path.exists(data_path), data_path

    # Fix: read the formulas with a context manager instead of leaking the
    # handle from a bare open(...).readlines().
    with open(label_path) as flabel:
        formulas = flabel.readlines()

    # Count token occurrences over the formulas referenced by the data file.
    # (The original also built an unused `tokens_out` list and tracked an
    # unused `max_len`; both removed.)
    vocab = {}
    with open(data_path) as fin:
        for line in fin:
            _, line_idx = line.strip().split()
            tokens = formulas[int(line_idx)].strip().split()
            for token in tokens:
                vocab[token] = vocab.get(token, 0) + 1

    vocab_out = []
    unk_tokens = []
    for word in sorted(vocab.keys()):
        if vocab[word] > parameters.unk_threshold:
            vocab_out.append(word)
        else:
            unk_tokens.append(word)

    with open(parameters.output_file, 'w') as fout:
        fout.write('\n'.join(vocab_out))
    logging.info('#UNK\'s: %d'%len(unk_tokens))
    logging.info('UNK tokens:\n%s', unk_tokens)
def process_args(args):
    """Parse command-line arguments for the im2latex-100k filtering step."""
    parser = argparse.ArgumentParser(description='Process im2latex-100k train, test, development files ( ) for formatting files such that can be used for training. ( >). Additionaly, if flag is set, large images, too long formulas and formulas that cannot be parsed will be discarded.')

    parser.add_argument('--image-dir', dest='image_dir', type=str,
                        default='',
                        help=('Directory containing processed images.'))
    parser.add_argument('--data-path', dest='data_path', type=str,
                        required=True,
                        help=('Input file path containing per line. Note that does not contain postfix.'))
    parser.add_argument('--output-path', dest='output_path', type=str,
                        required=True,
                        help=('Output file path containing per line. Note that does contain postfix. If filter flag is set, then the output file may have less lines than original file.'))
    parser.add_argument('--label-path', dest='label_path', type=str,
                        default='',
                        help=('Input label path containing per line. This is required if filter flag is set, and data point with blank formulas will be discarded.'))

    # --filter / --no-filter toggle a single boolean, defaulting to False.
    parser.add_argument('--filter', dest='filter', action='store_true',
                        help=('Filter flag, if set, then too large images, formulas that cannot be parsed or have too many tokens will be discarded.'))
    parser.add_argument('--no-filter', dest='filter', action='store_false')
    parser.set_defaults(filter=False)

    # Size/length limits applied only when filtering is enabled.
    parser.add_argument('--max-width', dest='max_width', type=int,
                        default=500,
                        help=('If filter flag is set, images with width than max-width will be discarded in the output file.'))
    parser.add_argument('--max-height', dest='max_height', type=int,
                        default=160,
                        help=('If filter flag is set, images with larger height than max-width will be discarded in the output file.'))
    parser.add_argument('--max-tokens', dest='max_tokens', type=int,
                        default=150,
                        help=('If filter flag is set, formulas with more than max-tokens tokens will be discarded in the output file.'))

    parser.add_argument('--log-path', dest="log_path", type=str,
                        default='log.txt',
                        help=('Log file path, default=log.txt'))
    parser.add_argument('--postfix', dest='postfix', type=str,
                        default='.png',
                        help=('The format of images, default=".png".'))
    return parser.parse_args(args)
51 | )) 52 | parameters = parser.parse_args(args) 53 | return parameters 54 | 55 | def main(args): 56 | parameters = process_args(args) 57 | logging.basicConfig( 58 | level=logging.INFO, 59 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s', 60 | filename=parameters.log_path) 61 | 62 | console = logging.StreamHandler() 63 | console.setLevel(logging.INFO) 64 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s') 65 | console.setFormatter(formatter) 66 | logging.getLogger('').addHandler(console) 67 | 68 | logging.info('Script being executed: %s'%__file__) 69 | data_path = parameters.data_path 70 | output_path = parameters.output_path 71 | image_dir = parameters.image_dir 72 | 73 | num_discard = 0 74 | num_nonexist = 0 75 | 76 | if parameters.filter: 77 | assert os.path.isfile(parameters.label_path), parameters.label_path 78 | labels = open(parameters.label_path).readlines() 79 | with open(output_path, 'w') as fout: 80 | with open(data_path, 'r') as fdata: 81 | for line in fdata: 82 | line_strip = line.strip() 83 | if len(line_strip) > 0: 84 | line_idx, img_path, mod = line_strip.split() 85 | img_path = os.path.join(image_dir, img_path) + parameters.postfix 86 | if parameters.filter: 87 | if not os.path.exists(img_path): 88 | logging.warning('%s does not exist!'%os.path.basename(img_path)) 89 | num_nonexist += 1 90 | continue 91 | old_im = Image.open(img_path) 92 | old_size = old_im.size 93 | w = old_size[0] 94 | h = old_size[1] 95 | else: 96 | w = 0 97 | h = 0 98 | if (not parameters.filter) or (w <= parameters.max_width and h <= parameters.max_height): 99 | if parameters.filter: 100 | label = labels[int(line_idx)] 101 | if len(label.strip()) == 0: 102 | logging.info('%s discarded due to cannot-be-parsed formula!'%os.path.basename(img_path)) 103 | continue 104 | if len(label.strip().split()) > parameters.max_tokens: 105 | logging.info('%s discarded due to too many tokens!'%os.path.basename(img_path)) 106 | continue 107 
| fout.write('%s %s\n'%(os.path.basename(img_path),line_idx)) 108 | else: 109 | logging.info('%s discarded due to large image size!'%os.path.basename(img_path)) 110 | num_discard += 1 111 | logging.info('%d discarded. %d not found in %s.'%(num_discard, num_nonexist, image_dir)) 112 | 113 | 114 | if __name__ == '__main__': 115 | main(sys.argv[1:]) 116 | logging.info('Jobs finished') 117 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/preprocessing/preprocess_formulas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # tokenize latex formulas 3 | import sys, os, argparse, logging, subprocess, shutil 4 | 5 | def is_ascii(str): 6 | try: 7 | str.decode('ascii') 8 | return True 9 | except UnicodeError: 10 | return False 11 | 12 | def process_args(args): 13 | parser = argparse.ArgumentParser(description='Preprocess (tokenize or normalize) latex formulas') 14 | 15 | parser.add_argument('--mode', dest='mode', 16 | choices=['tokenize', 'normalize'], required=True, 17 | help=('Tokenize (split to tokens seperated by space) or normalize (further translate to an equivalent standard form).' 18 | )) 19 | parser.add_argument('--input-file', dest='input_file', 20 | type=str, required=True, 21 | help=('Input file containing latex formulas. One formula per line.' 22 | )) 23 | parser.add_argument('--output-file', dest='output_file', 24 | type=str, required=True, 25 | help=('Output file.' 26 | )) 27 | parser.add_argument('--num-threads', dest='num_threads', 28 | type=int, default=4, 29 | help=('Number of threads, default=4.' 
30 | )) 31 | parser.add_argument('--log-path', dest="log_path", 32 | type=str, default='log.txt', 33 | help=('Log file path, default=log.txt' 34 | )) 35 | parameters = parser.parse_args(args) 36 | return parameters 37 | 38 | def main(args): 39 | parameters = process_args(args) 40 | logging.basicConfig( 41 | level=logging.INFO, 42 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s', 43 | filename=parameters.log_path) 44 | 45 | console = logging.StreamHandler() 46 | console.setLevel(logging.INFO) 47 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s') 48 | console.setFormatter(formatter) 49 | logging.getLogger('').addHandler(console) 50 | 51 | logging.info('Script being executed: %s'%__file__) 52 | 53 | input_file = parameters.input_file 54 | output_file = parameters.output_file 55 | 56 | assert os.path.exists(input_file), input_file 57 | cmd = "perl -pe 's|hskip(.*?)(cm\\|in\\|pt\\|mm\\|em)|hspace{\\1\\2}|g' %s > %s"%(input_file, output_file) 58 | ret = subprocess.call(cmd, shell=True) 59 | if ret != 0: 60 | logging.error('FAILED: %s'%cmd) 61 | 62 | temp_file = output_file + '.tmp' 63 | with open(temp_file, 'w') as fout: 64 | with open(output_file) as fin: 65 | for line in fin: 66 | fout.write(line.replace('\r', ' ').strip() + '\n') # delete \r 67 | 68 | cmd = "cat %s | node scripts/preprocessing/preprocess_latex.js %s > %s "%(temp_file, parameters.mode, output_file) 69 | ret = subprocess.call(cmd, shell=True) 70 | os.remove(temp_file) 71 | if ret != 0: 72 | logging.error('FAILED: %s'%cmd) 73 | temp_file = output_file + '.tmp' 74 | shutil.move(output_file, temp_file) 75 | with open(temp_file) as fin: 76 | with open(output_file, 'w') as fout: 77 | for line in fin: 78 | tokens = line.strip().split() 79 | tokens_out = [] 80 | for token in tokens: 81 | if is_ascii(token): 82 | tokens_out.append(token) 83 | fout.write(' '.join(tokens_out)+'\n') 84 | os.remove(temp_file) 85 | 86 | if __name__ == '__main__': 87 | 
main(sys.argv[1:]) 88 | logging.info('Jobs finished') 89 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/preprocessing/preprocess_images.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Preprocess images for ease of training 3 | import sys, os, argparse, json, glob, logging 4 | import numpy as np 5 | from PIL import Image 6 | sys.path.insert(0, '%s'%os.path.join(os.path.dirname(__file__), '../utils/')) 7 | from image_utils import * 8 | from multiprocessing import Pool 9 | from multiprocessing.dummy import Pool as ThreadPool 10 | 11 | def process_args(args): 12 | parser = argparse.ArgumentParser(description='Process images for ease of training. Crop images to get rid of the background. For a cropped image of size (w,h), we pad it with PAD_TOP, PAD_BOTTOM, PAD_LEFT, PAD_RIGHT, and the result is of size (w+PAD_LEFT+PAD_RIGHT, h+PAD_TOP+PAD_BOTTOM. Then we see which bucket it falls into and pad them with whitespace to match the smallest bucket that can hold it. Finally, downsample images.') 13 | 14 | parser.add_argument('--input-dir', dest='input_dir', 15 | type=str, required=True, 16 | help=('Input directory containing orginal images.' 17 | )) 18 | parser.add_argument('--output-dir', dest='output_dir', 19 | type=str, required=True, 20 | help=('Output directory to put processed images.' 21 | )) 22 | parser.add_argument('--num-threads', dest='num_threads', 23 | type=int, default=4, 24 | help=('Number of threads, default=4.' 25 | )) 26 | parser.add_argument('--crop-blank-default-size', dest='crop_blank_default_size', 27 | type=str, default='[600,60]', 28 | help=('If an image is blank, this is the size of the cropped image, should be a Json string. Default=(600,60).' 
29 | )) 30 | parser.add_argument('--pad-size', dest='pad_size', 31 | type=str, default='[8,8,8,8]', 32 | help=('We pad the cropped image to the top, left, bottom, right with whitespace of size PAD_TOP, PAD_LEFT, PAD_BOTTOM, PAD_RIGHT, should be a Json string. Default=(8,8,8,8).' 33 | )) 34 | parser.add_argument('--buckets', dest='buckets', 35 | type=str, default='[[240, 100], [320, 80], [400, 80], [400, 100], [480, 80], [480, 100], [560, 80], [560, 100], [640, 80], [640, 100], [720, 80], [720, 100], [720, 120], [720, 200], [800, 100], [800, 320], [1000, 200], [1000, 400], [1200, 200], [1600, 200], [1600, 1600]]', 36 | help=('Bucket sizes used for grouping. Should be a Json string. Note that this denotes the bucket size after padding and before downsampling.' 37 | )) 38 | parser.add_argument('--downsample-ratio', dest='downsample_ratio', 39 | type=float, default=2., 40 | help=('The ratio of downsampling, default=2.0.' 41 | )) 42 | parser.add_argument('--log-path', dest="log_path", 43 | type=str, default='log.txt', 44 | help=('Log file path, default=log.txt' 45 | )) 46 | parser.add_argument('--postfix', dest='postfix', 47 | type=str, default='.png', 48 | help=('The format of images, default=".png".' 
49 | )) 50 | parameters = parser.parse_args(args) 51 | return parameters 52 | 53 | def main_parallel(l): 54 | filename, postfix, output_filename, crop_blank_default_size, pad_size, buckets, downsample_ratio = l 55 | postfix_length = len(postfix) 56 | status = crop_image(filename, output_filename, crop_blank_default_size) 57 | if not status: 58 | logging.info('%s is blank, crop a white image of default size!'%filename) 59 | status = pad_group_image(output_filename, output_filename, pad_size, buckets) 60 | if not status: 61 | logging.info('%s (after cropping and padding) is larger than the largest provided bucket size, left unchanged!'%filename) 62 | status = downsample_image(output_filename, output_filename, downsample_ratio) 63 | 64 | def main(args): 65 | parameters = process_args(args) 66 | logging.basicConfig( 67 | level=logging.INFO, 68 | format='%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s', 69 | filename=parameters.log_path) 70 | 71 | console = logging.StreamHandler() 72 | console.setLevel(logging.INFO) 73 | formatter = logging.Formatter('%(asctime)-15s %(name)-5s %(levelname)-8s %(message)s') 74 | console.setFormatter(formatter) 75 | logging.getLogger('').addHandler(console) 76 | 77 | logging.info('Script being executed: %s'%__file__) 78 | 79 | output_dir = parameters.output_dir 80 | if not os.path.exists(output_dir): 81 | os.makedirs(output_dir) 82 | 83 | input_dir = parameters.input_dir 84 | postfix = parameters.postfix 85 | crop_blank_default_size = json.loads(parameters.crop_blank_default_size) 86 | pad_size = json.loads(parameters.pad_size) 87 | buckets = json.loads(parameters.buckets) 88 | downsample_ratio = parameters.downsample_ratio 89 | 90 | filenames = glob.glob(os.path.join(input_dir, '*'+postfix)) 91 | logging.info('Creating pool with %d threads'%parameters.num_threads) 92 | pool = ThreadPool(parameters.num_threads) 93 | logging.info('Jobs running...') 94 | results = pool.map(main_parallel, [(filename, postfix, os.path.join(output_dir, 
os.path.basename(filename)), crop_blank_default_size, pad_size, buckets, downsample_ratio) for filename in filenames]) 95 | pool.close() 96 | pool.join() 97 | 98 | if __name__ == '__main__': 99 | main(sys.argv[1:]) 100 | logging.info('Jobs finished') 101 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/utils/image_utils.py: -------------------------------------------------------------------------------- 1 | import PIL 2 | from PIL import Image 3 | import numpy as np 4 | 5 | def crop_image(img, output_path, default_size=None): 6 | old_im = Image.open(img).convert('L') 7 | img_data = np.asarray(old_im, dtype=np.uint8) # height, width 8 | nnz_inds = np.where(img_data!=255) 9 | if len(nnz_inds[0]) == 0: 10 | if not default_size: 11 | old_im.save(output_path) 12 | return False 13 | else: 14 | assert len(default_size) == 2, default_size 15 | x_min,y_min,x_max,y_max = 0,0,default_size[0],default_size[1] 16 | old_im = old_im.crop((x_min, y_min, x_max+1, y_max+1)) 17 | old_im.save(output_path) 18 | return False 19 | y_min = np.min(nnz_inds[0]) 20 | y_max = np.max(nnz_inds[0]) 21 | x_min = np.min(nnz_inds[1]) 22 | x_max = np.max(nnz_inds[1]) 23 | old_im = old_im.crop((x_min, y_min, x_max+1, y_max+1)) 24 | old_im.save(output_path) 25 | return True 26 | 27 | def pad_group_image(img, output_path, pad_size, buckets): 28 | PAD_TOP, PAD_LEFT, PAD_BOTTOM, PAD_RIGHT = pad_size 29 | old_im = Image.open(img) 30 | old_size = (old_im.size[0]+PAD_LEFT+PAD_RIGHT, old_im.size[1]+PAD_TOP+PAD_BOTTOM) 31 | j = -1 32 | for i in range(len(buckets)): 33 | if old_size[0]<=buckets[i][0] and old_size[1]<=buckets[i][1]: 34 | j = i 35 | break 36 | if j < 0: 37 | new_size = old_size 38 | new_im = Image.new("RGB", new_size, (255,255,255)) 39 | new_im.paste(old_im, (PAD_LEFT,PAD_TOP)) 40 | new_im.save(output_path) 41 | return False 42 | new_size = buckets[j] 43 | new_im = Image.new("RGB", new_size, (255,255,255)) 44 | 
new_im.paste(old_im, (PAD_LEFT,PAD_TOP)) 45 | new_im.save(output_path) 46 | return True 47 | 48 | def downsample_image(img, output_path, ratio): 49 | assert ratio>=1, ratio 50 | if ratio == 1: 51 | return True 52 | old_im = Image.open(img) 53 | old_size = old_im.size 54 | new_size = (int(old_size[0]/ratio), int(old_size[1]/ratio)) 55 | 56 | new_im = old_im.resize(new_size, PIL.Image.LANCZOS) 57 | new_im.save(output_path) 58 | return True 59 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/scripts/utils/utils.py: -------------------------------------------------------------------------------- 1 | import subprocess, shlex 2 | from threading import Timer 3 | 4 | def run(cmd, timeout_sec): 5 | proc = subprocess.Popen(cmd, shell=True) 6 | kill_proc = lambda p: p.kill() 7 | timer = Timer(timeout_sec, kill_proc, [proc]) 8 | try: 9 | timer.start() 10 | stdout,stderr = proc.communicate() 11 | finally: 12 | timer.cancel() 13 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/.#katex.js: -------------------------------------------------------------------------------- 1 | srush@beaker.12118:1471814512 -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Khan Academy 4 | 5 | This software also uses portions of the underscore.js project, which is 6 | MIT licensed with the following copyright: 7 | 8 | Copyright (c) 2009-2015 Jeremy Ashkenas, DocumentCloud and Investigative 9 | Reporters & Editors 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, 
including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 28 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/README.md: -------------------------------------------------------------------------------- 1 | # [KaTeX](https://khan.github.io/KaTeX/) [![Build Status](https://travis-ci.org/Khan/KaTeX.svg?branch=master)](https://travis-ci.org/Khan/KaTeX) 2 | 3 | [![Join the chat at https://gitter.im/Khan/KaTeX](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/Khan/KaTeX?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 4 | 5 | KaTeX is a fast, easy-to-use JavaScript library for TeX math rendering on the web. 6 | 7 | * **Fast:** KaTeX renders its math synchronously and doesn't need to reflow the page. See how it compares to a competitor in [this speed test](http://jsperf.com/katex-vs-mathjax/). 8 | * **Print quality:** KaTeX’s layout is based on Donald Knuth’s TeX, the gold standard for math typesetting. 9 | * **Self contained:** KaTeX has no dependencies and can easily be bundled with your website resources. 
10 | * **Server side rendering:** KaTeX produces the same output regardless of browser or environment, so you can pre-render expressions using Node.js and send them as plain HTML. 11 | 12 | KaTeX supports all major browsers, including Chrome, Safari, Firefox, Opera, and IE 8 - IE 11. A list of supported commands can be on the [wiki](https://github.com/Khan/KaTeX/wiki/Function-Support-in-KaTeX). 13 | 14 | ## Usage 15 | 16 | You can [download KaTeX](https://github.com/khan/katex/releases) and host it on your server or include the `katex.min.js` and `katex.min.css` files on your page directly from a CDN: 17 | 18 | ```html 19 | 20 | 21 | ``` 22 | 23 | #### In-browser rendering 24 | 25 | Call `katex.render` with a TeX expression and a DOM element to render into: 26 | 27 | ```js 28 | katex.render("c = \\pm\\sqrt{a^2 + b^2}", element); 29 | ``` 30 | 31 | If KaTeX can't parse the expression, it throws a `katex.ParseError` error. 32 | 33 | #### Server side rendering or rendering to a string 34 | 35 | To generate HTML on the server or to generate an HTML string of the rendered math, you can use `katex.renderToString`: 36 | 37 | ```js 38 | var html = katex.renderToString("c = \\pm\\sqrt{a^2 + b^2}"); 39 | // '...' 40 | ``` 41 | 42 | Make sure to include the CSS and font files, but there is no need to include the JavaScript. Like `render`, `renderToString` throws if it can't parse the expression. 43 | 44 | #### Rendering options 45 | 46 | You can provide an object of options as the last argument to `katex.render` and `katex.renderToString`. Available options are: 47 | 48 | - `displayMode`: `boolean`. If `true` the math will be rendered in display mode, which will put the math in display style (so `\int` and `\sum` are large, for example), and will center the math on the page on its own line. If `false` the math will be rendered in inline mode. (default: `false`) 49 | - `throwOnError`: `boolean`. 
If `true`, KaTeX will throw a `ParseError` when it encounters an unsupported command. If `false`, KaTeX will render the unsupported command as text in the color given by `errorColor`. (default: `true`) 50 | - `errorColor`: `string`. A color string given in the format `"#XXX"` or `"#XXXXXX"`. This option determines the color which unsupported commands are rendered in. (default: `#cc0000`) 51 | 52 | For example: 53 | 54 | ```js 55 | katex.render("c = \\pm\\sqrt{a^2 + b^2}", element, { displayMode: true }); 56 | ``` 57 | 58 | #### Automatic rendering of math on a page 59 | 60 | Math on the page can be automatically rendered using the auto-render extension. See [the Auto-render README](contrib/auto-render/README.md) for more information. 61 | 62 | ## Contributing 63 | 64 | See [CONTRIBUTING.md](CONTRIBUTING.md) 65 | 66 | ## License 67 | 68 | KaTeX is licensed under the [MIT License](http://opensource.org/licenses/MIT). 69 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | // Simple CLI for KaTeX. 3 | // Reads TeX from stdin, outputs HTML to stdout. 
4 | /* eslint no-console:0 */ 5 | 6 | var katex = require("./"); 7 | var input = ""; 8 | 9 | // Skip the first two args, which are just "node" and "cli.js" 10 | var args = process.argv.slice(2); 11 | 12 | if (args.indexOf("--help") !== -1) { 13 | console.log(process.argv[0] + " " + process.argv[1] + 14 | " [ --help ]" + 15 | " [ --display-mode ]"); 16 | 17 | console.log("\n" + 18 | "Options:"); 19 | console.log(" --help Display this help message"); 20 | console.log(" --display-mode Render in display mode (not inline mode)"); 21 | process.exit(); 22 | } 23 | 24 | process.stdin.on("data", function(chunk) { 25 | input += chunk.toString(); 26 | }); 27 | 28 | process.stdin.on("end", function() { 29 | var options = { displayMode: args.indexOf("--display-mode") !== -1 }; 30 | var output = katex.renderToString(input, options); 31 | console.log(output); 32 | }); 33 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/katex.js: -------------------------------------------------------------------------------- 1 | /* eslint no-console:0 */ 2 | /** 3 | * This is the main entry point for KaTeX. Here, we expose functions for 4 | * rendering expressions either to DOM nodes or to markup strings. 5 | * 6 | * We also expose the ParseError class to check if errors thrown from KaTeX are 7 | * errors in the expression, or errors in javascript handling. 8 | */ 9 | 10 | var ParseError = require("./src/ParseError"); 11 | var Settings = require("./src/Settings"); 12 | 13 | var buildTree = require("./src/buildTree"); 14 | var parseTree = require("./src/parseTree"); 15 | var utils = require("./src/utils"); 16 | 17 | /** 18 | * Parse and build an expression, and place that expression in the DOM node 19 | * given. 
20 | */ 21 | var render = function(expression, baseNode, options) { 22 | utils.clearNode(baseNode); 23 | 24 | var settings = new Settings(options); 25 | 26 | var tree = parseTree(expression, settings); 27 | var node = buildTree(tree, expression, settings).toNode(); 28 | 29 | baseNode.appendChild(node); 30 | }; 31 | 32 | // KaTeX's styles don't work properly in quirks mode. Print out an error, and 33 | // disable rendering. 34 | if (typeof document !== "undefined") { 35 | if (document.compatMode !== "CSS1Compat") { 36 | typeof console !== "undefined" && console.warn( 37 | "Warning: KaTeX doesn't work in quirks mode. Make sure your " + 38 | "website has a suitable doctype."); 39 | 40 | render = function() { 41 | throw new ParseError("KaTeX doesn't work in quirks mode."); 42 | }; 43 | } 44 | } 45 | 46 | /** 47 | * Parse and build an expression, and return the markup for that. 48 | */ 49 | var renderToString = function(expression, options) { 50 | var settings = new Settings(options); 51 | 52 | var tree = parseTree(expression, settings); 53 | return buildTree(tree, expression, settings).toMarkup(); 54 | }; 55 | 56 | /** 57 | * Parse an expression and return the parse tree. 58 | */ 59 | var generateParseTree = function(expression, options) { 60 | var settings = new Settings(options); 61 | return parseTree(expression, settings); 62 | }; 63 | 64 | module.exports = { 65 | render: render, 66 | renderToString: renderToString, 67 | /** 68 | * NOTE: This method is not currently recommended for public use. 69 | * The internal tree representation is unstable and is very likely 70 | * to change. Use at your own risk. 
71 | */ 72 | __parse: generateParseTree, 73 | ParseError: ParseError, 74 | }; 75 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "_args": [ 3 | [ 4 | "katex", 5 | "/home/srush/Projects/im2latex" 6 | ] 7 | ], 8 | "_from": "katex@latest", 9 | "_id": "katex@0.6.0", 10 | "_inCache": true, 11 | "_installable": true, 12 | "_location": "/katex", 13 | "_nodeVersion": "4.2.1", 14 | "_npmOperationalInternal": { 15 | "host": "packages-12-west.internal.npmjs.com", 16 | "tmp": "tmp/katex-0.6.0.tgz_1460769444991_0.38667152682319283" 17 | }, 18 | "_npmUser": { 19 | "email": "kevinb7@gmail.com", 20 | "name": "kevinbarabash" 21 | }, 22 | "_npmVersion": "2.15.2", 23 | "_phantomChildren": {}, 24 | "_requested": { 25 | "name": "katex", 26 | "raw": "katex", 27 | "rawSpec": "", 28 | "scope": null, 29 | "spec": "latest", 30 | "type": "tag" 31 | }, 32 | "_requiredBy": [ 33 | "#USER" 34 | ], 35 | "_resolved": "https://registry.npmjs.org/katex/-/katex-0.6.0.tgz", 36 | "_shasum": "12418e09121c05c92041b6b3b9fb6bab213cb6f3", 37 | "_shrinkwrap": null, 38 | "_spec": "katex", 39 | "_where": "/home/srush/Projects/im2latex", 40 | "bin": { 41 | "katex": "cli.js" 42 | }, 43 | "bugs": { 44 | "url": "https://github.com/Khan/KaTeX/issues" 45 | }, 46 | "dependencies": { 47 | "match-at": "^0.1.0" 48 | }, 49 | "description": "Fast math typesetting for the web.", 50 | "devDependencies": { 51 | "browserify": "^10.2.4", 52 | "clean-css": "~2.2.15", 53 | "eslint": "^1.10.2", 54 | "express": "~3.3.3", 55 | "glob": "^5.0.15", 56 | "jasmine": "^2.3.2", 57 | "jasmine-core": "^2.3.4", 58 | "js-yaml": "^3.3.1", 59 | "jspngopt": "^0.1.0", 60 | "less": "~1.7.5", 61 | "nomnom": "^1.8.1", 62 | "pako": "0.2.7", 63 | "selenium-webdriver": "^2.46.1", 64 | "uglify-js": "~2.4.15" 65 | }, 66 | "directories": {}, 67 | "dist": { 68 | 
"shasum": "12418e09121c05c92041b6b3b9fb6bab213cb6f3", 69 | "tarball": "https://registry.npmjs.org/katex/-/katex-0.6.0.tgz" 70 | }, 71 | "files": [ 72 | "cli.js", 73 | "dist/", 74 | "katex.js", 75 | "src/" 76 | ], 77 | "gitHead": "b94fc6534d5c23f944906a52a592bee4e0090665", 78 | "homepage": "https://github.com/Khan/KaTeX#readme", 79 | "license": "MIT", 80 | "main": "katex.js", 81 | "maintainers": [ 82 | { 83 | "name": "kevinbarabash", 84 | "email": "kevinb7@gmail.com" 85 | }, 86 | { 87 | "name": "spicyj", 88 | "email": "ben@benalpert.com" 89 | }, 90 | { 91 | "name": "xymostech", 92 | "email": "xymostech@gmail.com" 93 | } 94 | ], 95 | "name": "katex", 96 | "optionalDependencies": {}, 97 | "readme": "ERROR: No README data found!", 98 | "repository": { 99 | "type": "git", 100 | "url": "git://github.com/Khan/KaTeX.git" 101 | }, 102 | "scripts": { 103 | "prepublish": "make dist", 104 | "start": "node server.js", 105 | "test": "make lint test" 106 | }, 107 | "version": "0.6.0" 108 | } 109 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/Lexer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * The Lexer class handles tokenizing the input in various ways. Since our 3 | * parser expects us to be able to backtrack, the lexer allows lexing from any 4 | * given starting point. 5 | * 6 | * Its main exposed function is the `lex` function, which takes a position to 7 | * lex from and a type of token to lex. It defers to the appropriate `_innerLex` 8 | * function. 9 | * 10 | * The various `_innerLex` functions perform the actual lexing of different 11 | * kinds. 12 | */ 13 | 14 | var matchAt = require("../../match-at"); 15 | 16 | var ParseError = require("./ParseError"); 17 | 18 | // The main lexer class 19 | function Lexer(input) { 20 | this._input = input; 21 | } 22 | 23 | // The resulting token returned from `lex`. 
24 | function Token(text, data, position) { 25 | this.text = text; 26 | this.data = data; 27 | this.position = position; 28 | } 29 | 30 | /* The following tokenRegex 31 | * - matches typical whitespace (but not NBSP etc.) using its first group 32 | * - matches symbol combinations which result in a single output character 33 | * - does not match any control character \x00-\x1f except whitespace 34 | * - does not match a bare backslash 35 | * - matches any ASCII character except those just mentioned 36 | * - does not match the BMP private use area \uE000-\uF8FF 37 | * - does not match bare surrogate code units 38 | * - matches any BMP character except for those just described 39 | * - matches any valid Unicode surrogate pair 40 | * - matches a backslash followed by one or more letters 41 | * - matches a backslash followed by any BMP character, including newline 42 | * Just because the Lexer matches something doesn't mean it's valid input: 43 | * If there is no matching function or symbol definition, the Parser will 44 | * still reject the input. 45 | */ 46 | var tokenRegex = new RegExp( 47 | "([ \r\n\t]+)|(" + // whitespace 48 | "---?" + // special combinations 49 | "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint 50 | "|[\uD800-\uDBFF][\uDC00-\uDFFF]" + // surrogate pair 51 | "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" + // function name 52 | ")" 53 | ); 54 | 55 | var whitespaceRegex = /\s*/; 56 | 57 | /** 58 | * This function lexes a single normal token. It takes a position and 59 | * whether it should completely ignore whitespace or not. 
60 | */ 61 | Lexer.prototype._innerLex = function(pos, ignoreWhitespace) { 62 | var input = this._input; 63 | if (pos === input.length) { 64 | return new Token("EOF", null, pos); 65 | } 66 | var match = matchAt(tokenRegex, input, pos); 67 | if (match === null) { 68 | throw new ParseError( 69 | "Unexpected character: '" + input[pos] + "'", 70 | this, pos); 71 | } else if (match[2]) { // matched non-whitespace 72 | return new Token(match[2], null, pos + match[2].length); 73 | } else if (ignoreWhitespace) { 74 | return this._innerLex(pos + match[1].length, true); 75 | } else { // concatenate whitespace to a single space 76 | return new Token(" ", null, pos + match[1].length); 77 | } 78 | }; 79 | 80 | // A regex to match a CSS color (like #ffffff or BlueViolet) 81 | var cssColor = /#[a-z0-9]+|[a-z]+/i; 82 | 83 | /** 84 | * This function lexes a CSS color. 85 | */ 86 | Lexer.prototype._innerLexColor = function(pos) { 87 | var input = this._input; 88 | 89 | // Ignore whitespace 90 | var whitespace = matchAt(whitespaceRegex, input, pos)[0]; 91 | pos += whitespace.length; 92 | 93 | var match; 94 | if ((match = matchAt(cssColor, input, pos))) { 95 | // If we look like a color, return a color 96 | return new Token(match[0], null, pos + match[0].length); 97 | } else { 98 | throw new ParseError("Invalid color", this, pos); 99 | } 100 | }; 101 | 102 | // A regex to match a dimension. Dimensions look like 103 | // "1.2em" or ".4pt" or "1 ex" 104 | var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/; 105 | 106 | /** 107 | * This function lexes a dimension. 
108 | */ 109 | Lexer.prototype._innerLexSize = function(pos) { 110 | var input = this._input; 111 | 112 | // Ignore whitespace 113 | var whitespace = matchAt(whitespaceRegex, input, pos)[0]; 114 | pos += whitespace.length; 115 | 116 | var match; 117 | if ((match = matchAt(sizeRegex, input, pos))) { 118 | var unit = match[3]; 119 | // We only currently handle "em" and "ex" units 120 | // if (unit !== "em" && unit !== "ex") { 121 | // throw new ParseError("Invalid unit: '" + unit + "'", this, pos); 122 | // } 123 | return new Token(match[0], { 124 | number: +(match[1] + match[2]), 125 | unit: unit, 126 | }, pos + match[0].length); 127 | } 128 | 129 | throw new ParseError("Invalid size", this, pos); 130 | }; 131 | 132 | /** 133 | * This function lexes a string of whitespace. 134 | */ 135 | Lexer.prototype._innerLexWhitespace = function(pos) { 136 | var input = this._input; 137 | 138 | var whitespace = matchAt(whitespaceRegex, input, pos)[0]; 139 | pos += whitespace.length; 140 | 141 | return new Token(whitespace[0], null, pos); 142 | }; 143 | 144 | /** 145 | * This function lexes a single token starting at `pos` and of the given mode. 146 | * Based on the mode, we defer to one of the `_innerLex` functions. 
147 | */ 148 | Lexer.prototype.lex = function(pos, mode) { 149 | if (mode === "math") { 150 | return this._innerLex(pos, true); 151 | } else if (mode === "text") { 152 | return this._innerLex(pos, false); 153 | } else if (mode === "color") { 154 | return this._innerLexColor(pos); 155 | } else if (mode === "size") { 156 | return this._innerLexSize(pos); 157 | } else if (mode === "whitespace") { 158 | return this._innerLexWhitespace(pos); 159 | } 160 | }; 161 | 162 | module.exports = Lexer; 163 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/Options.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This file contains information about the options that the Parser carries 3 | * around with it while parsing. Data is held in an `Options` object, and when 4 | * recursing, a new `Options` object can be created with the `.with*` and 5 | * `.reset` functions. 6 | */ 7 | 8 | /** 9 | * This is the main options class. It contains the style, size, color, and font 10 | * of the current parse level. It also contains the style and size of the parent 11 | * parse level, so size changes can be handled efficiently. 12 | * 13 | * Each of the `.with*` and `.reset` functions passes its current style and size 14 | * as the parentStyle and parentSize of the new options class, so parent 15 | * handling is taken care of automatically. 
16 | */ 17 | function Options(data) { 18 | this.style = data.style; 19 | this.color = data.color; 20 | this.size = data.size; 21 | this.phantom = data.phantom; 22 | this.font = data.font; 23 | 24 | if (data.parentStyle === undefined) { 25 | this.parentStyle = data.style; 26 | } else { 27 | this.parentStyle = data.parentStyle; 28 | } 29 | 30 | if (data.parentSize === undefined) { 31 | this.parentSize = data.size; 32 | } else { 33 | this.parentSize = data.parentSize; 34 | } 35 | } 36 | 37 | /** 38 | * Returns a new options object with the same properties as "this". Properties 39 | * from "extension" will be copied to the new options object. 40 | */ 41 | Options.prototype.extend = function(extension) { 42 | var data = { 43 | style: this.style, 44 | size: this.size, 45 | color: this.color, 46 | parentStyle: this.style, 47 | parentSize: this.size, 48 | phantom: this.phantom, 49 | font: this.font, 50 | }; 51 | 52 | for (var key in extension) { 53 | if (extension.hasOwnProperty(key)) { 54 | data[key] = extension[key]; 55 | } 56 | } 57 | 58 | return new Options(data); 59 | }; 60 | 61 | /** 62 | * Create a new options object with the given style. 63 | */ 64 | Options.prototype.withStyle = function(style) { 65 | return this.extend({ 66 | style: style, 67 | }); 68 | }; 69 | 70 | /** 71 | * Create a new options object with the given size. 72 | */ 73 | Options.prototype.withSize = function(size) { 74 | return this.extend({ 75 | size: size, 76 | }); 77 | }; 78 | 79 | /** 80 | * Create a new options object with the given color. 81 | */ 82 | Options.prototype.withColor = function(color) { 83 | return this.extend({ 84 | color: color, 85 | }); 86 | }; 87 | 88 | /** 89 | * Create a new options object with "phantom" set to true. 90 | */ 91 | Options.prototype.withPhantom = function() { 92 | return this.extend({ 93 | phantom: true, 94 | }); 95 | }; 96 | 97 | /** 98 | * Create a new options objects with the give font. 
99 | */ 100 | Options.prototype.withFont = function(font) { 101 | return this.extend({ 102 | font: font, 103 | }); 104 | }; 105 | 106 | /** 107 | * Create a new options object with the same style, size, and color. This is 108 | * used so that parent style and size changes are handled correctly. 109 | */ 110 | Options.prototype.reset = function() { 111 | return this.extend({}); 112 | }; 113 | 114 | /** 115 | * A map of color names to CSS colors. 116 | * TODO(emily): Remove this when we have real macros 117 | */ 118 | var colorMap = { 119 | "katex-blue": "#6495ed", 120 | "katex-orange": "#ffa500", 121 | "katex-pink": "#ff00af", 122 | "katex-red": "#df0030", 123 | "katex-green": "#28ae7b", 124 | "katex-gray": "gray", 125 | "katex-purple": "#9d38bd", 126 | "katex-blueA": "#c7e9f1", 127 | "katex-blueB": "#9cdceb", 128 | "katex-blueC": "#58c4dd", 129 | "katex-blueD": "#29abca", 130 | "katex-blueE": "#1c758a", 131 | "katex-tealA": "#acead7", 132 | "katex-tealB": "#76ddc0", 133 | "katex-tealC": "#5cd0b3", 134 | "katex-tealD": "#55c1a7", 135 | "katex-tealE": "#49a88f", 136 | "katex-greenA": "#c9e2ae", 137 | "katex-greenB": "#a6cf8c", 138 | "katex-greenC": "#83c167", 139 | "katex-greenD": "#77b05d", 140 | "katex-greenE": "#699c52", 141 | "katex-goldA": "#f7c797", 142 | "katex-goldB": "#f9b775", 143 | "katex-goldC": "#f0ac5f", 144 | "katex-goldD": "#e1a158", 145 | "katex-goldE": "#c78d46", 146 | "katex-redA": "#f7a1a3", 147 | "katex-redB": "#ff8080", 148 | "katex-redC": "#fc6255", 149 | "katex-redD": "#e65a4c", 150 | "katex-redE": "#cf5044", 151 | "katex-maroonA": "#ecabc1", 152 | "katex-maroonB": "#ec92ab", 153 | "katex-maroonC": "#c55f73", 154 | "katex-maroonD": "#a24d61", 155 | "katex-maroonE": "#94424f", 156 | "katex-purpleA": "#caa3e8", 157 | "katex-purpleB": "#b189c6", 158 | "katex-purpleC": "#9a72ac", 159 | "katex-purpleD": "#715582", 160 | "katex-purpleE": "#644172", 161 | "katex-mintA": "#f5f9e8", 162 | "katex-mintB": "#edf2df", 163 | "katex-mintC": "#e0e5cc", 164 | 
"katex-grayA": "#fdfdfd", 165 | "katex-grayB": "#f7f7f7", 166 | "katex-grayC": "#eeeeee", 167 | "katex-grayD": "#dddddd", 168 | "katex-grayE": "#cccccc", 169 | "katex-grayF": "#aaaaaa", 170 | "katex-grayG": "#999999", 171 | "katex-grayH": "#555555", 172 | "katex-grayI": "#333333", 173 | "katex-kaBlue": "#314453", 174 | "katex-kaGreen": "#639b24", 175 | }; 176 | 177 | /** 178 | * Gets the CSS color of the current options object, accounting for the 179 | * `colorMap`. 180 | */ 181 | Options.prototype.getColor = function() { 182 | if (this.phantom) { 183 | return "transparent"; 184 | } else { 185 | return colorMap[this.color] || this.color; 186 | } 187 | }; 188 | 189 | module.exports = Options; 190 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/ParseError.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This is the ParseError class, which is the main error thrown by KaTeX 3 | * functions when something has gone wrong. This is used to distinguish internal 4 | * errors from errors in the expression that the user provided. 
5 | */ 6 | function ParseError(message, lexer, position) { 7 | var error = "KaTeX parse error: " + message; 8 | 9 | if (lexer !== undefined && position !== undefined) { 10 | // If we have the input and a position, make the error a bit fancier 11 | 12 | // Prepend some information 13 | error += " at position " + position + ": "; 14 | 15 | // Get the input 16 | var input = lexer._input; 17 | // Insert a combining underscore at the correct position 18 | input = input.slice(0, position) + "\u0332" + 19 | input.slice(position); 20 | 21 | // Extract some context from the input and add it to the error 22 | var begin = Math.max(0, position - 15); 23 | var end = position + 15; 24 | error += input.slice(begin, end); 25 | } 26 | 27 | // Some hackery to make ParseError a prototype of Error 28 | // See http://stackoverflow.com/a/8460753 29 | var self = new Error(error); 30 | self.name = "ParseError"; 31 | self.__proto__ = ParseError.prototype; 32 | 33 | self.position = position; 34 | return self; 35 | } 36 | 37 | // More hackery 38 | ParseError.prototype.__proto__ = Error.prototype; 39 | 40 | module.exports = ParseError; 41 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/Settings.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This is a module for storing settings passed into KaTeX. It correctly handles 3 | * default settings. 4 | */ 5 | 6 | /** 7 | * Helper function for getting a default value if the value is undefined 8 | */ 9 | function get(option, defaultValue) { 10 | return option === undefined ? 
defaultValue : option; 11 | } 12 | 13 | /** 14 | * The main Settings object 15 | * 16 | * The current options stored are: 17 | * - displayMode: Whether the expression should be typeset by default in 18 | * textstyle or displaystyle (default false) 19 | */ 20 | function Settings(options) { 21 | // allow null options 22 | options = options || {}; 23 | this.displayMode = get(options.displayMode, false); 24 | this.throwOnError = get(options.throwOnError, true); 25 | this.errorColor = get(options.errorColor, "#cc0000"); 26 | } 27 | 28 | module.exports = Settings; 29 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/Style.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This file contains information and classes for the various kinds of styles 3 | * used in TeX. It provides a generic `Style` class, which holds information 4 | * about a specific style. It then provides instances of all the different kinds 5 | * of styles possible, and provides functions to move between them and get 6 | * information about them. 7 | */ 8 | 9 | /** 10 | * The main style class. Contains a unique id for the style, a size (which is 11 | * the same for cramped and uncramped version of a style), a cramped flag, and a 12 | * size multiplier, which gives the size difference between a style and 13 | * textstyle. 14 | */ 15 | function Style(id, size, multiplier, cramped) { 16 | this.id = id; 17 | this.size = size; 18 | this.cramped = cramped; 19 | this.sizeMultiplier = multiplier; 20 | } 21 | 22 | /** 23 | * Get the style of a superscript given a base in the current style. 24 | */ 25 | Style.prototype.sup = function() { 26 | return styles[sup[this.id]]; 27 | }; 28 | 29 | /** 30 | * Get the style of a subscript given a base in the current style. 
31 | */ 32 | Style.prototype.sub = function() { 33 | return styles[sub[this.id]]; 34 | }; 35 | 36 | /** 37 | * Get the style of a fraction numerator given the fraction in the current 38 | * style. 39 | */ 40 | Style.prototype.fracNum = function() { 41 | return styles[fracNum[this.id]]; 42 | }; 43 | 44 | /** 45 | * Get the style of a fraction denominator given the fraction in the current 46 | * style. 47 | */ 48 | Style.prototype.fracDen = function() { 49 | return styles[fracDen[this.id]]; 50 | }; 51 | 52 | /** 53 | * Get the cramped version of a style (in particular, cramping a cramped style 54 | * doesn't change the style). 55 | */ 56 | Style.prototype.cramp = function() { 57 | return styles[cramp[this.id]]; 58 | }; 59 | 60 | /** 61 | * HTML class name, like "displaystyle cramped" 62 | */ 63 | Style.prototype.cls = function() { 64 | return sizeNames[this.size] + (this.cramped ? " cramped" : " uncramped"); 65 | }; 66 | 67 | /** 68 | * HTML Reset class name, like "reset-textstyle" 69 | */ 70 | Style.prototype.reset = function() { 71 | return resetNames[this.size]; 72 | }; 73 | 74 | // IDs of the different styles 75 | var D = 0; 76 | var Dc = 1; 77 | var T = 2; 78 | var Tc = 3; 79 | var S = 4; 80 | var Sc = 5; 81 | var SS = 6; 82 | var SSc = 7; 83 | 84 | // String names for the different sizes 85 | var sizeNames = [ 86 | "displaystyle textstyle", 87 | "textstyle", 88 | "scriptstyle", 89 | "scriptscriptstyle", 90 | ]; 91 | 92 | // Reset names for the different sizes 93 | var resetNames = [ 94 | "reset-textstyle", 95 | "reset-textstyle", 96 | "reset-scriptstyle", 97 | "reset-scriptscriptstyle", 98 | ]; 99 | 100 | // Instances of the different styles 101 | var styles = [ 102 | new Style(D, 0, 1.0, false), 103 | new Style(Dc, 0, 1.0, true), 104 | new Style(T, 1, 1.0, false), 105 | new Style(Tc, 1, 1.0, true), 106 | new Style(S, 2, 0.7, false), 107 | new Style(Sc, 2, 0.7, true), 108 | new Style(SS, 3, 0.5, false), 109 | new Style(SSc, 3, 0.5, true), 110 | ]; 111 | 112 | 
// Lookup tables for switching from one style to another 113 | var sup = [S, Sc, S, Sc, SS, SSc, SS, SSc]; 114 | var sub = [Sc, Sc, Sc, Sc, SSc, SSc, SSc, SSc]; 115 | var fracNum = [T, Tc, S, Sc, SS, SSc, SS, SSc]; 116 | var fracDen = [Tc, Tc, Sc, Sc, SSc, SSc, SSc, SSc]; 117 | var cramp = [Dc, Dc, Tc, Tc, Sc, Sc, SSc, SSc]; 118 | 119 | // We only export some of the styles. Also, we don't export the `Style` class so 120 | // no more styles can be generated. 121 | module.exports = { 122 | DISPLAY: styles[D], 123 | TEXT: styles[T], 124 | SCRIPT: styles[S], 125 | SCRIPTSCRIPT: styles[SS], 126 | }; 127 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/buildTree.js: -------------------------------------------------------------------------------- 1 | var buildHTML = require("./buildHTML"); 2 | var buildMathML = require("./buildMathML"); 3 | var buildCommon = require("./buildCommon"); 4 | var Options = require("./Options"); 5 | var Settings = require("./Settings"); 6 | var Style = require("./Style"); 7 | 8 | var makeSpan = buildCommon.makeSpan; 9 | 10 | var buildTree = function(tree, expression, settings) { 11 | settings = settings || new Settings({}); 12 | 13 | var startStyle = Style.TEXT; 14 | if (settings.displayMode) { 15 | startStyle = Style.DISPLAY; 16 | } 17 | 18 | // Setup the default options 19 | var options = new Options({ 20 | style: startStyle, 21 | size: "size5", 22 | }); 23 | 24 | // `buildHTML` sometimes messes with the parse tree (like turning bins -> 25 | // ords), so we build the MathML version first. 
26 | var mathMLNode = buildMathML(tree, expression, options); 27 | var htmlNode = buildHTML(tree, options); 28 | 29 | var katexNode = makeSpan(["katex"], [ 30 | mathMLNode, htmlNode, 31 | ]); 32 | 33 | if (settings.displayMode) { 34 | return makeSpan(["katex-display"], [katexNode]); 35 | } else { 36 | return katexNode; 37 | } 38 | }; 39 | 40 | module.exports = buildTree; 41 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/domTree.js: -------------------------------------------------------------------------------- 1 | /** 2 | * These objects store the data about the DOM nodes we create, as well as some 3 | * extra data. They can then be transformed into real DOM nodes with the 4 | * `toNode` function or HTML markup using `toMarkup`. They are useful for both 5 | * storing extra properties on the nodes, as well as providing a way to easily 6 | * work with the DOM. 7 | * 8 | * Similar functions for working with MathML nodes exist in mathMLTree.js. 9 | */ 10 | 11 | var utils = require("./utils"); 12 | 13 | /** 14 | * Create an HTML className based on a list of classes. In addition to joining 15 | * with spaces, we also remove null or empty classes. 16 | */ 17 | var createClass = function(classes) { 18 | classes = classes.slice(); 19 | for (var i = classes.length - 1; i >= 0; i--) { 20 | if (!classes[i]) { 21 | classes.splice(i, 1); 22 | } 23 | } 24 | 25 | return classes.join(" "); 26 | }; 27 | 28 | /** 29 | * This node represents a span node, with a className, a list of children, and 30 | * an inline style. It also contains information about its height, depth, and 31 | * maxFontSize. 
32 | */ 33 | function span(classes, children, height, depth, maxFontSize, style) { 34 | this.classes = classes || []; 35 | this.children = children || []; 36 | this.height = height || 0; 37 | this.depth = depth || 0; 38 | this.maxFontSize = maxFontSize || 0; 39 | this.style = style || {}; 40 | this.attributes = {}; 41 | } 42 | 43 | /** 44 | * Sets an arbitrary attribute on the span. Warning: use this wisely. Not all 45 | * browsers support attributes the same, and having too many custom attributes 46 | * is probably bad. 47 | */ 48 | span.prototype.setAttribute = function(attribute, value) { 49 | this.attributes[attribute] = value; 50 | }; 51 | 52 | /** 53 | * Convert the span into an HTML node 54 | */ 55 | span.prototype.toNode = function() { 56 | var span = document.createElement("span"); 57 | 58 | // Apply the class 59 | span.className = createClass(this.classes); 60 | 61 | // Apply inline styles 62 | for (var style in this.style) { 63 | if (Object.prototype.hasOwnProperty.call(this.style, style)) { 64 | span.style[style] = this.style[style]; 65 | } 66 | } 67 | 68 | // Apply attributes 69 | for (var attr in this.attributes) { 70 | if (Object.prototype.hasOwnProperty.call(this.attributes, attr)) { 71 | span.setAttribute(attr, this.attributes[attr]); 72 | } 73 | } 74 | 75 | // Append the children, also as HTML nodes 76 | for (var i = 0; i < this.children.length; i++) { 77 | span.appendChild(this.children[i].toNode()); 78 | } 79 | 80 | return span; 81 | }; 82 | 83 | /** 84 | * Convert the span into an HTML markup string 85 | */ 86 | span.prototype.toMarkup = function() { 87 | var markup = " 0) { 197 | span = document.createElement("span"); 198 | span.style.marginRight = this.italic + "em"; 199 | } 200 | 201 | if (this.classes.length > 0) { 202 | span = span || document.createElement("span"); 203 | span.className = createClass(this.classes); 204 | } 205 | 206 | for (var style in this.style) { 207 | if (this.style.hasOwnProperty(style)) { 208 | span = span || 
document.createElement("span"); 209 | span.style[style] = this.style[style]; 210 | } 211 | } 212 | 213 | if (span) { 214 | span.appendChild(node); 215 | return span; 216 | } else { 217 | return node; 218 | } 219 | }; 220 | 221 | /** 222 | * Creates markup for a symbol node. 223 | */ 224 | symbolNode.prototype.toMarkup = function() { 225 | // TODO(alpert): More duplication than I'd like from 226 | // span.prototype.toMarkup and symbolNode.prototype.toNode... 227 | var needsSpan = false; 228 | 229 | var markup = " 0) { 241 | styles += "margin-right:" + this.italic + "em;"; 242 | } 243 | for (var style in this.style) { 244 | if (this.style.hasOwnProperty(style)) { 245 | styles += utils.hyphenate(style) + ":" + this.style[style] + ";"; 246 | } 247 | } 248 | 249 | if (styles) { 250 | needsSpan = true; 251 | markup += " style=\"" + utils.escape(styles) + "\""; 252 | } 253 | 254 | var escaped = utils.escape(this.value); 255 | if (needsSpan) { 256 | markup += ">"; 257 | markup += escaped; 258 | markup += ""; 259 | return markup; 260 | } else { 261 | return escaped; 262 | } 263 | }; 264 | 265 | module.exports = { 266 | span: span, 267 | documentFragment: documentFragment, 268 | symbolNode: symbolNode, 269 | }; 270 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/environments.js: -------------------------------------------------------------------------------- 1 | /* eslint no-constant-condition:0 */ 2 | var fontMetrics = require("./fontMetrics"); 3 | var parseData = require("./parseData"); 4 | var ParseError = require("./ParseError"); 5 | 6 | var ParseNode = parseData.ParseNode; 7 | 8 | /** 9 | * Parse the body of the environment, with rows delimited by \\ and 10 | * columns delimited by &, and create a nested list in row-major order 11 | * with one group per cell. 
12 | */ 13 | var q = 0 ; 14 | function parseArray(parser, result) { 15 | var row = []; 16 | var body = [row]; 17 | var rowGaps = []; 18 | 19 | while (true) { 20 | 21 | // if (q == 1) console.error(parser.nextToken.text); 22 | try { 23 | var cell = parser.parseExpression(false, null); 24 | } catch (e) { 25 | // console.error(e); 26 | exit(); 27 | } 28 | // if (q == 1) exit(); 29 | row.push(new ParseNode("ordgroup", cell, parser.mode)); 30 | var next = parser.nextToken.text; 31 | if (next === "&") { 32 | parser.consume(); 33 | } else if (next === "\\end" || next == "}") { 34 | break; 35 | } else if (next === "\\\\" || next === "\\cr") { 36 | var cr = parser.parseFunction(); 37 | rowGaps.push(cr.value.size); 38 | row = []; 39 | body.push(row); 40 | } else { 41 | // TODO: Clean up the following hack once #385 got merged 42 | var pos = Math.min(parser.pos + 1, parser.lexer._input.length); 43 | throw new ParseError("Expected & or \\\\ or \\end", 44 | parser.lexer, pos); 45 | } 46 | } 47 | result.body = body; 48 | result.rowGaps = rowGaps; 49 | // if (q == 1) exit(); 50 | var node = new ParseNode(result.type, result, parser.mode); 51 | return node; 52 | } 53 | 54 | /* 55 | * An environment definition is very similar to a function definition: 56 | * it is declared with a name or a list of names, a set of properties 57 | * and a handler containing the actual implementation. 58 | * 59 | * The properties include: 60 | * - numArgs: The number of arguments after the \begin{name} function. 61 | * - argTypes: (optional) Just like for a function 62 | * - allowedInText: (optional) Whether or not the environment is allowed inside 63 | * text mode (default false) (not enforced yet) 64 | * - numOptionalArgs: (optional) Just like for a function 65 | * A bare number instead of that object indicates the numArgs value. 
66 | * 67 | * The handler function will receive two arguments 68 | * - context: information and references provided by the parser 69 | * - args: an array of arguments passed to \begin{name} 70 | * The context contains the following properties: 71 | * - envName: the name of the environment, one of the listed names. 72 | * - parser: the parser object 73 | * - lexer: the lexer object 74 | * - positions: the positions associated with these arguments from args. 75 | * The handler must return a ParseResult. 76 | */ 77 | 78 | function defineEnvironment(names, props, handler) { 79 | if (typeof names === "string") { 80 | names = [names]; 81 | } 82 | if (typeof props === "number") { 83 | props = { numArgs: props }; 84 | } 85 | // Set default values of environments 86 | var data = { 87 | numArgs: props.numArgs || 0, 88 | argTypes: props.argTypes, 89 | greediness: 1, 90 | allowedInText: !!props.allowedInText, 91 | numOptionalArgs: props.numOptionalArgs || 0, 92 | handler: handler, 93 | }; 94 | for (var i = 0; i < names.length; ++i) { 95 | module.exports[names[i]] = data; 96 | } 97 | } 98 | 99 | // Arrays are part of LaTeX, defined in lttab.dtx so its documentation 100 | // is part of the source2e.pdf file of LaTeX2e source documentation. 101 | defineEnvironment("array", { 102 | numArgs: 1, 103 | }, function(context, args) { 104 | var colalign = args[0]; 105 | colalign = colalign.value.map ? 
colalign.value : [colalign]; 106 | var cols = colalign.map(function(node) { 107 | var ca = node.value; 108 | if ("lcr".indexOf(ca) !== -1) { 109 | return { 110 | type: "align", 111 | align: ca, 112 | }; 113 | } else if (ca === "|") { 114 | return { 115 | type: "separator", 116 | separator: "|", 117 | }; 118 | } 119 | // throw new ParseError( 120 | // "Unknown column alignment: " + node.value, 121 | // context.lexer, context.positions[1]); 122 | }); 123 | var res = { 124 | type: "array", 125 | style: "array", 126 | cols: cols, 127 | hskipBeforeAndAfter: true, // \@preamble in lttab.dtx 128 | }; 129 | res = parseArray(context.parser, res); 130 | return res; 131 | }); 132 | 133 | defineEnvironment("tabular", { 134 | numArgs: 1, 135 | }, function(context, args) { 136 | var colalign = args[0]; 137 | colalign = colalign.value.map ? colalign.value : [colalign]; 138 | var cols = colalign.map(function(node) { 139 | var ca = node.value; 140 | if ("lcr".indexOf(ca) !== -1) { 141 | return { 142 | type: "align", 143 | align: ca, 144 | }; 145 | } else if (ca === "|") { 146 | return { 147 | type: "separator", 148 | separator: "|", 149 | }; 150 | } 151 | // throw new ParseError( 152 | // "Unknown column alignment: " + node.value, 153 | // context.lexer, context.positions[1]); 154 | }); 155 | var res = { 156 | type: "array", 157 | style: "tabular", 158 | cols: cols, 159 | hskipBeforeAndAfter: true, // \@preamble in lttab.dtx 160 | }; 161 | res = parseArray(context.parser, res); 162 | return res; 163 | }); 164 | 165 | // The matrix environments of amsmath builds on the array environment 166 | // of LaTeX, which is discussed above. 
167 | defineEnvironment([ 168 | "matrix", 169 | "pmatrix", 170 | "bmatrix", 171 | "Bmatrix", 172 | "vmatrix", 173 | "Vmatrix", 174 | ], { 175 | }, function(context) { 176 | var delimiters = { 177 | "matrix": null, 178 | "pmatrix": ["(", ")"], 179 | "bmatrix": ["[", "]"], 180 | "Bmatrix": ["\\{", "\\}"], 181 | "vmatrix": ["|", "|"], 182 | "Vmatrix": ["\\Vert", "\\Vert"], 183 | }[context.envName]; 184 | var res = { 185 | type: "array", 186 | style: "matrix", 187 | hskipBeforeAndAfter: false, // \hskip -\arraycolsep in amsmath 188 | }; 189 | q = 1; 190 | res = parseArray(context.parser, res); 191 | 192 | if (delimiters) { 193 | res = new ParseNode("leftright", { 194 | body: [res], 195 | left: delimiters[0], 196 | right: delimiters[1], 197 | }, context.mode); 198 | } 199 | return res; 200 | }); 201 | 202 | // A cases environment (in amsmath.sty) is almost equivalent to 203 | // \def\arraystretch{1.2}% 204 | // \left\{\begin{array}{@{}l@{\quad}l@{}} … \end{array}\right. 205 | defineEnvironment("picture", { 206 | }, function(context) { 207 | var res = { 208 | type: "array", 209 | style: "picture", 210 | arraystretch: 1.2, 211 | cols: [{ 212 | type: "align", 213 | align: "l", 214 | pregap: 0, 215 | postgap: fontMetrics.metrics.quad, 216 | }, { 217 | type: "align", 218 | align: "l", 219 | pregap: 0, 220 | postgap: 0, 221 | }], 222 | }; 223 | res = parseArray(context.parser, res); 224 | res = new ParseNode("leftright", { 225 | body: [res], 226 | left: "\\{", 227 | right: ".", 228 | }, context.mode); 229 | return res; 230 | }); 231 | 232 | defineEnvironment("cases", { 233 | }, function(context) { 234 | var res = { 235 | type: "array", 236 | style: "cases", 237 | arraystretch: 1.2, 238 | cols: [{ 239 | type: "align", 240 | align: "l", 241 | pregap: 0, 242 | postgap: fontMetrics.metrics.quad, 243 | }, { 244 | type: "align", 245 | align: "l", 246 | pregap: 0, 247 | postgap: 0, 248 | }], 249 | }; 250 | res = parseArray(context.parser, res); 251 | res = new ParseNode("leftright", 
{ 252 | body: [res], 253 | left: "\\{", 254 | right: ".", 255 | }, context.mode); 256 | return res; 257 | }); 258 | 259 | // An aligned environment is like the align* environment 260 | // except it operates within math mode. 261 | // Note that we assume \nomallineskiplimit to be zero, 262 | // so that \strut@ is the same as \strut. 263 | defineEnvironment("aligned", { 264 | }, function(context) { 265 | var res = { 266 | type: "array", 267 | style: "aligned", 268 | cols: [], 269 | }; 270 | res = parseArray(context.parser, res); 271 | var emptyGroup = new ParseNode("ordgroup", [], context.mode); 272 | var numCols = 0; 273 | res.value.body.forEach(function(row) { 274 | var i; 275 | for (i = 1; i < row.length; i += 2) { 276 | row[i].value.unshift(emptyGroup); 277 | } 278 | if (numCols < row.length) { 279 | numCols = row.length; 280 | } 281 | }); 282 | for (var i = 0; i < numCols; ++i) { 283 | var align = "r"; 284 | var pregap = 0; 285 | if (i % 2 === 1) { 286 | align = "l"; 287 | } else if (i > 0) { 288 | pregap = 2; // one \qquad between columns 289 | } 290 | res.value.cols[i] = { 291 | type: "align", 292 | align: align, 293 | pregap: pregap, 294 | postgap: 0, 295 | }; 296 | } 297 | return res; 298 | }); 299 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/fontMetrics.js: -------------------------------------------------------------------------------- 1 | /* eslint no-unused-vars:0 */ 2 | 3 | var Style = require("./Style"); 4 | 5 | /** 6 | * This file contains metrics regarding fonts and individual symbols. The sigma 7 | * and xi variables, as well as the metricMap map contain data extracted from 8 | * TeX, TeX font metrics, and the TTF files. These data are then exposed via the 9 | * `metrics` variable and the getCharacterMetrics function. 
10 | */ 11 | 12 | // These font metrics are extracted from TeX by using 13 | // \font\a=cmmi10 14 | // \showthe\fontdimenX\a 15 | // where X is the corresponding variable number. These correspond to the font 16 | // parameters of the symbol fonts. In TeX, there are actually three sets of 17 | // dimensions, one for each of textstyle, scriptstyle, and scriptscriptstyle, 18 | // but we only use the textstyle ones, and scale certain dimensions accordingly. 19 | // See the TeXbook, page 441. 20 | var sigma1 = 0.025; 21 | var sigma2 = 0; 22 | var sigma3 = 0; 23 | var sigma4 = 0; 24 | var sigma5 = 0.431; 25 | var sigma6 = 1; 26 | var sigma7 = 0; 27 | var sigma8 = 0.677; 28 | var sigma9 = 0.394; 29 | var sigma10 = 0.444; 30 | var sigma11 = 0.686; 31 | var sigma12 = 0.345; 32 | var sigma13 = 0.413; 33 | var sigma14 = 0.363; 34 | var sigma15 = 0.289; 35 | var sigma16 = 0.150; 36 | var sigma17 = 0.247; 37 | var sigma18 = 0.386; 38 | var sigma19 = 0.050; 39 | var sigma20 = 2.390; 40 | var sigma21 = 1.01; 41 | var sigma21Script = 0.81; 42 | var sigma21ScriptScript = 0.71; 43 | var sigma22 = 0.250; 44 | 45 | // These font metrics are extracted from TeX by using 46 | // \font\a=cmex10 47 | // \showthe\fontdimenX\a 48 | // where X is the corresponding variable number. These correspond to the font 49 | // parameters of the extension fonts (family 3). See the TeXbook, page 441. 50 | var xi1 = 0; 51 | var xi2 = 0; 52 | var xi3 = 0; 53 | var xi4 = 0; 54 | var xi5 = 0.431; 55 | var xi6 = 1; 56 | var xi7 = 0; 57 | var xi8 = 0.04; 58 | var xi9 = 0.111; 59 | var xi10 = 0.166; 60 | var xi11 = 0.2; 61 | var xi12 = 0.6; 62 | var xi13 = 0.1; 63 | 64 | // This value determines how large a pt is, for metrics which are defined in 65 | // terms of pts. 66 | // This value is also used in katex.less; if you change it make sure the values 67 | // match. 68 | var ptPerEm = 10.0; 69 | 70 | // The space between adjacent `|` columns in an array definition. 
From 71 | // `\showthe\doublerulesep` in LaTeX. 72 | var doubleRuleSep = 2.0 / ptPerEm; 73 | 74 | /** 75 | * This is just a mapping from common names to real metrics 76 | */ 77 | var metrics = { 78 | xHeight: sigma5, 79 | quad: sigma6, 80 | num1: sigma8, 81 | num2: sigma9, 82 | num3: sigma10, 83 | denom1: sigma11, 84 | denom2: sigma12, 85 | sup1: sigma13, 86 | sup2: sigma14, 87 | sup3: sigma15, 88 | sub1: sigma16, 89 | sub2: sigma17, 90 | supDrop: sigma18, 91 | subDrop: sigma19, 92 | axisHeight: sigma22, 93 | defaultRuleThickness: xi8, 94 | bigOpSpacing1: xi9, 95 | bigOpSpacing2: xi10, 96 | bigOpSpacing3: xi11, 97 | bigOpSpacing4: xi12, 98 | bigOpSpacing5: xi13, 99 | ptPerEm: ptPerEm, 100 | emPerEx: sigma5 / sigma6, 101 | doubleRuleSep: doubleRuleSep, 102 | 103 | // TODO(alpert): Missing parallel structure here. We should probably add 104 | // style-specific metrics for all of these. 105 | delim1: sigma20, 106 | getDelim2: function(style) { 107 | if (style.size === Style.TEXT.size) { 108 | return sigma21; 109 | } else if (style.size === Style.SCRIPT.size) { 110 | return sigma21Script; 111 | } else if (style.size === Style.SCRIPTSCRIPT.size) { 112 | return sigma21ScriptScript; 113 | } 114 | throw new Error("Unexpected style size: " + style.size); 115 | }, 116 | }; 117 | 118 | // This map contains a mapping from font name and character code to character 119 | // metrics, including height, depth, italic correction, and skew (kern from the 120 | // character to the corresponding \skewchar) 121 | // This map is generated via `make metrics`. It should not be changed manually. 122 | var metricMap = require("./fontMetricsData"); 123 | 124 | /** 125 | * This function is a convenience function for looking up information in the 126 | * metricMap table. It takes a character as a string, and a style. 127 | * 128 | * Note: the `width` property may be undefined if fontMetricsData.js wasn't 129 | * built using `Make extended_metrics`. 
130 | */ 131 | var getCharacterMetrics = function(character, style) { 132 | var metrics = metricMap[style][character.charCodeAt(0)]; 133 | if (metrics) { 134 | return { 135 | depth: metrics[0], 136 | height: metrics[1], 137 | italic: metrics[2], 138 | skew: metrics[3], 139 | width: metrics[4], 140 | }; 141 | } 142 | }; 143 | 144 | module.exports = { 145 | metrics: metrics, 146 | getCharacterMetrics: getCharacterMetrics, 147 | }; 148 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/mathMLTree.js: -------------------------------------------------------------------------------- 1 | /** 2 | * These objects store data about MathML nodes. This is the MathML equivalent 3 | * of the types in domTree.js. Since MathML handles its own rendering, and 4 | * since we're mainly using MathML to improve accessibility, we don't manage 5 | * any of the styling state that the plain DOM nodes do. 6 | * 7 | * The `toNode` and `toMarkup` functions work simlarly to how they do in 8 | * domTree.js, creating namespaced DOM nodes and HTML text markup respectively. 9 | */ 10 | 11 | var utils = require("./utils"); 12 | 13 | /** 14 | * This node represents a general purpose MathML node of any type. The 15 | * constructor requires the type of node to create (for example, `"mo"` or 16 | * `"mspace"`, corresponding to `` and `` tags). 17 | */ 18 | function MathNode(type, children) { 19 | this.type = type; 20 | this.attributes = {}; 21 | this.children = children || []; 22 | } 23 | 24 | /** 25 | * Sets an attribute on a MathML node. MathML depends on attributes to convey a 26 | * semantic content, so this is used heavily. 27 | */ 28 | MathNode.prototype.setAttribute = function(name, value) { 29 | this.attributes[name] = value; 30 | }; 31 | 32 | /** 33 | * Converts the math node into a MathML-namespaced DOM element. 
34 | */ 35 | MathNode.prototype.toNode = function() { 36 | var node = document.createElementNS( 37 | "http://www.w3.org/1998/Math/MathML", this.type); 38 | 39 | for (var attr in this.attributes) { 40 | if (Object.prototype.hasOwnProperty.call(this.attributes, attr)) { 41 | node.setAttribute(attr, this.attributes[attr]); 42 | } 43 | } 44 | 45 | for (var i = 0; i < this.children.length; i++) { 46 | node.appendChild(this.children[i].toNode()); 47 | } 48 | 49 | return node; 50 | }; 51 | 52 | /** 53 | * Converts the math node into an HTML markup string. 54 | */ 55 | MathNode.prototype.toMarkup = function() { 56 | var markup = "<" + this.type; 57 | 58 | // Add the attributes 59 | for (var attr in this.attributes) { 60 | if (Object.prototype.hasOwnProperty.call(this.attributes, attr)) { 61 | markup += " " + attr + "=\""; 62 | markup += utils.escape(this.attributes[attr]); 63 | markup += "\""; 64 | } 65 | } 66 | 67 | markup += ">"; 68 | 69 | for (var i = 0; i < this.children.length; i++) { 70 | markup += this.children[i].toMarkup(); 71 | } 72 | 73 | markup += ""; 74 | 75 | return markup; 76 | }; 77 | 78 | /** 79 | * This node represents a piece of text. 80 | */ 81 | function TextNode(text) { 82 | this.text = text; 83 | } 84 | 85 | /** 86 | * Converts the text node into a DOM text node. 87 | */ 88 | TextNode.prototype.toNode = function() { 89 | return document.createTextNode(this.text); 90 | }; 91 | 92 | /** 93 | * Converts the text node into HTML markup (which is just the text itself). 94 | */ 95 | TextNode.prototype.toMarkup = function() { 96 | return utils.escape(this.text); 97 | }; 98 | 99 | module.exports = { 100 | MathNode: MathNode, 101 | TextNode: TextNode, 102 | }; 103 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/parseData.js: -------------------------------------------------------------------------------- 1 | /** 2 | * The resulting parse tree nodes of the parse tree. 
3 | */ 4 | function ParseNode(type, value, mode) { 5 | this.type = type; 6 | this.value = value; 7 | this.mode = mode; 8 | } 9 | 10 | module.exports = { 11 | ParseNode: ParseNode, 12 | }; 13 | 14 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/parseTree.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Provides a single function for parsing an expression using a Parser 3 | * TODO(emily): Remove this 4 | */ 5 | 6 | var Parser = require("./Parser"); 7 | 8 | /** 9 | * Parses an expression using a Parser, then returns the parsed result. 10 | */ 11 | var parseTree = function(toParse, settings) { 12 | var parser = new Parser(toParse, settings); 13 | 14 | return parser.parse(); 15 | }; 16 | 17 | module.exports = parseTree; 18 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/katex/src/utils.js: -------------------------------------------------------------------------------- 1 | /** 2 | * This file contains a list of utility functions which are useful in other 3 | * files. 4 | */ 5 | 6 | /** 7 | * Provide an `indexOf` function which works in IE8, but defers to native if 8 | * possible. 
9 | */ 10 | var nativeIndexOf = Array.prototype.indexOf; 11 | var indexOf = function(list, elem) { 12 | if (list == null) { 13 | return -1; 14 | } 15 | if (nativeIndexOf && list.indexOf === nativeIndexOf) { 16 | return list.indexOf(elem); 17 | } 18 | var i = 0; 19 | var l = list.length; 20 | for (; i < l; i++) { 21 | if (list[i] === elem) { 22 | return i; 23 | } 24 | } 25 | return -1; 26 | }; 27 | 28 | /** 29 | * Return whether an element is contained in a list 30 | */ 31 | var contains = function(list, elem) { 32 | return indexOf(list, elem) !== -1; 33 | }; 34 | 35 | /** 36 | * Provide a default value if a setting is undefined 37 | */ 38 | var deflt = function(setting, defaultIfUndefined) { 39 | return setting === undefined ? defaultIfUndefined : setting; 40 | }; 41 | 42 | // hyphenate and escape adapted from Facebook's React under Apache 2 license 43 | 44 | var uppercase = /([A-Z])/g; 45 | var hyphenate = function(str) { 46 | return str.replace(uppercase, "-$1").toLowerCase(); 47 | }; 48 | 49 | var ESCAPE_LOOKUP = { 50 | "&": "&", 51 | ">": ">", 52 | "<": "<", 53 | "\"": """, 54 | "'": "'", 55 | }; 56 | 57 | var ESCAPE_REGEX = /[&><"']/g; 58 | 59 | function escaper(match) { 60 | return ESCAPE_LOOKUP[match]; 61 | } 62 | 63 | /** 64 | * Escapes text to prevent scripting attacks. 65 | * 66 | * @param {*} text Text value to escape. 67 | * @return {string} An escaped string. 68 | */ 69 | function escape(text) { 70 | return ("" + text).replace(ESCAPE_REGEX, escaper); 71 | } 72 | 73 | /** 74 | * A function to set the text content of a DOM element in all supported 75 | * browsers. Note that we don't define this if there is no document. 
76 | */ 77 | var setTextContent; 78 | if (typeof document !== "undefined") { 79 | var testNode = document.createElement("span"); 80 | if ("textContent" in testNode) { 81 | setTextContent = function(node, text) { 82 | node.textContent = text; 83 | }; 84 | } else { 85 | setTextContent = function(node, text) { 86 | node.innerText = text; 87 | }; 88 | } 89 | } 90 | 91 | /** 92 | * A function to clear a node. 93 | */ 94 | function clearNode(node) { 95 | setTextContent(node, ""); 96 | } 97 | 98 | module.exports = { 99 | contains: contains, 100 | deflt: deflt, 101 | escape: escape, 102 | hyphenate: hyphenate, 103 | indexOf: indexOf, 104 | setTextContent: setTextContent, 105 | clearNode: clearNode, 106 | }; 107 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/match-at/README.md: -------------------------------------------------------------------------------- 1 | # match-at [![Build Status](https://travis-ci.org/spicyj/match-at.svg?branch=master)](https://travis-ci.org/spicyj/match-at) 2 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/match-at/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "match-at", 3 | "version": "0.1.0", 4 | "description": "Relocatable regular expressions.", 5 | "repository": { 6 | "type": "git", 7 | "url": "https://github.com/spicyj/match-at" 8 | }, 9 | "main": "lib/matchAt.js", 10 | "files": [ 11 | "lib/" 12 | ], 13 | "devDependencies": { 14 | "babel": "^4.7.16", 15 | "jest-cli": "^0.4.0", 16 | "react-tools": "^0.13.1" 17 | }, 18 | "jest": { 19 | "scriptPreprocessor": "/jestSupport/preprocessor.js", 20 | "unmockedModulePathPatterns": [ 21 | "" 22 | ] 23 | }, 24 | "scripts": { 25 | "prepublish": "babel -d lib/ src/", 26 | "test": "jest" 27 | }, 28 | "gitHead": "4197daff69720734c72ba3321ed68a41c0527fb2", 29 | "bugs": { 30 | "url": 
"https://github.com/spicyj/match-at/issues" 31 | }, 32 | "homepage": "https://github.com/spicyj/match-at", 33 | "_id": "match-at@0.1.0", 34 | "_shasum": "f561e7709ff9a105b85cc62c6b8ee7c15bf24f31", 35 | "_from": "match-at@", 36 | "_npmVersion": "2.2.0", 37 | "_nodeVersion": "0.10.35", 38 | "_npmUser": { 39 | "name": "spicyj", 40 | "email": "ben@benalpert.com" 41 | }, 42 | "maintainers": [ 43 | { 44 | "name": "spicyj", 45 | "email": "ben@benalpert.com" 46 | } 47 | ], 48 | "dist": { 49 | "shasum": "f561e7709ff9a105b85cc62c6b8ee7c15bf24f31", 50 | "tarball": "https://registry.npmjs.org/match-at/-/match-at-0.1.0.tgz" 51 | }, 52 | "directories": {}, 53 | "_resolved": "https://registry.npmjs.org/match-at/-/match-at-0.1.0.tgz" 54 | } 55 | -------------------------------------------------------------------------------- /thirdparty/harvardnlp_im2markup/third_party/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while(<REF>) { 51 | chop; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while(<STDIN>) { 60 | chop; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)."
= abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}
\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}
\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}
\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}
\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | sub my_log { 172 | return -9999999999 unless $_[0]; 173 | return log($_[0]); 174 | } 175 | 176 | --------------------------------------------------------------------------------