├── README.md ├── LICENSE └── tokenize.sh /README.md: -------------------------------------------------------------------------------- 1 | # opennmt-tokenize 2 | opennmt-tokenize 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 floydhub 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
#!/bin/bash
#
# Bash wrapper to invoke OpenNMT's tokenize script.
# For more details, see https://github.com/OpenNMT/OpenNMT/tree/master/tools
#
# Usage:
#   tokenize.sh --input=FILE --output=FILE [--mode=MODE]
#               [--sep_annotate=true|false] [--case_feature=true|false]
#               [--bpe_model=PATH]
#
# Every option is written as name=value and may be spelled with one or two
# leading dashes (-i= / --input=, -o= / --output=, -mode= / --mode=, ...).
# The historical misspelling --sep_annonate= is still accepted.
# Unknown options are ignored (with a warning on stderr).
#
# Environment:
#   OPENNMT_DIR  directory to run tools/tokenize.lua from (default: /opennmt)
#
# The script changes into OPENNMT_DIR *before* opening FILE, so relative
# input/output paths are resolved against that directory.

#######################################
# Case-insensitive string equality.
# The body is a subshell so that enabling nocasematch does not leak into
# the caller (where it would also relax matching of the option names).
# Arguments: $1 - value to test, $2 - word to compare against
# Returns:   0 when equal ignoring letter case, 1 otherwise
#######################################
equals_nocase() (
  shopt -s nocasematch
  [[ "$1" == "$2" ]]
)

#######################################
# Parse name=value command line options.
# Globals (written): INPUT, OUTPUT, MODE_ARGS, SEP_ANNOTATE_ARGS,
#                    CASE_FEATURE_ARGS, BPE_MODEL_ARGS
# Arguments: the script's command line
# Outputs:   warnings on stderr
# Returns:   0
#######################################
parse_args() {
  INPUT=""
  OUTPUT=""
  # Each group of tokenizer arguments is kept as an array so that values
  # containing whitespace reach tokenize.lua as a single word.
  MODE_ARGS=()
  SEP_ANNOTATE_ARGS=()
  CASE_FEATURE_ARGS=()
  BPE_MODEL_ARGS=()

  local arg value
  for arg in "$@"; do
    value="${arg#*=}"
    case "$arg" in
      -i=* | --input=*)
        INPUT="$value"
        ;;
      -o=* | --output=*)
        OUTPUT="$value"
        ;;
      -mode=* | --mode=*)
        # An empty value would forward a dangling "-mode"; drop it instead.
        if [[ -n "$value" ]]; then
          MODE_ARGS=(-mode "$value")
        else
          MODE_ARGS=()
        fi
        ;;
      -sep_annotate=* | --sep_annotate=* | -sep_annonate=* | --sep_annonate=*)
        # NOTE(review): the flag forwarded here used to be the misspelled
        # "-sep_annonate". "-sep_annotate" is the spelling of the upstream
        # option as far as I can tell (newer releases call it
        # "-joiner_annotate") - confirm against the installed tokenize.lua.
        if equals_nocase "$value" "true"; then
          SEP_ANNOTATE_ARGS=(-sep_annotate)
        else
          SEP_ANNOTATE_ARGS=()
        fi
        ;;
      -case_feature=* | --case_feature=*)
        if equals_nocase "$value" "true"; then
          CASE_FEATURE_ARGS=(-case_feature)
        else
          CASE_FEATURE_ARGS=()
        fi
        ;;
      -bpe_model=* | --bpe_model=*)
        # tokenize.lua's -bpe_model takes the path of a BPE model file, so
        # the value is forwarded instead of being reduced to a bare flag.
        # Empty and "false" keep their old meaning: BPE disabled.
        if [[ -z "$value" ]] || equals_nocase "$value" "false"; then
          BPE_MODEL_ARGS=()
        else
          if equals_nocase "$value" "true"; then
            printf '%s\n' \
              "tokenize.sh: warning: --bpe_model expects a model path; '${value}' is forwarded as a path" >&2
          fi
          BPE_MODEL_ARGS=(-bpe_model "$value")
        fi
        ;;
      *)
        # Unknown options were always ignored; keep that, but say so.
        printf 'tokenize.sh: warning: ignoring unknown option: %s\n' "$arg" >&2
        ;;
    esac
  done
}

#######################################
# Entry point: parse options, report them, run the tokenizer.
# Globals:   OPENNMT_DIR (read), plus everything parse_args writes
# Arguments: the script's command line
# Outputs:   the selected tokenizer options on stdout; errors on stderr
# Returns:   2 on usage error, 1 if OPENNMT_DIR is unusable, otherwise
#            the exit status of the tokenizer
#######################################
main() {
  parse_args "$@"

  if [[ -z "$INPUT" || -z "$OUTPUT" ]]; then
    printf '%s\n' \
      "tokenize.sh: both --input=FILE and --output=FILE are required" \
      "Usage: tokenize.sh --input=FILE --output=FILE [--mode=MODE] [--sep_annotate=true|false] [--case_feature=true|false] [--bpe_model=PATH]" >&2
    return 2
  fi

  # Labels are kept exactly as they have always been printed.
  echo "MODE = ${MODE_ARGS[*]}"
  echo "SEP_ANNONATE = ${SEP_ANNOTATE_ARGS[*]}"
  echo "CASE_FEATURE = ${CASE_FEATURE_ARGS[*]}"
  echo "BPE_MODEL = ${BPE_MODEL_ARGS[*]}"

  local opennmt_dir="${OPENNMT_DIR:-/opennmt}"
  if ! cd "$opennmt_dir"; then
    printf 'tokenize.sh: cannot change into %s\n' "$opennmt_dir" >&2
    return 1
  fi

  th tools/tokenize.lua \
    "${MODE_ARGS[@]}" \
    "${SEP_ANNOTATE_ARGS[@]}" \
    "${CASE_FEATURE_ARGS[@]}" \
    "${BPE_MODEL_ARGS[@]}" \
    < "$INPUT" > "$OUTPUT"
}

main "$@"