├── NLP4J ├── bin │ ├── nlpdecode │ ├── nlpdecode.bat │ ├── version │ └── version.bat ├── config-CRAFT.xml ├── config-GENIA.xml ├── etc │ └── log4j.properties ├── lexica │ ├── en-ambiguity-classes-simplified-lowercase.xz │ └── en-brown-clusters-simplified-lowercase.xz ├── models │ ├── CRAFT.DEP.model.xz │ ├── CRAFT.POS.model.xz │ ├── GENIA.DEP.model.xz │ └── GENIA.POS.model.xz └── repo │ ├── args4j │ └── args4j │ │ └── 2.32 │ │ └── args4j-2.32.jar │ ├── edu │ └── emory │ │ └── mathcs │ │ └── nlp │ │ ├── nlp4j-api │ │ └── 1.1.4-SNAPSHOT │ │ │ └── nlp4j-api-1.1.4-SNAPSHOT.jar │ │ └── nlp4j-cli │ │ └── 1.1.4-SNAPSHOT │ │ └── nlp4j-cli-1.1.4-SNAPSHOT.jar │ ├── it │ └── unimi │ │ └── dsi │ │ └── fastutil │ │ └── 7.0.12 │ │ └── fastutil-7.0.12.jar │ ├── log4j │ └── log4j │ │ └── 1.2.17 │ │ └── log4j-1.2.17.jar │ └── org │ ├── apache │ └── commons │ │ ├── commons-csv │ │ └── 1.2 │ │ │ └── commons-csv-1.2.jar │ │ └── commons-math3 │ │ └── 3.5 │ │ └── commons-math3-3.5.jar │ ├── magicwerk │ └── brownies-collections │ │ └── 0.9.13 │ │ └── brownies-collections-0.9.13.jar │ ├── slf4j │ ├── slf4j-api │ │ └── 1.7.21 │ │ │ └── slf4j-api-1.7.21.jar │ └── slf4j-log4j12 │ │ └── 1.7.21 │ │ └── slf4j-log4j12-1.7.21.jar │ └── tukaani │ └── xz │ └── 1.5 │ └── xz-1.5.jar ├── README.md ├── StanfordBiaffineParser-v2 ├── config │ ├── CRAFT.cfg │ ├── GENIA.cfg │ ├── defaults.cfg │ └── template.cfg ├── main.py └── parser │ ├── __init__.py │ ├── bucket.py │ ├── configurable.py │ ├── dataset.py │ ├── misc │ ├── __init__.py │ ├── bucketer.py │ ├── colors.py │ ├── get_encoding.py │ ├── mst.py │ └── zipf.py │ ├── multibucket.py │ ├── network.py │ ├── neural │ ├── __init__.py │ ├── functions.py │ ├── linalg.py │ ├── models │ │ ├── __init__.py │ │ ├── embeds │ │ │ ├── __init__.py │ │ │ ├── base_embed.py │ │ │ ├── cnn_embed.py │ │ │ ├── mlp_embed.py │ │ │ └── rnn_embed.py │ │ ├── nlp │ │ │ ├── __init__.py │ │ │ ├── parsers │ │ │ │ ├── __init__.py │ │ │ │ ├── base_parser.py │ │ │ │ ├── bin_parser.py │ │ │ │ ├── fish_parser.py │ │ │ │ ├── gama_parser.py │ │ │ │ ├── parser.py │ │ │ │ └── xbar_parser.py │ │ │ └── taggers │ │ │ │ ├── __init__.py │ │ │ │ ├── base_tagger.py │ │ │ │ ├── base_xtagger.py │ │ │ │ ├── tagger.py │ │ │ │ └── xtagger.py │ │ └── nn.py │ ├── optimizers │ │ ├── __init__.py │ │ ├── base_optimizer.py │ │ ├── radam_optimizer.py │ │ └── sgd_optimizer.py │ ├── recur_cells │ │ ├── .directory │ │ ├── __init__.py │ │ ├── base_cell.py │ │ ├── cif_lstm_cell.py │ │ ├── gru_cell.py │ │ ├── lstm_cell.py │ │ └── rnn_cell.py │ └── rnn.py │ ├── scripts │ ├── compression_ratio.py │ ├── count_nonprojective.py │ ├── heaps_law.py │ └── reinsert_compounds.py │ ├── trash │ ├── retrained_vocab.py │ └── weighted_mean.py │ └── vocabs │ ├── __init__.py │ ├── base_vocab.py │ ├── index_vocab.py │ ├── multivocab.py │ ├── ngram_multivocab.py │ ├── ngram_vocab.py │ ├── pretrained_vocab.py │ ├── subtoken_vocab.py │ └── token_vocab.py ├── convert_NLP4J_to_CoNLL.py ├── data ├── raw.txt ├── sentence_segmented.txt ├── tokenized_sentence_segmented.txt └── tokenized_sentence_segmented.txt.column ├── get_ColumnFormat.py └── jPTDP-v1 └── README.md /NLP4J/bin/nlpdecode: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 
1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar:"$REPO"/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar:"$REPO"/org/tukaani/xz/1.5/xz-1.5.jar:"$REPO"/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar:"$REPO"/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar:"$REPO"/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar:"$REPO"/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar:"$REPO"/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar:"$REPO"/log4j/log4j/1.2.17/log4j-1.2.17.jar:"$REPO"/args4j/args4j/2.32/args4j-2.32.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xmx8g -XX:+UseConcMarkSweepGC \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="nlpdecode" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | edu.emory.mathcs.nlp.bin.NLPDecode \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /NLP4J/bin/nlpdecode.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 
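@REM Windows counterpart of bin/nlpdecode: it assembles the NLP4J classpath from the
@REM bundled repo\ directory and launches edu.emory.mathcs.nlp.bin.NLPDecode with -Xmx8g.
@REM A hypothetical invocation mirroring the Unix examples in README.md (run from the
@REM NLP4J directory; adjust the relative paths to your own layout):
@REM   bin\nlpdecode.bat -c config-GENIA.xml -i ..\data\raw.txt -format raw -oe genia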
19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | set BASEDIR=%~dp0\.. 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-english\1.1.2\nlp4j-english-1.1.2.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-api\1.1.4-SNAPSHOT\nlp4j-api-1.1.4-SNAPSHOT.jar;"%REPO%"\org\slf4j\slf4j-api\1.7.21\slf4j-api-1.7.21.jar;"%REPO%"\org\tukaani\xz\1.5\xz-1.5.jar;"%REPO%"\it\unimi\dsi\fastutil\7.0.12\fastutil-7.0.12.jar;"%REPO%"\org\magicwerk\brownies-collections\0.9.13\brownies-collections-0.9.13.jar;"%REPO%"\org\apache\commons\commons-math3\3.5\commons-math3-3.5.jar;"%REPO%"\org\apache\commons\commons-csv\1.2\commons-csv-1.2.jar;"%REPO%"\org\slf4j\slf4j-log4j12\1.7.21\slf4j-log4j12-1.7.21.jar;"%REPO%"\log4j\log4j\1.2.17\log4j-1.2.17.jar;"%REPO%"\args4j\args4j\2.32\args4j-2.32.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-cli\1.1.4-SNAPSHOT\nlp4j-cli-1.1.4-SNAPSHOT.jar 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xmx8g -XX:+UseConcMarkSweepGC -classpath %CLASSPATH% -Dapp.name="nlpdecode" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.emory.mathcs.nlp.bin.NLPDecode %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 
104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /NLP4J/bin/version: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 
1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar:"$REPO"/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar:"$REPO"/org/tukaani/xz/1.5/xz-1.5.jar:"$REPO"/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar:"$REPO"/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar:"$REPO"/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar:"$REPO"/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar:"$REPO"/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar:"$REPO"/log4j/log4j/1.2.17/log4j-1.2.17.jar:"$REPO"/args4j/args4j/2.32/args4j-2.32.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xmx10g -XX:+UseConcMarkSweepGC \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="version" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | edu.emory.mathcs.nlp.bin.Version \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /NLP4J/bin/version.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 
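@REM Windows counterpart of bin/version: it builds the same classpath as nlpdecode.bat
@REM and launches edu.emory.mathcs.nlp.bin.Version, which is expected to simply report
@REM the bundled NLP4J build version; no configuration file or input data is required:
@REM   bin\version.bat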
19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | set BASEDIR=%~dp0\.. 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-english\1.1.2\nlp4j-english-1.1.2.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-api\1.1.4-SNAPSHOT\nlp4j-api-1.1.4-SNAPSHOT.jar;"%REPO%"\org\slf4j\slf4j-api\1.7.21\slf4j-api-1.7.21.jar;"%REPO%"\org\tukaani\xz\1.5\xz-1.5.jar;"%REPO%"\it\unimi\dsi\fastutil\7.0.12\fastutil-7.0.12.jar;"%REPO%"\org\magicwerk\brownies-collections\0.9.13\brownies-collections-0.9.13.jar;"%REPO%"\org\apache\commons\commons-math3\3.5\commons-math3-3.5.jar;"%REPO%"\org\apache\commons\commons-csv\1.2\commons-csv-1.2.jar;"%REPO%"\org\slf4j\slf4j-log4j12\1.7.21\slf4j-log4j12-1.7.21.jar;"%REPO%"\log4j\log4j\1.2.17\log4j-1.2.17.jar;"%REPO%"\args4j\args4j\2.32\args4j-2.32.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-cli\1.1.4-SNAPSHOT\nlp4j-cli-1.1.4-SNAPSHOT.jar 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xmx10g -XX:+UseConcMarkSweepGC -classpath %CLASSPATH% -Dapp.name="version" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.emory.mathcs.nlp.bin.Version %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 
104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /NLP4J/config-CRAFT.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | lexica/en-ambiguity-classes-simplified-lowercase.xz 11 | lexica/en-brown-clusters-simplified-lowercase.xz 12 | 13 | 14 | 15 | models/CRAFT.POS.model.xz 16 | models/CRAFT.DEP.model.xz 17 | 18 | 19 | -------------------------------------------------------------------------------- /NLP4J/config-GENIA.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | lexica/en-ambiguity-classes-simplified-lowercase.xz 11 | lexica/en-brown-clusters-simplified-lowercase.xz 12 | 13 | 14 | 15 | models/GENIA.POS.model.xz 16 | models/GENIA.DEP.model.xz 17 | 18 | 19 | -------------------------------------------------------------------------------- /NLP4J/etc/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=INFO, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.conversionPattern=%m%n 10 | -------------------------------------------------------------------------------- /NLP4J/lexica/en-ambiguity-classes-simplified-lowercase.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/lexica/en-ambiguity-classes-simplified-lowercase.xz -------------------------------------------------------------------------------- /NLP4J/lexica/en-brown-clusters-simplified-lowercase.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/lexica/en-brown-clusters-simplified-lowercase.xz -------------------------------------------------------------------------------- /NLP4J/models/CRAFT.DEP.model.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/CRAFT.DEP.model.xz -------------------------------------------------------------------------------- /NLP4J/models/CRAFT.POS.model.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/CRAFT.POS.model.xz -------------------------------------------------------------------------------- /NLP4J/models/GENIA.DEP.model.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/GENIA.DEP.model.xz -------------------------------------------------------------------------------- /NLP4J/models/GENIA.POS.model.xz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/GENIA.POS.model.xz -------------------------------------------------------------------------------- /NLP4J/repo/args4j/args4j/2.32/args4j-2.32.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/args4j/args4j/2.32/args4j-2.32.jar -------------------------------------------------------------------------------- /NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar -------------------------------------------------------------------------------- /NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar -------------------------------------------------------------------------------- /NLP4J/repo/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar -------------------------------------------------------------------------------- /NLP4J/repo/log4j/log4j/1.2.17/log4j-1.2.17.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/log4j/log4j/1.2.17/log4j-1.2.17.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/tukaani/xz/1.5/xz-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/tukaani/xz/1.5/xz-1.5.jar -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Biomedical POS tagging and dependency parsing models 3 | 4 | Biomedical POS tagging and dependency parsing models are trained on [GENIA](http://www.geniaproject.org/) and [CRAFT](http://BioPosDep-corpora.sourceforge.net/CRAFT/). See [our following paper](https://arxiv.org/abs/1808.03731) for more details: 5 | 6 | @Article{NguyenK2019, 7 | author="Nguyen, Dat Quoc and Verspoor, Karin", 8 | title="From POS tagging to dependency parsing for biomedical event extraction", 9 | journal="BMC Bioinformatics", 10 | year="2019", 11 | month="Feb", 12 | day="12", 13 | volume="20", 14 | number="1", 15 | pages="72", 16 | doi="10.1186/s12859-019-2604-0", 17 | url="https://doi.org/10.1186/s12859-019-2604-0" 18 | } 19 | 20 | Our models are **free** for non-commercial use and distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International ([CC BY-NC-SA](https://creativecommons.org/licenses/by-nc-sa/4.0/)) License. 21 | 22 | pos dep 23 | 24 | # Usage 25 | 26 | #### The first step is to perform POS tagging and dependency parsing using [NLP4J](https://emorynlp.github.io/nlp4j/) models. Here, NLP4J would also perform _TOKENIZATION_ and _SENTENCE SEGMENTATION_ if input files are raw text corpora. Then, the output of NLP4J will be used as input for other dependency parsing models. 27 | 28 | ### Perform biomedical POS tagging and dependency parsing using retrained NLP4J models 29 | 30 | #### Installation 31 | 32 | Download NLP4J models from [https://github.com/datquocnguyen/BioPosDep/archive/master.zip](https://github.com/datquocnguyen/BioPosDep/archive/master.zip) (70MB) or clone these models using `git`: 33 | 34 | $ git clone https://github.com/datquocnguyen/BioPosDep.git 35 | 36 | To run the models, it is expected that `Java` is already set to run in command line or terminal. 37 | 38 | #### Command line 39 | 40 | # Using models trained on GENIA 41 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i -format [-ie -oe ] 42 | 43 | # Using models trained on CRAFT 44 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i -format [-ie -oe ] 45 | 46 | -i : input path (required) 47 | -format : format of the input data (raw|line|tsv; default: raw) 48 | -ie : input file extension (default: *) 49 | -oe : output file extension (default: nlp) 50 | 51 | - `-i` specifies the input path pointing to either a file or a directory. When the path points to a file, only the specific file is processed. 
When the path points to a directory, all files with the file extension `-ie` under the specific directory are processed. 52 | - `-format` specifies the format of the input file: `raw`, `line`, or `tsv` 53 | - `raw` accepts texts in any format 54 | - `line` expects a sentence per line 55 | - `tsv` expects columns delimited by `\t` and sentences separated by `\n` 56 | - `-ie` specifies the input file extension. The default value `*` implies files with any extension. This option is used only when the input path `-i` points to a directory. 57 | - `-oe` specifies the output file extension appended to each input filename. The corresponding output file, consisting of the NLP output, will be generated. 58 | 59 | #### Examples 60 | 61 | # For a raw corpus input 62 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/raw.txt -format raw -oe genia 63 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/raw.txt -format raw -oe craft 64 | 65 | # For a sentence-segmented corpus input (without tokenization!) 66 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/sentence_segmented.txt -format line -oe genia 67 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/sentence_segmented.txt -format line -oe craft 68 | 69 | # For a "pre-processed" tokenized and sentence-segmented corpus 70 | # Convert into a column-based format 71 | BioPosDep/NLP4J$ python ../get_ColumnFormat.py ../data/tokenized_sentence_segmented.txt 72 | # Apply models using "tsv". Here we expect word forms at the second column (i.e. column index of 1). 73 | # Adjust in config-GENIA.xml and config-CRAFT.xml if users already have a column-formated corpus with a different index of the word form column. 74 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/tokenized_sentence_segmented.txt.column -format tsv -oe genia 75 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/tokenized_sentence_segmented.txt.column -format tsv -oe craft 76 | 77 | 78 | From the examples above, output files `.genia` and `.craft ` are generated in folder `data`, containing POS and dependency annotations. 79 | 80 | 81 | #### NOTE 82 | Those NLP4J output files are in a 9-column format. To further apply other dependency parsing models, they must be converted to 10-column format: 83 | 84 | # Command line 85 | BioPosDep$ python convert_NLP4J_to_CoNLL.py 86 | 87 | # Examples 88 | BioPosDep$ python convert_NLP4J_to_CoNLL.py data/raw.txt.genia 89 | BioPosDep$ python convert_NLP4J_to_CoNLL.py data/raw.txt.craft 90 | 91 | ##### Two 10-column output files `raw.txt.genia.conll` and `raw.txt.craft.conll` are generated in folder `data`, which will be used as inputs for other models. 92 | 93 | ### Using retrained Stanford [Biaffine](https://github.com/tdozat/Parser-v2) parsing models 94 | 95 | #### Installation 96 | 97 | # Install prerequisite packages 98 | BioPosDep/StanfordBiaffineParser-v2$ virtualenv .TF1_0 99 | BioPosDep/StanfordBiaffineParser-v2$ source .TF1_0/bin/activate 100 | BioPosDep/StanfordBiaffineParser-v2$ pip install tensorflow==1.0 101 | BioPosDep/StanfordBiaffineParser-v2$ pip install numpy==1.11.0 102 | BioPosDep/StanfordBiaffineParser-v2$ pip install scipy==1.0.0 103 | BioPosDep/StanfordBiaffineParser-v2$ pip install matplotlib==2.1.2 104 | BioPosDep/StanfordBiaffineParser-v2$ pip install backports.lzma 105 | 106 | - Download file `Pre-trained-Biaffine-v2.zip` from [HERE](https://drive.google.com/file/d/18IYSJEV0uwbg468lFXejS0Wyw2_8Pjfa/view?usp=sharing). 
107 | - Unzip the file, then copy/move folder `models` and file `PubMed-shuffle-win2-500Kwords.txt` into folder `BioPosDep/StanfordBiaffineParser-v2`. 108 | 109 | 110 | 111 | #### Command line 112 | 113 | # Using model trained on GENIA 114 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/GENIA parse 115 | 116 | # Using model trained on CRAFT 117 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/CRAFT parse 118 | 119 | # Output parsed files are by default saved in the model directory with the same name as the input file. 120 | # NOTE: We can also specify the output directory with the --output_dir flag and/or the output file name with the --output_file flag. 121 | 122 | #### Examples 123 | 124 | # Activate TensorFlow 1.0 before running models: 125 | BioPosDep/StanfordBiaffineParser-v2$ source .TF1_0/bin/activate 126 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/GENIA parse ../data/raw.txt.genia.conll 127 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/CRAFT parse ../data/raw.txt.craft.conll 128 | 129 | Two output parsed files `raw.txt.genia.conll` and `raw.txt.craft.conll` are generated in folders `models/GENIA` and `models/CRAFT`, respectively. 130 | 131 | ### Using retrained jPTDP models 132 | 133 | See [https://github.com/datquocnguyen/jPTDP](https://github.com/datquocnguyen/jPTDP) for details. 134 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/CRAFT.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # Where things are located 3 | [Configurable] 4 | train_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/CRAFT.train.conll.20wayJK.txt 5 | parse_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/CRAFT.dev.conll.20wayJK.txt 6 | 7 | [Pretrained Vocab] 8 | filename = /home/ubuntu/WORKSPACE/PubMed-shuffle-win2-500Kwords.txt 9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix 10 | skip_header = True 11 | 12 | #*************************************************************** 13 | # Embedding hyperparameters 14 | [Char Vocab] 15 | # {RNNEmbed, CNNEmbed, MLPEmbed} 16 | embed_model = RNNEmbed 17 | 18 | # The aggregated word vocab, pretrained vocab, and char vocab 19 | [Multivocab] 20 | # probability of dropping a word embedding 21 | embed_keep_prob = .67 22 | 23 | [Tag Vocab] 24 | # probability of dropping a tag embedding 25 | embed_keep_prob = .67 26 | 27 | [RNN Embed] 28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 29 | recur_cell = LSTMCell 30 | # number of LSTM layers 31 | n_layers = 3 32 | # number of recurrent units 33 | recur_size = 400 34 | # probability of dropping a connection between timesteps at a single layer 35 | recur_keep_prob = .67 36 | # probability of dropping a connection between layers at a single timestep 37 | ff_keep_prob = .67 38 | 39 | #*************************************************************** 40 | # NLP model hyperparameters 41 | [Tagger] 42 | #if you only want it to produce the first column of tags, set this to just 'tags' 43 | output_vocabs = tags:xtags 44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 45 | recur_cell = LSTMCell 46 | # number of LSTM layers 47 | n_layers = 2 48 | # number of recurrent units in each direction of the BiLSTM 49 | recur_size = 400 50 | # number of units in the tag classifier 51 | 
mlp_size = 600 52 | # probability of dropping a node in the MLP or the classifier 53 | mlp_keep_prob = .67 54 | # probability of dropping a connection between timesteps at a single layer 55 | recur_keep_prob = .5 56 | # probability of dropping a connection between layers at a single timestep 57 | ff_keep_prob = .67 58 | 59 | [Parser] 60 | # if you only want it to use the first column of tags, set this to 'words:tags' 61 | input_vocabs = words:tags:xtags 62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 63 | recur_cell = LSTMCell 64 | # number of layers 65 | n_layers = 3 66 | # number of recurrent units 67 | recur_size = 400 68 | # number of units in the edge classifier 69 | arc_mlp_size = 600 70 | # number of units in the label classifier (you probably want this to be small!) 71 | rel_mlp_size = 100 72 | # probability of dropping a node in the MLP or the classifier 73 | mlp_keep_prob = .67 74 | # probability of dropping a connection between timesteps at a single layer 75 | recur_keep_prob = .67 76 | # probability of dropping a connection between layers at a single timestep 77 | ff_keep_prob = .67 78 | 79 | #*************************************************************** 80 | # Training hyperparameters 81 | [Network] 82 | # {Parser, Tagger} 83 | nlp_model = Parser 84 | quit_after_n_iters_without_improvement = 5000 85 | max_train_iters = 20001 86 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/GENIA.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # Where things are located 3 | [Configurable] 4 | train_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/GENIA.train.conll.20wayJK.txt 5 | parse_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/GENIA.dev.conll.20wayJK.txt 6 | 7 | [Pretrained Vocab] 8 | filename = /home/ubuntu/WORKSPACE/PubMed-shuffle-win2-500Kwords.txt 9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix 10 | skip_header = True 11 | 12 | #*************************************************************** 13 | # Embedding hyperparameters 14 | [Char Vocab] 15 | # {RNNEmbed, CNNEmbed, MLPEmbed} 16 | embed_model = RNNEmbed 17 | 18 | # The aggregated word vocab, pretrained vocab, and char vocab 19 | [Multivocab] 20 | # probability of dropping a word embedding 21 | embed_keep_prob = .67 22 | 23 | [Tag Vocab] 24 | # probability of dropping a tag embedding 25 | embed_keep_prob = .67 26 | 27 | [RNN Embed] 28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 29 | recur_cell = LSTMCell 30 | # number of LSTM layers 31 | n_layers = 3 32 | # number of recurrent units 33 | recur_size = 400 34 | # probability of dropping a connection between timesteps at a single layer 35 | recur_keep_prob = .67 36 | # probability of dropping a connection between layers at a single timestep 37 | ff_keep_prob = .67 38 | 39 | #*************************************************************** 40 | # NLP model hyperparameters 41 | [Tagger] 42 | #if you only want it to produce the first column of tags, set this to just 'tags' 43 | output_vocabs = tags:xtags 44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 45 | recur_cell = LSTMCell 46 | # number of LSTM layers 47 | n_layers = 2 48 | # number of recurrent units in each direction of the BiLSTM 49 | recur_size = 400 50 | # number of units in the tag classifier 51 | mlp_size = 600 52 | # probability of dropping a node 
in the MLP or the classifier 53 | mlp_keep_prob = .67 54 | # probability of dropping a connection between timesteps at a single layer 55 | recur_keep_prob = .5 56 | # probability of dropping a connection between layers at a single timestep 57 | ff_keep_prob = .67 58 | 59 | [Parser] 60 | # if you only want it to use the first column of tags, set this to 'words:tags' 61 | input_vocabs = words:tags:xtags 62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 63 | recur_cell = LSTMCell 64 | # number of layers 65 | n_layers = 3 66 | # number of recurrent units 67 | recur_size = 400 68 | # number of units in the edge classifier 69 | arc_mlp_size = 600 70 | # number of units in the label classifier (you probably want this to be small!) 71 | rel_mlp_size = 100 72 | # probability of dropping a node in the MLP or the classifier 73 | mlp_keep_prob = .67 74 | # probability of dropping a connection between timesteps at a single layer 75 | recur_keep_prob = .67 76 | # probability of dropping a connection between layers at a single timestep 77 | ff_keep_prob = .67 78 | 79 | #*************************************************************** 80 | # Training hyperparameters 81 | [Network] 82 | # {Parser, Tagger} 83 | nlp_model = Parser 84 | quit_after_n_iters_without_improvement = 5000 85 | max_train_iters = 20001 86 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/defaults.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # High level stuff 3 | [DEFAULT] 4 | save_dir = saves/defaults 5 | data_dir = data 6 | lc = en 7 | treebank = English 8 | lang = English 9 | 10 | [Configurable] 11 | train_files = %(data_dir)s/CoNLL17/UD_%(treebank)s/%(lc)s-ud-train.conllu 12 | parse_files = %(data_dir)s/CoNLL17/UD_%(treebank)s/%(lc)s-ud-dev.conllu 13 | verbose = True 14 | name = None 15 | 16 | #*************************************************************** 17 | # Vocab data structures 18 | [Base Vocab] 19 | # TODO take special_tokens out of here and put them in the classes 20 | cased = None 21 | embed_size = 100 22 | 23 | [Pretrained Vocab] 24 | special_tokens=::: 25 | skip_header = True 26 | name = pretrained 27 | filename = %(data_dir)s/embeddings/%(lang)s/%(lc)s.vectors.xz 28 | cased = False 29 | max_rank = 0 30 | 31 | [Token Vocab] 32 | name = tokens 33 | embed_keep_prob = .67 34 | min_occur_count = 2 35 | max_rank = 100000 36 | 37 | [Index Vocab] 38 | special_tokens=: 39 | 40 | [Dep Vocab] 41 | name = deps 42 | 43 | [Head Vocab] 44 | name = heads 45 | 46 | [Word Vocab] 47 | special_tokens=::: 48 | name = words 49 | filename = %(save_dir)s/%(name)s.txt 50 | cased = False 51 | 52 | [Lemma Vocab] 53 | name = lemmas 54 | filename = %(save_dir)s/%(name)s.txt 55 | 56 | [Tag Vocab] 57 | special_tokens=PAD:ROOT:DROP:UNK 58 | name = tags 59 | filename = %(save_dir)s/%(name)s.txt 60 | cased = True 61 | 62 | [X Tag Vocab] 63 | name = xtags 64 | filename = %(save_dir)s/%(name)s.txt 65 | 66 | [Rel Vocab] 67 | special_tokens=pad:root:drop:unk 68 | name = rels 69 | filename = %(save_dir)s/%(name)s.txt 70 | cased = True 71 | 72 | [Subtoken Vocab] 73 | max_rank = 0 74 | # TODO Setting this to more than 1 triggers a bug 75 | n_buckets = 2 76 | embed_model = CNNEmbed 77 | embed_keep_prob = 1 78 | 79 | [Char Vocab] 80 | special_tokens = :::::: 81 | name = chars 82 | filename = %(save_dir)s/%(name)s.txt 83 | embed_model = RNNEmbed 84 | 85 | [Ngram Vocab] 86 | 
special_tokens = :::: 87 | name = ngrams 88 | filename = %(save_dir)s/%(name)s.txt 89 | embed_model = MLPEmbed 90 | 91 | [Ngram Multivocab] 92 | special_tokens = :::: 93 | name = multi-ngram 94 | max_n = 5 95 | embed_model = MLPEmbed 96 | 97 | [Bytepair Vocab] 98 | name = bytepairs 99 | filename = %(save_dir)s/%(name)s.txt 100 | n_bytepairs = 500 101 | embed_model = MLPEmbed 102 | 103 | [Multivocab] 104 | embed_keep_prob = .67 105 | 106 | #*************************************************************** 107 | # Neural models 108 | [NN] 109 | recur_cell = LSTMCell 110 | n_layers = 3 111 | mlp_func = leaky_relu 112 | conv_func = leaky_relu 113 | # TODO make sure you add this to Base Cell 114 | recur_size = 200 115 | window_size = 5 116 | conv_size = 200 117 | mlp_size = 200 118 | rnn_func = birnn 119 | conv_keep_prob = .67 120 | mlp_keep_prob = .67 121 | recur_keep_prob = .67 122 | ff_keep_prob = .67 123 | 124 | [Base Cell] 125 | forget_bias = 0 126 | recur_func = tanh 127 | recur_size = 300 128 | 129 | [RNN Cell] 130 | recur_func = leaky_relu 131 | recur_size = 400 132 | 133 | [Base Embed] 134 | 135 | [MLP Embed] 136 | 137 | [RNN Embed] 138 | rnn_func = rnn 139 | 140 | [CNN Embed] 141 | 142 | [Base Tagger] 143 | input_vocabs = words 144 | output_vocabs = tags 145 | 146 | [Base X Tagger] 147 | input_vocabs = words 148 | output_vocabs = tags:xtags 149 | 150 | [Tagger] 151 | name = tagger 152 | n_layers = 2 153 | recur_keep_prob = .5 154 | 155 | [X Tagger] 156 | name = xtagger 157 | n_layers = 2 158 | recur_keep_prob = .5 159 | 160 | [Base Parser] 161 | # TODO take off xtags later 162 | input_vocabs = words:tags:xtags 163 | output_vocabs = rels:heads 164 | 165 | [Parser] 166 | name = parser 167 | arc_mlp_size = 400 168 | rel_mlp_size = 100 169 | 170 | [Xbar Parser] 171 | name = xbar_parser 172 | p_mlp_size = 400 173 | arc_mlp_size = 400 174 | rel_mlp_size = 100 175 | 176 | [Bin Parser] 177 | name = bin_parser 178 | p_mlp_size = 400 179 | arc_mlp_size = 400 180 | rel_mlp_size = 100 181 | 182 | [Fish Parser] 183 | name = fish_parser 184 | lambda_mlp_size = 400 185 | arc_mlp_size = 400 186 | rel_mlp_size = 100 187 | 188 | [Gama Parser] 189 | name = fish_parser 190 | p_mlp_size = 400 191 | arc_mlp_size = 400 192 | rel_mlp_size = 100 193 | 194 | [Joint Parser] 195 | tag_mlp_size = 500 196 | arc_mlp_size = 500 197 | rel_mlp_size = 100 198 | 199 | #*************************************************************** 200 | # Sequence data structures 201 | [Multibucket] 202 | n_buckets = 2 203 | name = multibucket 204 | 205 | [Bucket] 206 | name = None 207 | 208 | [Dataset] 209 | #TODO make sure you can get rid of data_files 210 | 211 | [Trainset] 212 | name = trainset 213 | data_files = train_files 214 | n_buckets = 10 215 | batch_by = tokens 216 | batch_size = 5000 217 | 218 | [Parseset] 219 | name = parseset 220 | data_files = parse_files 221 | n_buckets = 5 222 | batch_by = tokens 223 | batch_size = 50000 224 | 225 | 226 | #*************************************************************** 227 | # Training 228 | [Network] 229 | name = network 230 | subtoken_vocab = CharVocab 231 | nlp_model = Parser 232 | min_train_iters = 1000 233 | max_train_iters = 20001 234 | validate_every = 100 235 | save_every = 1 236 | quit_after_n_iters_without_improvement = 5000 237 | per_process_gpu_memory_fraction = -1 238 | 239 | #*************************************************************** 240 | # Miscellaneous 241 | [Radam Optimizer] 242 | name = radam 243 | # TODO keep adjusting lr? 
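# (learning_rate below is only the initial value: the optimizer anneals it during
#  training using decay and decay_steps -- roughly lr * decay^(step / decay_steps),
#  i.e. with these defaults the rate shrinks by a factor of .75 about every 5000
#  iterations; see parser/neural/optimizers/ for the exact schedule)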
244 | learning_rate = 2e-3 245 | decay = .75 246 | decay_steps = 5000 247 | clip = 5 248 | mu = .9 249 | nu = .9 250 | gamma = 0 251 | chi = 0 252 | epsilon = 1e-12 253 | 254 | [Zipf] 255 | n_zipfs = 3 256 | name = zipf 257 | filename = %(save_dir)s/%(name)s.txt 258 | batch_size = 500 259 | max_train_iters = 5000 260 | print_every = 500 261 | 262 | [Bucketer] 263 | name = bucketer 264 | filename = %(save_dir)s/%(name)s.txt 265 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/template.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # Where things are located 3 | [Configurable] 4 | train_files = colon/separated/list/of/files:supports/glob/* 5 | parse_files = colon/separated/list/of/files:supports/glob/* 6 | 7 | [Pretrained Vocab] 8 | filename = location/of/pretrained/embeddings 9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix 10 | skip_header = True 11 | 12 | #*************************************************************** 13 | # Embedding hyperparameters 14 | [Char Vocab] 15 | # {RNNEmbed, CNNEmbed, MLPEmbed} 16 | embed_model = RNNEmbed 17 | 18 | # The aggregated word vocab, pretrained vocab, and char vocab 19 | [Multivocab] 20 | # probability of dropping a word embedding 21 | embed_keep_prob = .67 22 | 23 | [Tag Vocab] 24 | # probability of dropping a tag embedding 25 | embed_keep_prob = .67 26 | 27 | [RNN Embed] 28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 29 | recur_cell = LSTMCell 30 | # number of LSTM layers 31 | n_layers = 3 32 | # number of recurrent units 33 | recur_size = 400 34 | # probability of dropping a connection between timesteps at a single layer 35 | recur_keep_prob = .67 36 | # probability of dropping a connection between layers at a single timestep 37 | ff_keep_prob = .67 38 | 39 | #*************************************************************** 40 | # NLP model hyperparameters 41 | [Tagger] 42 | #if you only want it to produce the first column of tags, set this to just 'tags' 43 | output_vocabs = tags:xtags 44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 45 | recur_cell = LSTMCell 46 | # number of LSTM layers 47 | n_layers = 2 48 | # number of recurrent units in each direction of the BiLSTM 49 | recur_size = 400 50 | # number of units in the tag classifier 51 | mlp_size = 600 52 | # probability of dropping a node in the MLP or the classifier 53 | mlp_keep_prob = .67 54 | # probability of dropping a connection between timesteps at a single layer 55 | recur_keep_prob = .5 56 | # probability of dropping a connection between layers at a single timestep 57 | ff_keep_prob = .67 58 | 59 | [Parser] 60 | # if you only want it to use the first column of tags, set this to 'words:tags' 61 | input_vocabs = words:tags:xtags 62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 63 | recur_cell = LSTMCell 64 | # number of layers 65 | n_layers = 3 66 | # number of recurrent units 67 | recur_size = 400 68 | # number of units in the edge classifier 69 | arc_mlp_size = 600 70 | # number of units in the label classifier (you probably want this to be small!) 
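# (in the Dozat & Manning biaffine design the label scorer applies a biaffine product
#  over two rel_mlp_size-dimensional head/dependent vectors for every relation label,
#  so its parameter count grows roughly with |rels| * rel_mlp_size^2 -- hence a small
#  value such as 100 is usually sufficient, while arc_mlp_size can stay large)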
71 | rel_mlp_size = 100 72 | # probability of dropping a node in the MLP or the classifier 73 | mlp_keep_prob = .67 74 | # probability of dropping a connection between timesteps at a single layer 75 | recur_keep_prob = .67 76 | # probability of dropping a connection between layers at a single timestep 77 | ff_keep_prob = .67 78 | 79 | #*************************************************************** 80 | # Training hyperparameters 81 | [Network] 82 | # {Parser, Tagger} 83 | nlp_model = Parser 84 | quit_after_n_iters_without_improvement = 5000 85 | max_train_iters = 50000 86 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import os 24 | import sys 25 | import codecs 26 | from argparse import ArgumentParser 27 | 28 | from parser import Configurable 29 | from parser import Network 30 | 31 | # TODO make the pretrained vocab names a list given to TokenVocab 32 | #*************************************************************** 33 | # Set up the argparser 34 | argparser = ArgumentParser('Network') 35 | argparser.add_argument('--save_dir', required=True) 36 | subparsers = argparser.add_subparsers() 37 | section_names = set() 38 | # --section_name opt1=value1 opt2=value2 opt3=value3 39 | with codecs.open('config/defaults.cfg') as f: 40 | section_regex = re.compile('\[(.*)\]') 41 | for line in f: 42 | match = section_regex.match(line) 43 | if match: 44 | section_names.add(match.group(1).lower().replace(' ', '_')) 45 | 46 | #=============================================================== 47 | # Train 48 | #--------------------------------------------------------------- 49 | def train(save_dir, **kwargs): 50 | """""" 51 | 52 | kwargs['config_file'] = kwargs.pop('config_file', '') 53 | load = kwargs.pop('load') 54 | try: 55 | if not load and os.path.isdir(save_dir): 56 | raw_input('Save directory already exists. 
Press to continue or to abort.') 57 | if os.path.isfile(os.path.join(save_dir, 'config.cfg')): 58 | os.remove(os.path.join(save_dir, 'config.cfg')) 59 | except KeyboardInterrupt: 60 | print() 61 | sys.exit(0) 62 | network = Network(**kwargs) 63 | network.train(load=load) 64 | return 65 | #--------------------------------------------------------------- 66 | 67 | train_parser = subparsers.add_parser('train') 68 | train_parser.set_defaults(action=train) 69 | train_parser.add_argument('--load', action='store_true') 70 | train_parser.add_argument('--config_file') 71 | for section_name in section_names: 72 | train_parser.add_argument('--'+section_name, nargs='+') 73 | 74 | #=============================================================== 75 | # Parse 76 | #--------------------------------------------------------------- 77 | def parse(save_dir, **kwargs): 78 | """""" 79 | 80 | kwargs['config_file'] = os.path.join(save_dir, 'config.cfg') 81 | files = kwargs.pop('files') 82 | output_file = kwargs.pop('output_file', None) 83 | output_dir = kwargs.pop('output_dir', None) 84 | if len(files) > 1 and output_file is not None: 85 | raise ValueError('Cannot provide a value for --output_file when parsing multiple files') 86 | kwargs['is_evaluation'] = True 87 | network = Network(**kwargs) 88 | network.parse(files, output_file=output_file, output_dir=output_dir) 89 | return 90 | #--------------------------------------------------------------- 91 | 92 | parse_parser = subparsers.add_parser('parse') 93 | parse_parser.set_defaults(action=parse) 94 | parse_parser.add_argument('files', nargs='+') 95 | for section_name in section_names: 96 | parse_parser.add_argument('--'+section_name, nargs='+') 97 | parse_parser.add_argument('--output_file') 98 | parse_parser.add_argument('--output_dir') 99 | 100 | #*************************************************************** 101 | # Parse the arguments 102 | kwargs = vars(argparser.parse_args()) 103 | action = kwargs.pop('action') 104 | save_dir = kwargs.pop('save_dir') 105 | kwargs = {key: value for key, value in kwargs.iteritems() if value is not None} 106 | for section, values in kwargs.iteritems(): 107 | if section in section_names: 108 | values = [value.split('=', 1) for value in values] 109 | kwargs[section] = {opt: value for opt, value in values} 110 | if 'default' not in kwargs: 111 | kwargs['default'] = {} 112 | kwargs['default']['save_dir'] = save_dir 113 | action(save_dir, **kwargs) 114 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | 4 | from configurable import Configurable 5 | from bucket import Bucket 6 | from multibucket import Multibucket 7 | from network import Network -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/bucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.configurable import Configurable 26 | 27 | #*************************************************************** 28 | class Bucket(Configurable): 29 | """""" 30 | 31 | #============================================================= 32 | def __init__(self, *args, **kwargs): 33 | """""" 34 | 35 | embed_model = kwargs.pop('embed_model', None) 36 | super(Bucket, self).__init__(*args, **kwargs) 37 | 38 | self._indices = [] 39 | self._maxlen = 0 40 | self._depth = 1 41 | self._tokens = [] 42 | if embed_model is not None: 43 | self._embed_model = embed_model.from_configurable(self, name=self.name) 44 | else: 45 | self._embed_model = None 46 | return 47 | 48 | #============================================================= 49 | def __call__(self, vocab, keep_prob=None, moving_params=None): 50 | """""" 51 | 52 | return self.embed_model(vocab, keep_prob=keep_prob, moving_params=moving_params) 53 | 54 | #============================================================= 55 | def open(self, maxlen, depth=None): 56 | """""" 57 | 58 | if depth is None: 59 | self._indices = [[0]] 60 | else: 61 | self._indices = [[[0]*depth]] 62 | self._tokens = [['']] 63 | self._maxlen = maxlen 64 | self._depth = depth 65 | return self 66 | 67 | #============================================================= 68 | def add(self, idxs, tokens=None): 69 | """""" 70 | 71 | if isinstance(self.indices, np.ndarray): 72 | raise TypeError("The bucket has already been closed, you can't add to it") 73 | if len(idxs) > len(self) and len(self) != -1: 74 | raise ValueError('Bucket of max len %d received sequence of len %d' % (len(self), len(idxs))) 75 | 76 | self.indices.append(idxs) 77 | if tokens is not None: 78 | self.tokens.append(tokens) 79 | return len(self.indices) - 1 80 | 81 | #============================================================= 82 | def get_tokens(self, batch): 83 | """""" 84 | 85 | return [self.tokens[sent_idx] for sent_idx in batch] 86 | 87 | #============================================================= 88 | def close(self): 89 | """""" 90 | 91 | if self.depth is None: 92 | indices = np.zeros((len(self.indices), len(self)), dtype=np.int32) 93 | for i, sequence in enumerate(self.indices): 94 | indices[i,0:len(sequence)] = sequence 95 | else: 96 | indices = np.zeros((len(self.indices), len(self), self.depth), dtype=np.int32) 97 | for i, sequence in enumerate(self.indices): 98 | for j, index in enumerate(sequence): 99 | indices[i,j,0:len(index)] = index 100 | self._indices = indices 101 | 102 | #============================================================= 103 | @classmethod 104 | def from_dataset(cls, dataset, bkt_idx, *args, **kwargs): 105 | """""" 106 | 107 | kwargs = dict(kwargs) 108 | kwargs['name'] = '{name}-{bkt_idx}'.format(name=dataset.name, bkt_idx=bkt_idx) 109 | bucket = cls.from_configurable(dataset, *args, **kwargs) 110 | indices = [] 111 | for multibucket in dataset: 
112 | indices.append(multibucket[bkt_idx].indices) 113 | for i in xrange(len(indices)): 114 | if len(indices[i].shape) == 2: 115 | indices[i] = indices[i][:,:,None] 116 | bucket._indices = np.concatenate(indices, axis=2) 117 | bucket._maxlen = bucket.indices.shape[1] 118 | bucket._depth = bucket.indices.shape[2] 119 | return bucket 120 | 121 | #============================================================= 122 | def reset_placeholders(self): 123 | self.embed_model.reset_placeholders() 124 | return 125 | #============================================================= 126 | @property 127 | def tokens(self): 128 | return self._tokens 129 | @property 130 | def indices(self): 131 | return self._indices 132 | @property 133 | def embed_model(self): 134 | return self._embed_model 135 | @property 136 | def depth(self): 137 | return self._depth 138 | @property 139 | def placeholder(self): 140 | return self.embed_model.placeholder 141 | 142 | #============================================================= 143 | def __len__(self): 144 | return self._maxlen 145 | def __enter__(self): 146 | return self 147 | def __exit__(self, exception_type, exception_value, trace): 148 | if exception_type is not None: 149 | raise exception_type(exception_value) 150 | self.close() 151 | return 152 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/misc/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/misc/colors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
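The module that follows, parser/misc/colors.py, is just a table of raw ANSI escape codes plus two helpers, ctext and color_pattern, that the parsers use for console logging (see print_accuracy in base_parser.py further down). A hypothetical usage sketch, with invented strings, assuming the parser package is importable:

    # Hypothetical usage of the helpers defined below; the strings are invented.
    from parser.misc.colors import ctext, color_pattern

    print(ctext('LAS: 91.2%', 'bold', 'bright_cyan'))     # bold bright-cyan text, reset at the end
    print(color_pattern('UAS:', '93.0%', 'bright_cyan'))  # bold label followed by an underlined value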
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | colors = { 23 | None: '\033[0m', 24 | 'bold': '\033[1m', 25 | 'italic': '\033[3m', 26 | 'uline': '\033[4m', 27 | 'blink': '\033[5m', 28 | 'hlight': '\033[7m', 29 | 30 | 'black': '\033[30m', 31 | 'red': '\033[31m', 32 | 'green': '\033[32m', 33 | 'yellow': '\033[33m', 34 | 'blue': '\033[34m', 35 | 'magenta': '\033[35m', 36 | 'cyan': '\033[36m', 37 | 'white': '\033[37m', 38 | 39 | 'black_hlight': '\033[40m', 40 | 'red_hlight': '\033[41m', 41 | 'green_hlight': '\033[42m', 42 | 'yellow_hlight': '\033[43m', 43 | 'blue_hlight': '\033[44m', 44 | 'magenta_hlight': '\033[45m', 45 | 'cyan_hlight': '\033[46m', 46 | 'white_hlight': '\033[47m', 47 | 48 | 'bright_black': '\033[90m', 49 | 'bright_red': '\033[91m', 50 | 'bright_green': '\033[92m', 51 | 'bright_yellow': '\033[93m', 52 | 'bright_blue': '\033[94m', 53 | 'bright_magenta': '\033[95m', 54 | 'bright_cyan': '\033[96m', 55 | 'bright_white': '\033[97m', 56 | 57 | 'bright_black_hlight': '\033[100m', 58 | 'bright_red_hlight': '\033[101m', 59 | 'bright_green_hlight': '\033[102m', 60 | 'bright_orange_hlight': '\033[103m', 61 | 'bright_blue_hlight': '\033[104m', 62 | 'bright_magenta_hlight': '\033[105m', 63 | 'bright_cyan_hlight': '\033[106m', 64 | 'bright_white_hlight': '\033[107m', 65 | } 66 | 67 | def ctext(text, *color_list): 68 | return ''.join(colors[color] for color in color_list) + text + colors[None] 69 | def color_pattern(text1, text2, *color_list): 70 | multicolor = ''.join(colors[color] for color in color_list) 71 | return multicolor + colors['bold'] + text1 + colors[None] + ' ' + multicolor + colors['uline'] + text2 + colors[None] 72 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/misc/get_encoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
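get_encoding below simply tries each candidate encoding by reading the whole file and returns the first one that decodes cleanly, raising ValueError if none does. A sketch of the intended call pattern, with a placeholder file name:

    # Placeholder file name; assumes the function is imported from parser.misc.get_encoding.
    import codecs
    from parser.misc.get_encoding import get_encoding

    enc = get_encoding('some_treebank.conllu')        # 'utf-8' or 'ascii', else ValueError
    with codecs.open('some_treebank.conllu', encoding=enc) as f:
        n_lines = sum(1 for _ in f)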
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import codecs 23 | 24 | #*************************************************************** 25 | encodings = ['utf-8', 'ascii'] 26 | 27 | def get_encoding(filename): 28 | """""" 29 | 30 | success = False 31 | for encoding in encodings: 32 | with codecs.open(filename, encoding=encoding) as f: 33 | try: 34 | for i, line in enumerate(f): 35 | pass 36 | success = True 37 | break 38 | except ValueError as e: 39 | print('Encoding {0} failed for file {1} at line {2}: {3}\n{4}'.format(encoding, filename, i, line, e)) 40 | continue 41 | 42 | if success: 43 | return encoding 44 | else: 45 | raise ValueError('No valid encoding found for file {0}'.format(filename)) 46 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/multibucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser import Configurable 26 | from parser import Bucket 27 | from parser.misc.colors import ctext 28 | 29 | #*************************************************************** 30 | class Multibucket(Configurable): 31 | """""" 32 | 33 | #============================================================= 34 | def __init__(self, *args, **kwargs): 35 | """""" 36 | 37 | self._embed_model = kwargs.pop('embed_model', None) 38 | super(Multibucket, self).__init__(*args, **kwargs) 39 | 40 | self._indices = [] 41 | self._buckets = [] 42 | self._len2idx = {} 43 | self.placeholder = None 44 | return 45 | 46 | #============================================================= 47 | def __call__(self, vocab, keep_prob=None, moving_params=None): 48 | """""" 49 | 50 | # This placeholder is used to ensure the bucket data is in the right order 51 | reuse = None if moving_params is None else True 52 | self.generate_placeholder() 53 | embeddings = [] 54 | for i, bucket in enumerate(self): 55 | if i > 0: 56 | reuse = True 57 | with tf.variable_scope(self.name+'-multibucket', reuse=reuse): 58 | embeddings.append(bucket(vocab, keep_prob=keep_prob, moving_params=moving_params)) 59 | return tf.nn.embedding_lookup(tf.concat(embeddings, axis=0), self.placeholder) 60 | 61 | #============================================================= 62 | def reset_placeholders(self): 63 | self.placeholder = None 64 | for bucket in self: 65 | bucket.reset_placeholders() 66 | return 67 | 68 | #============================================================= 69 | def generate_placeholder(self): 70 | """""" 71 | 72 | if self.placeholder is None: 73 | self.placeholder = 
tf.placeholder(tf.int32, shape=(None,), name=self.name+'-multibucket') 74 | return self.placeholder 75 | 76 | #============================================================= 77 | def open(self, maxlens, depth=None): 78 | """""" 79 | 80 | self._indices = [(0,0)] 81 | self._buckets = [] 82 | self._len2idx = {} 83 | prevlen = -1 84 | for idx, maxlen in enumerate(maxlens): 85 | self._buckets.append(Bucket.from_configurable(self, embed_model=self.embed_model, name='%s-%d' % (self.name, idx)).open(maxlen, depth=depth)) 86 | self._len2idx.update(zip(range(prevlen+1, maxlen+1), [idx]*(maxlen-prevlen))) 87 | prevlen = maxlen 88 | return self 89 | 90 | #============================================================= 91 | def add(self, idxs, tokens=None): 92 | """""" 93 | 94 | if isinstance(self.indices, np.ndarray): 95 | raise TypeError("The buckets have already been closed, you can't add to them") 96 | 97 | idx = self._len2idx.get(len(idxs), len(self)-1) 98 | bkt_idx = self[idx].add(idxs, tokens=tokens) 99 | self.indices.append( (idx, bkt_idx) ) 100 | return len(self.indices) - 1 101 | 102 | #============================================================= 103 | def close(self): 104 | """""" 105 | 106 | for bucket in self: 107 | bucket.close() 108 | 109 | self._indices = np.array(self.indices, dtype=[('bkt_idx', 'i4'), ('idx', 'i4')]) 110 | return 111 | 112 | #============================================================= 113 | def inv_idxs(self): 114 | """""" 115 | 116 | return np.argsort(np.concatenate([np.where(self.indices['bkt_idx'][1:] == bkt_idx)[0] for bkt_idx in xrange(len(self))])) 117 | 118 | #============================================================= 119 | def get_tokens(self, bkt_idx, batch): 120 | """""" 121 | 122 | return self[bkt_idx].get_tokens(batch) 123 | 124 | #============================================================= 125 | @classmethod 126 | def from_dataset(cls, dataset, *args, **kwargs): 127 | """""" 128 | 129 | multibucket = cls.from_configurable(dataset, *args, **kwargs) 130 | indices = [] 131 | for multibucket_ in dataset: 132 | indices.append(multibucket_.indices) 133 | for i in xrange(1, len(indices)): 134 | assert np.equal(indices[0].astype(int), indices[i].astype(int)).all() 135 | multibucket._indices = np.array(multibucket_.indices) 136 | buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in xrange(len(multibucket_))] 137 | multibucket._buckets = buckets 138 | if dataset.verbose: 139 | for bucket in multibucket: 140 | print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue'))) 141 | return multibucket 142 | 143 | #============================================================= 144 | @property 145 | def indices(self): 146 | return self._indices 147 | @property 148 | def embed_model(self): 149 | return self._embed_model 150 | 151 | #============================================================= 152 | def __str__(self): 153 | return str(self._buckets) 154 | def __iter__(self): 155 | return (bucket for bucket in self._buckets) 156 | def __getitem__(self, key): 157 | return self._buckets[key] 158 | def __len__(self): 159 | return len(self._buckets) 160 | def __enter__(self): 161 | return self 162 | def __exit__(self, exception_type, exception_value, trace): 163 | if exception_type is not None: 164 | raise exception_type(exception_value) 165 | self.close() 166 | return 167 | -------------------------------------------------------------------------------- 
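Taken together, Bucket and Multibucket above implement length-bucketed batching: each sentence's index sequence is routed to the smallest bucket whose maxlen fits it, close() packs every bucket into a padded np.int32 array, and inv_idxs() gives the permutation that restores the original sentence order after bucketed processing. (Note that the code base is Python 2 throughout: xrange, dict.iteritems and raw_input appear in these files.) A minimal lifecycle sketch, assuming a configured Multibucket instance and pre-indexed sentences built elsewhere (e.g. in dataset.py):

    # Minimal sketch of the open/add/close lifecycle above; `multibucket` and
    # `indexed_sents` are assumed to already exist.
    with multibucket.open([10, 20, 40], depth=None):
        positions = [multibucket.add(idxs) for idxs in indexed_sents]
    # Leaving the with-block calls close(): every bucket now holds a padded np.int32
    # matrix and multibucket.indices is a structured array of (bkt_idx, idx) pairs.
    order = multibucket.inv_idxs()  # permutation back to the original sentence order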
/StanfordBiaffineParser-v2/parser/neural/__init__.py: -------------------------------------------------------------------------------- 1 | import models 2 | import optimizers 3 | import recur_cells -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | 11 | #*************************************************************** 12 | sig_const = np.arctanh(1/3) 13 | tanh_const = np.arctanh(np.sqrt(1/3)) 14 | 15 | def gate(x): 16 | return tf.nn.sigmoid(2*x) 17 | 18 | def tanh(x): 19 | return tf.nn.tanh(x) 20 | 21 | def gated_tanh(x): 22 | dim = len(x.get_shape().as_list())-1 23 | cell_act, gate_act = tf.split(x, 2, dim) 24 | return gate(gate_act) * tanh(cell_act) 25 | 26 | def identity(x): 27 | return tf.identity(x) 28 | 29 | def gated_identity(x): 30 | dim = len(x.get_shape().as_list())-1 31 | cell_act, gate_act = tf.split(x, 2, dim) 32 | return gate(gate_act) * identity(cell_act) 33 | 34 | def softplus(x): 35 | return tf.nn.softplus(2*x)/2 36 | 37 | def elu(x): 38 | return tf.nn.elu(x) 39 | 40 | def relu(x): 41 | return tf.nn.relu(x) 42 | 43 | def leaky_relu(x): 44 | return tf.maximum(.1*x, x) -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/__init__.py: -------------------------------------------------------------------------------- 1 | from nn import NN -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.vocabs import TokenVocab, Multivocab 26 | from parser.neural.models import NN 27 | 28 | #*************************************************************** 29 | class BaseEmbed(NN): 30 | """""" 31 | 32 | #============================================================= 33 | def __init__(self, *args, **kwargs): 34 | """""" 35 | 36 | super(BaseEmbed, self).__init__(*args, **kwargs) 37 | # This placeholder represents the data in the bucket that called BaseEmbed.__init__ 38 | self.placeholder = None 39 | return 40 | 41 | #============================================================= 42 | def reset_placeholders(self): 43 | self.placeholder = None 44 | return 45 | 46 | #============================================================= 47 | def __call__(self, vocab, keep_prob=None, moving_params=None): 48 | """""" 49 | 50 | self.moving_params = moving_params 51 | if isinstance(vocab, Multivocab): 52 | multivocab = vocab 53 | self.generate_placeholder([None,None,None]) 54 | embeddings = [TokenVocab.__call__(vocab, self.placeholder[:,:,i]) for i, vocab in enumerate(multivocab)] 55 | embeddings = tf.stack(embeddings, axis=2) 56 | # (n x b x g x d) -> (n x b x d) 57 | with tf.variable_scope('Pre-Attn'): 58 | embeddings = self.linear_attention(embeddings) 59 | self._tokens_to_keep = tf.to_float(tf.greater(self.placeholder[:,:,0], vocab.PAD)) 60 | else: 61 | self.generate_placeholder([None,None]) 62 | # (n x b x d) 63 | embeddings = TokenVocab.__call__(vocab, self.placeholder) 64 | self._tokens_to_keep = tf.to_float(tf.greater(self.placeholder, vocab.PAD)) 65 | self._batch_size = tf.shape(self.placeholder)[0] 66 | self._bucket_size = tf.shape(self.placeholder)[1] 67 | self._sequence_lengths = tf.to_int32(tf.reduce_sum(self.tokens_to_keep, axis=1)) 68 | self._n_tokens = tf.reduce_sum(self.sequence_lengths) 69 | return embeddings 70 | 71 | #============================================================= 72 | def generate_placeholder(self, shape): 73 | if self.placeholder is None: 74 | self.placeholder = tf.placeholder(tf.int32, shape=shape, name='%s-bkt' % self.name) 75 | return self.placeholder 76 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/cnn_embed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.embeds.base_embed import BaseEmbed 26 | 27 | #*************************************************************** 28 | class CNNEmbed(BaseEmbed): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocab, **kwargs): 33 | """""" 34 | 35 | # (n x b x d) 36 | embeddings = super(CNNEmbed, self).__call__(vocab, **kwargs) 37 | # (n x b x d) -> (n x b x h) 38 | with tf.variable_scope('CNN'): 39 | conv = self.CNN(embeddings, self.window_size, self.conv_size) 40 | # (n x b x h) -> (n x h) 41 | hidden = tf.reduce_max(conv, axis=1) 42 | # (n x h) -> (n x o) 43 | linear = self.linear(hidden, vocab.token_embed_size) 44 | return linear 45 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/mlp_embed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
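CNNEmbed above and the MLPEmbed/RNNEmbed variants below all honour the same contract inherited from BaseEmbed: take an (n x b x d) batch of subtoken embeddings and return one vector of size vocab.token_embed_size per token. They differ only in how they reduce the subtoken axis; for CNNEmbed that reduction is a max-pool, which the following numpy toy (made-up sizes, purely illustrative) mirrors:

    # Mirrors hidden = tf.reduce_max(conv, axis=1) in CNNEmbed; sizes are invented.
    import numpy as np

    n, b, h = 4, 9, 50                # 4 tokens, up to 9 subtokens each, 50 conv filters
    conv = np.random.randn(n, b, h)   # stand-in for the convolution output
    hidden = conv.max(axis=1)         # collapse the subtoken axis
    assert hidden.shape == (n, h)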
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.embeds.base_embed import BaseEmbed 26 | 27 | #*************************************************************** 28 | class MLPEmbed(BaseEmbed): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocab, **kwargs): 33 | """""" 34 | 35 | # (n x b x d) 36 | embeddings = super(MLPEmbed, self).__call__(vocab, **kwargs) 37 | # (n x b x d) -> (n x d) 38 | with tf.variable_scope('Attn'): 39 | attn = self.linear_attention(embeddings) 40 | # (n x d) -> (n x h) 41 | with tf.variable_scope('MLP'): 42 | hidden = self.MLP(attn, self.mlp_size) 43 | # (n x h) -> (n x o) 44 | linear = self.linear(hidden, vocab.token_embed_size) 45 | return linear -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/rnn_embed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
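RNNEmbed below keeps that same interface but reduces the subtoken sequence with the configured recurrent layer: it attends over the hidden states and concatenates the attention summary with the final state (splitting and re-joining the forward and backward states when rnn_func is birnn) before the output projection to token_embed_size.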
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | import parser.neural.rnn as rnn 26 | from parser.neural.models.embeds.base_embed import BaseEmbed 27 | 28 | #*************************************************************** 29 | class RNNEmbed(BaseEmbed): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, vocab, **kwargs): 34 | """""" 35 | 36 | # (n x b x d) 37 | embeddings = super(RNNEmbed, self).__call__(vocab, **kwargs) 38 | # (n x b x d) -> (n x b x h) 39 | with tf.variable_scope('RNN'): 40 | recur, state = self.RNN(embeddings, self.recur_size) 41 | if self.rnn_func == rnn.birnn: 42 | state_fw, state_bw = tf.unstack(state) 43 | state_fw = tf.split(state_fw, 2, axis=1)[0] 44 | state_bw = tf.split(state_bw, 2, axis=1)[0] 45 | state = tf.concat([state_fw, state_bw], 1) 46 | elif self.rnn_func == rnn.rnn: 47 | state = tf.split(state, 2, axis=1)[0] 48 | # (n x b x h) -> (n x h) 49 | with tf.variable_scope('MLP'): 50 | hidden = self.linear_attention(recur) 51 | # (n x h) -> (n x o) 52 | linear = self.linear(tf.concat([hidden, state], axis=1), vocab.token_embed_size) 53 | return linear 54 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/__init__.py: -------------------------------------------------------------------------------- 1 | from parsers import * 2 | from taggers import * 3 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from parser import Parser 2 | from fish_parser import FishParser 3 | from gama_parser import GamaParser 4 | from xbar_parser import XbarParser 5 | from bin_parser import BinParser 6 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/base_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
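BaseParser below is the shared trunk of every parser variant in this directory: its __call__ concatenates the configured input vocab embeddings, stacks n_layers of RNN over them, and leaves the biaffine scoring to the subclasses; at parse time write_probs decodes the arc probabilities with the nonprojective MST routine and writes CoNLL-style columns. process_accumulators turns raw counts into the usual attachment scores, for example (counts invented):

    # Invented counts, only to show how process_accumulators converts them to percentages.
    n_tokens, arc_corr, rel_corr, corr = 1000.0, 930.0, 950.0, 910.0
    UAS = arc_corr / n_tokens * 100  # 93.0, head correct
    LS = rel_corr / n_tokens * 100   # 95.0, label correct
    LAS = corr / n_tokens * 100      # 91.0, head and label both correct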
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import codecs 24 | import numpy as np 25 | import tensorflow as tf 26 | import matplotlib.pyplot as plt 27 | 28 | from parser.misc.colors import ctext, color_pattern 29 | from parser.misc.mst import nonprojective, argmax 30 | from parser.neural.models.nn import NN 31 | 32 | #*************************************************************** 33 | class BaseParser(NN): 34 | """""" 35 | 36 | PAD = 0 37 | ROOT = 1 38 | 39 | #============================================================= 40 | def __call__(self, vocabs, moving_params=None): 41 | """""" 42 | 43 | self.moving_params = moving_params 44 | if isinstance(vocabs, dict): 45 | self.vocabs = vocabs 46 | else: 47 | self.vocabs = {vocab.name: vocab for vocab in vocabs} 48 | 49 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs] 50 | #embed = tf.concat([vocab(moving_params=self.moving_params) for vocab in input_vocabs], 2) 51 | embed = self.embed_concat(input_vocabs) 52 | for vocab in self.vocabs.values(): 53 | if vocab not in input_vocabs: 54 | vocab.generate_placeholder() 55 | placeholder = self.vocabs['words'].placeholder 56 | if len(placeholder.get_shape().as_list()) == 3: 57 | placeholder = placeholder[:,:,0] 58 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT)) 59 | self._batch_size = tf.shape(placeholder)[0] 60 | self._bucket_size = tf.shape(placeholder)[1] 61 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1) 62 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep)) 63 | 64 | top_recur = embed 65 | for i in xrange(self.n_layers): 66 | with tf.variable_scope('RNN%d' % i): 67 | top_recur, _ = self.RNN(top_recur, self.recur_size) 68 | return top_recur 69 | 70 | #============================================================= 71 | def process_accumulators(self, accumulators, time=None): 72 | """""" 73 | 74 | n_tokens, n_seqs, loss, rel_corr, arc_corr, corr, seq_corr = accumulators 75 | acc_dict = { 76 | 'Loss': loss, 77 | 'LS': rel_corr/n_tokens*100, 78 | 'UAS': arc_corr/n_tokens*100, 79 | 'LAS': corr/n_tokens*100, 80 | 'SS': seq_corr/n_seqs*100, 81 | } 82 | if time is not None: 83 | acc_dict.update({ 84 | 'Token_rate': n_tokens / time, 85 | 'Seq_rate': n_seqs / time, 86 | }) 87 | return acc_dict 88 | 89 | #============================================================= 90 | def update_history(self, history, accumulators): 91 | """""" 92 | 93 | acc_dict = self.process_accumulators(accumulators) 94 | for key, value in acc_dict.iteritems(): 95 | history[key].append(value) 96 | return history['LAS'][-1] 97 | 98 | #============================================================= 99 | def print_accuracy(self, accumulators, time, prefix='Train'): 100 | """""" 101 | 102 | acc_dict = self.process_accumulators(accumulators, time=time) 103 | strings = [] 104 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red')) 105 | strings.append(color_pattern('LS:', '{LS:5.2f}%', 'bright_cyan')) 106 | strings.append(color_pattern('UAS:', '{UAS:5.2f}%', 'bright_cyan')) 107 | strings.append(color_pattern('LAS:', '{LAS:5.2f}%', 'bright_cyan')) 108 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green')) 109 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta')) 110 | string = ctext('{0} ', 'bold') + ' | '.join(strings) 111 | print(string.format(prefix, **acc_dict)) 112 | 
return 113 | 114 | #============================================================= 115 | def plot(self, history, prefix='Train'): 116 | """""" 117 | 118 | pass 119 | 120 | #============================================================= 121 | def check(self, preds, sents, fileobj): 122 | """""" 123 | 124 | for tokens, arc_preds, rel_preds in zip(sents, preds[0], preds[1]): 125 | for token, arc_pred, rel_pred in zip(zip(*tokens), arc_preds, rel_preds): 126 | arc = self.vocabs['heads'][arc_pred] 127 | rel = self.vocabs['rels'][rel_pred] 128 | fileobj.write('\t'.join(token+(arc, rel))+'\n') 129 | fileobj.write('\n') 130 | return 131 | 132 | #============================================================= 133 | def write_probs(self, sents, output_file, probs, inv_idxs): 134 | """""" 135 | 136 | #parse_algorithm = self.parse_algorithm 137 | 138 | # Turns list of tuples of tensors into list of matrices 139 | arc_probs = [arc_prob for batch in probs for arc_prob in batch[0]] 140 | rel_probs = [rel_prob for batch in probs for rel_prob in batch[1]] 141 | tokens_to_keep = [weight for batch in probs for weight in batch[2]] 142 | tokens = [sent for batch in sents for sent in batch] 143 | 144 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f: 145 | j = 0 146 | for i in inv_idxs: 147 | sent, arc_prob, rel_prob, weights = tokens[i], arc_probs[i], rel_probs[i], tokens_to_keep[i] 148 | sent = zip(*sent) 149 | sequence_length = int(np.sum(weights))+1 150 | arc_prob = arc_prob[:sequence_length][:,:sequence_length] 151 | #arc_preds = np.argmax(arc_prob, axis=1) 152 | arc_preds = nonprojective(arc_prob) 153 | arc_preds_one_hot = np.zeros([rel_prob.shape[0], rel_prob.shape[2]]) 154 | arc_preds_one_hot[np.arange(len(arc_preds)), arc_preds] = 1. 155 | rel_preds = np.argmax(np.einsum('nrb,nb->nr', rel_prob, arc_preds_one_hot), axis=1) 156 | for token, arc_pred, rel_pred, weight in zip(sent, arc_preds[1:], rel_preds[1:], weights[1:]): 157 | token = list(token) 158 | token.insert(5, '_') 159 | token.append('_') 160 | token.append('_') 161 | token[6] = self.vocabs['heads'][arc_pred] 162 | token[7] = self.vocabs['rels'][rel_pred] 163 | f.write('\t'.join(token)+'\n') 164 | j += 1 165 | if j < len(inv_idxs): 166 | f.write('\n') 167 | return 168 | 169 | #============================================================= 170 | @property 171 | def train_keys(self): 172 | return ('n_tokens', 'n_seqs', 'loss', 'n_rel_correct', 'n_arc_correct', 'n_correct', 'n_seqs_correct') 173 | 174 | #============================================================= 175 | @property 176 | def valid_keys(self): 177 | return ('arc_preds', 'rel_preds') 178 | 179 | #============================================================= 180 | @property 181 | def parse_keys(self): 182 | return ('arc_probs', 'rel_probs', 'tokens_to_keep') 183 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/bin_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class BinParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(BinParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.p_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, p_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, p_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('p'): 45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 46 | arc_ps = self.bilinear(p_dep_mlp, p_head_mlp, 1) 47 | # (b x 1) 48 | arc_logits = -tf.nn.softplus(arc_ps) 49 | 50 | with tf.variable_scope('Arc'): 51 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 52 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 53 | # (n x b x b) 54 | arc_probs = tf.nn.softmax(arc_logits) 55 | # (n x b) 56 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 57 | # (n x b) 58 | arc_targets = self.vocabs['heads'].placeholder 59 | # (n x b) 60 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 61 | # () 62 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 63 | 64 | with tf.variable_scope('Rel'): 65 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 66 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 67 | # (n x b x r x b) 68 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 69 | # (n x b x b) 70 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 71 | # (n x b x b) -> (n x b x b x 1) 72 | one_hot = tf.expand_dims(one_hot, axis=3) 73 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 74 | select_rel_logits = tf.matmul(rel_logits, one_hot) 75 | # (n x b x r x 1) -> (n x b x r) 76 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 77 | # (n x b) 78 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 79 | # (n x b) 80 | rel_targets = self.vocabs['rels'].placeholder 81 | # (n x b) 82 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 83 | # () 84 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 85 | 86 | n_arc_correct = tf.reduce_sum(arc_correct) 87 | n_rel_correct = tf.reduce_sum(rel_correct) 88 | correct = arc_correct * rel_correct 89 
| n_correct = tf.reduce_sum(correct) 90 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 91 | loss = arc_loss + rel_loss 92 | 93 | outputs = { 94 | 'arc_logits': arc_logits, 95 | 'arc_probs': arc_probs, 96 | 'arc_preds': arc_preds, 97 | 'arc_targets': arc_targets, 98 | 'arc_correct': arc_correct, 99 | 'arc_loss': arc_loss, 100 | 'n_arc_correct': n_arc_correct, 101 | 102 | 'rel_logits': rel_logits, 103 | 'rel_probs': rel_probs, 104 | 'rel_preds': rel_preds, 105 | 'rel_targets': rel_targets, 106 | 'rel_correct': rel_correct, 107 | 'rel_loss': rel_loss, 108 | 'n_rel_correct': n_rel_correct, 109 | 110 | 'n_tokens': self.n_tokens, 111 | 'n_seqs': self.batch_size, 112 | 'tokens_to_keep': self.tokens_to_keep, 113 | 'n_correct': n_correct, 114 | 'n_seqs_correct': n_seqs_correct, 115 | 'loss': loss 116 | } 117 | 118 | return outputs 119 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/fish_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
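The extra 'Lambda' term that FishParser below adds to the plain bilinear arc score reads like a learned Poisson prior over the linear distance k = |i - j| between a word and a candidate head: taking lam (the bilinear output plus 5) as the log-rate, k*lam - exp(lam) - lgamma(k+1) is exactly the Poisson log-pmf. A numeric check of that identity with invented values (this is my reading of the code, not a claim from the source):

    # Identity check only; k and lam are arbitrary.
    import numpy as np
    from scipy.special import gammaln
    from scipy.stats import poisson

    k, lam = 3, 1.2
    by_hand = k * lam - np.exp(lam) - gammaln(k + 1)
    assert np.isclose(by_hand, poisson.logpmf(k, np.exp(lam)))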
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class FishParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(FishParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.lambda_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, lambda_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.lambda_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, lambda_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.lambda_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('Lambda'): 45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 46 | arc_lambdas = self.bilinear(lambda_dep_mlp, lambda_head_mlp, 1) + 5 47 | # (b x 1) 48 | i_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 1), 0) 49 | # (1 x b) 50 | j_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 0), 0) 51 | # (b x 1) - (1 x b) -> (b x b) 52 | k_mat = tf.abs(i_mat - j_mat) 53 | # (b x 1) 54 | n_mat = tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1) - 1 - i_mat 55 | # (b x b) * (n x b x b) - (n x b x b) - (b x b) -> (n x b x b) 56 | arc_logits = tf.to_float(k_mat)*arc_lambdas - tf.exp(arc_lambdas) - tf.lgamma(tf.to_float(k_mat+1)) 57 | 58 | with tf.variable_scope('Arc'): 59 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 60 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 61 | # (n x b x b) 62 | arc_probs = tf.nn.softmax(arc_logits) 63 | # (n x b) 64 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 65 | # (n x b) 66 | arc_targets = self.vocabs['heads'].placeholder 67 | # (n x b) 68 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 69 | # () 70 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 71 | 72 | with tf.variable_scope('Rel'): 73 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 74 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 75 | # (n x b x r x b) 76 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 77 | # (n x b x b) 78 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 79 | # (n x b x b) -> (n x b x b x 1) 80 | one_hot = tf.expand_dims(one_hot, axis=3) 81 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 82 | select_rel_logits = tf.matmul(rel_logits, one_hot) 83 | # (n x b x r x 1) -> (n x b x r) 84 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 85 | # (n x b) 86 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 87 | # (n x b) 88 | rel_targets = self.vocabs['rels'].placeholder 89 | # (n x b) 90 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 91 | # () 92 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 93 | 94 | n_arc_correct = tf.reduce_sum(arc_correct) 95 | n_rel_correct 
= tf.reduce_sum(rel_correct) 96 | correct = arc_correct * rel_correct 97 | n_correct = tf.reduce_sum(correct) 98 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 99 | loss = arc_loss + rel_loss 100 | 101 | outputs = { 102 | 'arc_logits': arc_logits, 103 | 'arc_lambdas': arc_lambdas, 104 | 'arc_probs': arc_probs, 105 | 'arc_preds': arc_preds, 106 | 'arc_targets': arc_targets, 107 | 'arc_correct': arc_correct, 108 | 'arc_loss': arc_loss, 109 | 'n_arc_correct': n_arc_correct, 110 | 111 | 'rel_logits': rel_logits, 112 | 'rel_probs': rel_probs, 113 | 'rel_preds': rel_preds, 114 | 'rel_targets': rel_targets, 115 | 'rel_correct': rel_correct, 116 | 'rel_loss': rel_loss, 117 | 'n_rel_correct': n_rel_correct, 118 | 119 | 'n_tokens': self.n_tokens, 120 | 'n_seqs': self.batch_size, 121 | 'tokens_to_keep': self.tokens_to_keep, 122 | 'n_correct': n_correct, 123 | 'n_seqs_correct': n_seqs_correct, 124 | 'loss': loss 125 | } 126 | 127 | return outputs 128 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/gama_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
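GamaParser below plays the same game with a Gaussian instead of a Poisson: the squared bilinear outputs arc_mus and arc_sigmas (the latter floored at 0.1) act as the mean and, on my reading, the variance of the head-dependent distance k, and the 'dist' term is the corresponding Gaussian log-density. Another identity check with invented numbers:

    # Identity check only; distance k, mean mu and variance sig2 are invented.
    import numpy as np
    from scipy.stats import norm

    k, mu, sig2 = 4.0, 2.5, 1.3
    by_hand = -0.5 * np.log(2 * np.pi * sig2) - 0.5 * (k - mu) ** 2 / sig2
    assert np.isclose(by_hand, norm.logpdf(k, loc=mu, scale=np.sqrt(sig2)))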
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class GamaParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(GamaParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + 2*self.p_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, mu_dep_mlp, sigma_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size, self.p_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, mu_head_mlp, sigma_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size, self.p_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('dist'): 45 | with tf.variable_scope('mu'): 46 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 47 | arc_mus = self.bilinear(mu_dep_mlp, mu_head_mlp, 1)**2 48 | with tf.variable_scope('sigma'): 49 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 50 | arc_sigmas = self.bilinear(sigma_dep_mlp, sigma_head_mlp, 1, initializer=None)**2 + .1 51 | # (b x 1) 52 | i_mat = tf.expand_dims(tf.range(self.bucket_size), 1) 53 | # (1 x b) 54 | j_mat = tf.expand_dims(tf.range(self.bucket_size), 0) 55 | # (b x 1) - (1 x b) -> (b x b) 56 | k_mat = tf.to_float(tf.abs(i_mat - j_mat)) 57 | 58 | arc_logits = -.5*tf.log(2*np.pi * arc_sigmas) - .5*(k_mat-arc_mus)**2 / arc_sigmas 59 | #arc_rs += tf.to_float(k_mat)#tf.to_float(tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1)) 60 | # (b x 1) 61 | #n_mat = tf.expand_dims(self.sequence_lengths, 1) - 1 - i_mat 62 | # (b x b) * (n x b x b) - (n x b x b) - (b x b) -> (n x b x b) 63 | #arc_logits = (tf.lgamma(arc_rs+1) - tf.lgamma(k_mat) - tf.lgamma(arc_rs-k_mat+2) + 64 | # k_mat * tf.log(arc_ps) + (arc_rs-k_mat+1)*tf.log(1-arc_ps) ) 65 | with tf.variable_scope('Arc'): 66 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 67 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 68 | # (n x b x b) 69 | arc_probs = tf.nn.softmax(arc_logits) 70 | # (n x b) 71 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 72 | # (n x b) 73 | arc_targets = self.vocabs['heads'].placeholder 74 | # (n x b) 75 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 76 | # () 77 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 78 | 79 | with tf.variable_scope('Rel'): 80 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 81 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 82 | # (n x b x r x b) 83 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 84 | # (n x b x b) 85 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 86 | # (n x b x b) -> (n x b x b x 1) 87 | one_hot = tf.expand_dims(one_hot, axis=3) 88 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 89 | select_rel_logits = tf.matmul(rel_logits, one_hot) 90 | # (n x b x r x 1) -> (n x b x r) 91 | select_rel_logits = 
tf.squeeze(select_rel_logits, axis=3) 92 | # (n x b) 93 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 94 | # (n x b) 95 | rel_targets = self.vocabs['rels'].placeholder 96 | # (n x b) 97 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 98 | # () 99 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 100 | 101 | n_arc_correct = tf.reduce_sum(arc_correct) 102 | n_rel_correct = tf.reduce_sum(rel_correct) 103 | correct = arc_correct * rel_correct 104 | n_correct = tf.reduce_sum(correct) 105 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 106 | loss = arc_loss + rel_loss 107 | 108 | outputs = { 109 | 'arc_logits': arc_logits, 110 | 'arc_mus': arc_mus, 111 | 'arc_sigmas': arc_sigmas, 112 | 'arc_probs': arc_probs, 113 | 'arc_preds': arc_preds, 114 | 'arc_targets': arc_targets, 115 | 'arc_correct': arc_correct, 116 | 'arc_loss': arc_loss, 117 | 'n_arc_correct': n_arc_correct, 118 | 119 | 'rel_logits': rel_logits, 120 | 'rel_probs': rel_probs, 121 | 'rel_preds': rel_preds, 122 | 'rel_targets': rel_targets, 123 | 'rel_correct': rel_correct, 124 | 'rel_loss': rel_loss, 125 | 'n_rel_correct': n_rel_correct, 126 | 127 | 'n_tokens': self.n_tokens, 128 | 'n_seqs': self.batch_size, 129 | 'tokens_to_keep': self.tokens_to_keep, 130 | 'n_correct': n_correct, 131 | 'n_seqs_correct': n_seqs_correct, 132 | 'loss': loss 133 | } 134 | 135 | return outputs 136 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
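Parser below is the plain biaffine variant: bilinear arc scores with no extra distance term, then relation scores evaluated only at the selected head. That 'Rel' selection step (one-hot, matmul, squeeze) is identical in every variant above; here is a numpy rendering of it for a single sentence, with invented sizes:

    # Mirrors the one_hot / tf.matmul / tf.squeeze selection in the 'Rel' scope; sizes invented.
    import numpy as np

    b, r = 6, 40                                  # bucket size, number of relation labels
    rel_logits = np.random.randn(b, r, b)         # scores indexed (dependent, relation, head)
    arc_preds = np.random.randint(0, b, size=b)   # stand-in for argmax over the arc scores
    one_hot = np.eye(b)[arc_preds]                # (b x b)
    select = np.einsum('drh,dh->dr', rel_logits, one_hot)
    rel_preds = select.argmax(axis=1)
    assert np.allclose(select[0], rel_logits[0, :, arc_preds[0]])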
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class Parser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(Parser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('Arc'): 45 | # (n x b x d) * (d x 1 x d) * (n x b x d).T -> (n x b x b) 46 | arc_logits = self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 47 | # (n x b x b) 48 | arc_probs = tf.nn.softmax(arc_logits) 49 | # (n x b) 50 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 51 | # (n x b) 52 | arc_targets = self.vocabs['heads'].placeholder 53 | # (n x b) 54 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 55 | # () 56 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 57 | 58 | with tf.variable_scope('Rel'): 59 | # (n x b x d) * (d x r x d) * (n x b x d).T -> (n x b x r x b) 60 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 61 | # (n x b x r x b) 62 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 63 | # (n x b x b) 64 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 65 | # (n x b x b) -> (n x b x b x 1) 66 | one_hot = tf.expand_dims(one_hot, axis=3) 67 | # (n x b x r x b) * (n x b x b x 1) -> (n x b x r x 1) 68 | select_rel_logits = tf.matmul(rel_logits, one_hot) 69 | # (n x b x r x 1) -> (n x b x r) 70 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 71 | # (n x b) 72 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 73 | # (n x b) 74 | rel_targets = self.vocabs['rels'].placeholder 75 | # (n x b) 76 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 77 | # () 78 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 79 | 80 | n_arc_correct = tf.reduce_sum(arc_correct) 81 | n_rel_correct = tf.reduce_sum(rel_correct) 82 | correct = arc_correct * rel_correct 83 | n_correct = tf.reduce_sum(correct) 84 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 85 | loss = arc_loss + rel_loss 86 | 87 | outputs = { 88 | 'arc_logits': arc_logits, 89 | 'arc_probs': arc_probs, 90 | 'arc_preds': arc_preds, 91 | 'arc_targets': arc_targets, 92 | 'arc_correct': arc_correct, 93 | 'arc_loss': arc_loss, 94 | 'n_arc_correct': n_arc_correct, 95 | 96 | 'rel_logits': rel_logits, 97 | 'rel_probs': rel_probs, 98 | 'rel_preds': rel_preds, 99 | 'rel_targets': rel_targets, 100 | 'rel_correct': rel_correct, 101 | 'rel_loss': rel_loss, 102 | 'n_rel_correct': n_rel_correct, 103 | 104 | 'n_tokens': self.n_tokens, 105 | 'n_seqs': self.batch_size, 106 | 
'tokens_to_keep': self.tokens_to_keep, 107 | 'n_correct': n_correct, 108 | 'n_seqs_correct': n_seqs_correct, 109 | 'loss': loss 110 | } 111 | 112 | return outputs 113 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/xbar_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class XbarParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(XbarParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.p_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, p_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, p_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('p'): 45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 46 | arc_ps = self.bilinear(p_dep_mlp, p_head_mlp, 1, add_bias2=False) 47 | # (b x 1) 48 | i_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 1), 0) 49 | # (1 x b) 50 | j_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 0), 0) 51 | # (b x 1) > (1 x b) -> (b x b) 52 | k_mat = tf.tile(j_mat > i_mat, [self.batch_size,1,1]) 53 | # (b x 1) 54 | n_mat = tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1) - 1 - i_mat 55 | # (n x b x b) + (b x b) * (n x b x b) + (b x b) * (n x b x b) -> (n x b x b) 56 | arc_logits = -tf.nn.softplus(tf.where(k_mat, arc_ps, -arc_ps)) 57 | # (n x b x b) - (b x b) * (b x b) -> (n x b x b) 58 | 59 | with tf.variable_scope('Arc'): 60 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 61 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 62 | # (n x b x b) 63 | arc_probs = tf.nn.softmax(arc_logits) 64 | # (n x b) 65 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 66 | # (n x b) 67 | arc_targets = self.vocabs['heads'].placeholder 68 | # (n x b) 69 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 70 | # () 71 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 72 | 
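As in parser.py above, the Rel block that follows first scores every relation label for every (dependent, head) pair, giving an (n x b x r x b) tensor, and then picks out, for each dependent, the label scores at a single head: the gold head during training, the predicted head at decode time. The selection is a batched matmul against a one-hot head vector; a per-sentence NumPy sketch of the same trick:

import numpy as np

def select_rel_logits(rel_logits, heads):
    # rel_logits: (b x r x b) label scores for each (dependent, label, head) triple
    # heads: (b,) chosen head index for every dependent
    b = rel_logits.shape[0]
    one_hot = np.eye(b)[heads]                                   # (b x b)
    # (b x r x b) . (b x b x 1) -> (b x r x 1) -> (b x r)
    return np.matmul(rel_logits, one_hot[:, :, None]).squeeze(-1)

rel_logits = np.random.randn(4, 3, 4)                            # 4 tokens, 3 labels
heads = np.array([0, 0, 1, 1])
rel_preds = select_rel_logits(rel_logits, heads).argmax(axis=-1)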
73 | with tf.variable_scope('Rel'): 74 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 75 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 76 | # (n x b x r x b) 77 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 78 | # (n x b x b) 79 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 80 | # (n x b x b) -> (n x b x b x 1) 81 | one_hot = tf.expand_dims(one_hot, axis=3) 82 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 83 | select_rel_logits = tf.matmul(rel_logits, one_hot) 84 | # (n x b x r x 1) -> (n x b x r) 85 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 86 | # (n x b) 87 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 88 | # (n x b) 89 | rel_targets = self.vocabs['rels'].placeholder 90 | # (n x b) 91 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 92 | # () 93 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 94 | 95 | n_arc_correct = tf.reduce_sum(arc_correct) 96 | n_rel_correct = tf.reduce_sum(rel_correct) 97 | correct = arc_correct * rel_correct 98 | n_correct = tf.reduce_sum(correct) 99 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 100 | loss = arc_loss + rel_loss 101 | 102 | outputs = { 103 | 'arc_logits': arc_logits, 104 | 'arc_probs': arc_probs, 105 | 'arc_preds': arc_preds, 106 | 'arc_targets': arc_targets, 107 | 'arc_correct': arc_correct, 108 | 'arc_loss': arc_loss, 109 | 'n_arc_correct': n_arc_correct, 110 | 111 | 'rel_logits': rel_logits, 112 | 'rel_probs': rel_probs, 113 | 'rel_preds': rel_preds, 114 | 'rel_targets': rel_targets, 115 | 'rel_correct': rel_correct, 116 | 'rel_loss': rel_loss, 117 | 'n_rel_correct': n_rel_correct, 118 | 119 | 'n_tokens': self.n_tokens, 120 | 'n_seqs': self.batch_size, 121 | 'tokens_to_keep': self.tokens_to_keep, 122 | 'n_correct': n_correct, 123 | 'n_seqs_correct': n_seqs_correct, 124 | 'loss': loss 125 | } 126 | 127 | return outputs 128 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/__init__.py: -------------------------------------------------------------------------------- 1 | from tagger import Tagger 2 | from xtagger import XTagger 3 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/base_tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
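base_tagger.py, whose body follows, derives all of its bookkeeping from the word-id placeholder: ids greater than ROOT mark the real tokens to score (tokens_to_keep), ids greater than PAD count toward sequence_lengths, and n_tokens is the number of kept positions. A tiny NumPy illustration of those three lines (the ids are invented; only PAD = 0 and ROOT = 1 match the class constants):

import numpy as np

PAD, ROOT = 0, 1
words = np.array([[1, 6, 9, 3, 0, 0]])                 # ROOT, three tokens, two pads
tokens_to_keep = (words > ROOT).astype(np.float32)     # [[0, 1, 1, 1, 0, 0]]
sequence_lengths = (words > PAD).sum(axis=1)           # [4]  (ROOT plus three tokens)
n_tokens = int(tokens_to_keep.sum())                   # 3

sequence_lengths counts the ROOT token while tokens_to_keep does not, which is why the sequence-accuracy checks in the model heads compare the number of correct tokens against sequence_lengths - 1.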
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import codecs 24 | import numpy as np 25 | import tensorflow as tf 26 | import matplotlib.pyplot as plt 27 | 28 | from parser.misc.colors import ctext, color_pattern 29 | from parser.neural.models.nn import NN 30 | 31 | #*************************************************************** 32 | class BaseTagger(NN): 33 | """""" 34 | 35 | PAD = 0 36 | ROOT = 1 37 | 38 | #============================================================= 39 | def __call__(self, vocabs, moving_params=None): 40 | """""" 41 | 42 | self.moving_params = moving_params 43 | if isinstance(vocabs, dict): 44 | self.vocabs = vocabs 45 | else: 46 | self.vocabs = {vocab.name: vocab for vocab in vocabs} 47 | 48 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs] 49 | embed = self.embed_concat(input_vocabs) 50 | for vocab in self.vocabs.values(): 51 | if vocab not in input_vocabs: 52 | vocab.generate_placeholder() 53 | placeholder = self.vocabs['words'].placeholder 54 | if len(placeholder.get_shape().as_list()) == 3: 55 | placeholder = placeholder[:,:,0] 56 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT)) 57 | self._batch_size = tf.shape(placeholder)[0] 58 | self._bucket_size = tf.shape(placeholder)[1] 59 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1) 60 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep)) 61 | 62 | top_recur = embed 63 | for i in xrange(self.n_layers): 64 | with tf.variable_scope('RNN%d' % i): 65 | top_recur, _ = self.RNN(top_recur, self.recur_size) 66 | return top_recur 67 | 68 | #============================================================= 69 | def process_accumulators(self, accumulators, time=None): 70 | """""" 71 | 72 | n_tokens, n_seqs, loss, corr, seq_corr = accumulators 73 | acc_dict = { 74 | 'Loss': loss, 75 | 'TS': corr/n_tokens*100, 76 | 'SS': seq_corr/n_seqs*100, 77 | } 78 | if time is not None: 79 | acc_dict.update({ 80 | 'Token_rate': n_tokens / time, 81 | 'Seq_rate': n_seqs / time, 82 | }) 83 | return acc_dict 84 | 85 | #============================================================= 86 | def update_history(self, history, accumulators): 87 | """""" 88 | 89 | acc_dict = self.process_accumulators(accumulators) 90 | for key, value in acc_dict.iteritems(): 91 | history[key].append(value) 92 | return history['TS'][-1] 93 | 94 | #============================================================= 95 | def print_accuracy(self, accumulators, time, prefix='Train'): 96 | """""" 97 | 98 | acc_dict = self.process_accumulators(accumulators, time=time) 99 | strings = [] 100 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red')) 101 | strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan')) 102 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green')) 103 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta')) 104 | string = ctext('{0} ', 'bold') + ' | '.join(strings) 105 | print(string.format(prefix, **acc_dict)) 106 | return 107 | 108 | #============================================================= 109 | def plot(self, history, prefix='Train'): 110 | """""" 111 | 112 | pass 113 | 114 | #============================================================= 115 | def check(self, preds, sents, fileobj): 116 | """""" 117 | 118 | for tokens, preds in zip(sents, preds[0]): 119 | for token, pred in zip(zip(*tokens), 
preds): 120 | tag = self.vocabs['tags'][pred] 121 | fileobj.write('\t'.join(token+(tag, ))+'\n') 122 | fileobj.write('\n') 123 | return 124 | 125 | #============================================================= 126 | def write_probs(self, sents, output_file, probs, inv_idxs): 127 | """""" 128 | 129 | # Turns list of tuples of tensors into list of matrices 130 | tag_probs = [tag_prob for batch in probs for tag_prob in batch[0]] 131 | tokens_to_keep = [weight for batch in probs for weight in batch[1]] 132 | tokens = [sent for batch in sents for sent in batch] 133 | 134 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f: 135 | for i in inv_idxs: 136 | sent, tag_prob, weights = tokens[i], tag_probs[i], tokens_to_keep[i] 137 | sent = zip(*sent) 138 | tag_preds = np.argmax(tag_prob, axis=1) 139 | for token, tag_pred, weight in zip(sent, tag_preds[1:], weights[1:]): 140 | token = list(token) 141 | token.insert(5, '_') 142 | token.append('_') 143 | token.append('_') 144 | token[3] = self.vocabs['tags'][tag_pred] 145 | f.write('\t'.join(token)+'\n') 146 | f.write('\n') 147 | return 148 | 149 | #============================================================= 150 | @property 151 | def train_keys(self): 152 | return ('n_tokens', 'n_seqs', 'loss', 'n_correct', 'n_seqs_correct') 153 | 154 | #============================================================= 155 | @property 156 | def valid_keys(self): 157 | return ('preds', ) 158 | 159 | #============================================================= 160 | @property 161 | def parse_keys(self): 162 | return ('probs', 'tokens_to_keep') 163 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/base_xtagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import codecs 24 | import numpy as np 25 | import tensorflow as tf 26 | import matplotlib.pyplot as plt 27 | 28 | from parser.misc.colors import ctext, color_pattern 29 | from parser.neural.models.nn import NN 30 | 31 | #*************************************************************** 32 | class BaseXTagger(NN): 33 | """""" 34 | 35 | PAD = 0 36 | ROOT = 1 37 | 38 | #============================================================= 39 | def __call__(self, vocabs, moving_params=None): 40 | """""" 41 | 42 | self.moving_params = moving_params 43 | if isinstance(vocabs, dict): 44 | self.vocabs = vocabs 45 | else: 46 | self.vocabs = {vocab.name: vocab for vocab in vocabs} 47 | 48 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs] 49 | embed = self.embed_concat(input_vocabs) 50 | for vocab in self.vocabs.values(): 51 | if vocab not in input_vocabs: 52 | vocab.generate_placeholder() 53 | placeholder = self.vocabs['words'].placeholder 54 | if len(placeholder.get_shape().as_list()) == 3: 55 | placeholder = placeholder[:,:,0] 56 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT)) 57 | self._batch_size = tf.shape(placeholder)[0] 58 | self._bucket_size = tf.shape(placeholder)[1] 59 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1) 60 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep)) 61 | 62 | top_recur = embed 63 | for i in xrange(self.n_layers): 64 | with tf.variable_scope('RNN%d' % i): 65 | top_recur, _ = self.RNN(top_recur, self.recur_size) 66 | return top_recur 67 | 68 | #============================================================= 69 | def process_accumulators(self, accumulators, time=None): 70 | """""" 71 | 72 | n_tokens, n_seqs, loss, corr, xcorr, seq_corr = accumulators 73 | acc_dict = { 74 | 'Loss': loss, 75 | 'TS': corr/n_tokens*100, 76 | 'XTS': xcorr/n_tokens*100, 77 | 'SS': seq_corr/n_seqs*100, 78 | } 79 | if time is not None: 80 | acc_dict.update({ 81 | 'Token_rate': n_tokens / time, 82 | 'Seq_rate': n_seqs / time, 83 | }) 84 | return acc_dict 85 | 86 | #============================================================= 87 | def update_history(self, history, accumulators): 88 | """""" 89 | 90 | acc_dict = self.process_accumulators(accumulators) 91 | for key, value in acc_dict.iteritems(): 92 | history[key].append(value) 93 | return history['TS'][-1] 94 | 95 | #============================================================= 96 | def print_accuracy(self, accumulators, time, prefix='Train'): 97 | """""" 98 | 99 | acc_dict = self.process_accumulators(accumulators, time=time) 100 | strings = [] 101 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red')) 102 | strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan')) 103 | strings.append(color_pattern('XTS:', '{XTS:5.2f}%', 'bright_cyan')) 104 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green')) 105 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta')) 106 | string = ctext('{0} ', 'bold') + ' | '.join(strings) 107 | print(string.format(prefix, **acc_dict)) 108 | return 109 | 110 | #============================================================= 111 | def plot(self, history, prefix='Train'): 112 | """""" 113 | 114 | pass 115 | 116 | #============================================================= 117 | def check(self, preds, sents, 
fileobj): 118 | """""" 119 | 120 | for tokens, preds, xpreds in zip(sents, preds[0], preds[1]): 121 | for token, pred, xpred in zip(zip(*tokens), preds, xpreds): 122 | tag = self.vocabs['tags'][pred] 123 | xtag = self.vocabs['xtags'][xpred] 124 | fileobj.write('\t'.join(token+(tag, xtag))+'\n') 125 | fileobj.write('\n') 126 | return 127 | 128 | #============================================================= 129 | def write_probs(self, sents, output_file, probs, inv_idxs): 130 | """""" 131 | 132 | # Turns list of tuples of tensors into list of matrices 133 | tag_probs = [tag_prob for batch in probs for tag_prob in batch[0]] 134 | xtag_probs = [xtag_prob for batch in probs for xtag_prob in batch[1]] 135 | tokens_to_keep = [weight for batch in probs for weight in batch[2]] 136 | tokens = [sent for batch in sents for sent in batch] 137 | 138 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f: 139 | for i in inv_idxs: 140 | sent, tag_prob, xtag_prob, weights = tokens[i], tag_probs[i], xtag_probs[i], tokens_to_keep[i] 141 | sent = zip(*sent) 142 | tag_preds = np.argmax(tag_prob, axis=1) 143 | xtag_preds = np.argmax(xtag_prob, axis=1) 144 | for token, tag_pred, xtag_pred, weight in zip(sent, tag_preds[1:], xtag_preds[1:], weights[1:]): 145 | token = list(token) 146 | token.insert(5, '_') 147 | token.append('_') 148 | token.append('_') 149 | token[3] = self.vocabs['tags'][tag_pred] 150 | token[4] = self.vocabs['xtags'][xtag_pred] 151 | f.write('\t'.join(token)+'\n') 152 | f.write('\n') 153 | return 154 | 155 | #============================================================= 156 | @property 157 | def train_keys(self): 158 | return ('n_tokens', 'n_seqs', 'loss', 'n_tag_correct', 'n_xtag_correct', 'n_seqs_correct') 159 | 160 | #============================================================= 161 | @property 162 | def valid_keys(self): 163 | return ('tag_preds', 'xtag_preds') 164 | 165 | #============================================================= 166 | @property 167 | def parse_keys(self): 168 | return ('tag_probs', 'xtag_probs', 'tokens_to_keep') 169 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
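tagger.py below is the simplest head: one MLP, a linear layer over the tag vocabulary, a cross-entropy loss weighted by tokens_to_keep, and correctness indicators masked the same way. Its accumulators become the TS (token score) and SS (sentence score) percentages printed by the base class; a small worked example with invented numbers:

import numpy as np

correct = np.array([[0, 1, 1, 1, 0, 0],                # masked per-token correctness
                    [0, 1, 0, 1, 1, 0]])               # (ROOT and pad positions are 0)
sequence_lengths = np.array([4, 5])                    # ROOT plus real tokens
n_tokens, n_seqs = 7, 2
n_correct = correct.sum()                                              # 6
n_seqs_correct = (correct.sum(axis=1) == sequence_lengths - 1).sum()   # 1
TS = n_correct / float(n_tokens) * 100                                 # ~85.7
SS = n_seqs_correct / float(n_seqs) * 100                              # 50.0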
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.taggers.base_tagger import BaseTagger 26 | 27 | #*************************************************************** 28 | class Tagger(BaseTagger): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(Tagger, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | mlp = self.MLP(top_recur, self.mlp_size) 40 | 41 | with tf.variable_scope('Tag'): 42 | logits = self.linear(mlp, len(self.vocabs['tags'])) 43 | probs = tf.nn.softmax(logits) 44 | preds = tf.to_int32(tf.argmax(logits, axis=-1)) 45 | targets = self.vocabs['tags'].placeholder 46 | correct = tf.to_int32(tf.equal(preds, targets))*int_tokens_to_keep 47 | loss = tf.losses.sparse_softmax_cross_entropy(targets, logits, self.tokens_to_keep) 48 | 49 | 50 | n_correct = tf.reduce_sum(correct) 51 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 52 | 53 | outputs = { 54 | 'logits': logits, 55 | 'probs': probs, 56 | 'preds': preds, 57 | 'targets': targets, 58 | 'correct': correct, 59 | 'loss': loss, 60 | 'n_correct': n_correct, 61 | 62 | 'n_tokens': self.n_tokens, 63 | 'n_seqs': self.batch_size, 64 | 'tokens_to_keep': self.tokens_to_keep, 65 | 'n_correct': n_correct, 66 | 'n_seqs_correct': n_seqs_correct, 67 | 'loss': loss 68 | } 69 | 70 | return outputs 71 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/xtagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.taggers.base_xtagger import BaseXTagger 26 | 27 | #*************************************************************** 28 | class XTagger(BaseXTagger): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(XTagger, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | tag_mlp, xtag_mlp = self.MLP(top_recur, self.mlp_size, n_splits=2) 40 | 41 | with tf.variable_scope('Tag'): 42 | tag_logits = self.linear(tag_mlp, len(self.vocabs['tags'])) 43 | tag_probs = tf.nn.softmax(tag_logits) 44 | tag_preds = tf.to_int32(tf.argmax(tag_logits, axis=-1)) 45 | tag_targets = self.vocabs['tags'].placeholder 46 | tag_correct = tf.to_int32(tf.equal(tag_preds, tag_targets))*int_tokens_to_keep 47 | tag_loss = tf.losses.sparse_softmax_cross_entropy(tag_targets, tag_logits, self.tokens_to_keep) 48 | 49 | with tf.variable_scope('XTag'): 50 | xtag_logits = self.linear(xtag_mlp, len(self.vocabs['xtags'])) 51 | xtag_probs = tf.nn.softmax(xtag_logits) 52 | xtag_preds = tf.to_int32(tf.argmax(xtag_logits, axis=-1)) 53 | xtag_targets = self.vocabs['xtags'].placeholder 54 | xtag_correct = tf.to_int32(tf.equal(xtag_preds, xtag_targets))*int_tokens_to_keep 55 | xtag_loss = tf.losses.sparse_softmax_cross_entropy(xtag_targets, xtag_logits, self.tokens_to_keep) 56 | 57 | correct = tag_correct * xtag_correct 58 | n_correct = tf.reduce_sum(correct) 59 | n_tag_correct = tf.reduce_sum(tag_correct) 60 | n_xtag_correct = tf.reduce_sum(xtag_correct) 61 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 62 | loss = tag_loss + xtag_loss 63 | 64 | outputs = { 65 | 'tag_logits': tag_logits, 66 | 'tag_probs': tag_probs, 67 | 'tag_preds': tag_preds, 68 | 'tag_targets': tag_targets, 69 | 'tag_correct': tag_correct, 70 | 'tag_loss': tag_loss, 71 | 'n_tag_correct': n_tag_correct, 72 | 73 | 'xtag_logits': xtag_logits, 74 | 'xtag_probs': xtag_probs, 75 | 'xtag_preds': xtag_preds, 76 | 'xtag_targets': xtag_targets, 77 | 'xtag_correct': xtag_correct, 78 | 'xtag_loss': xtag_loss, 79 | 'n_xtag_correct': n_xtag_correct, 80 | 81 | 'n_tokens': self.n_tokens, 82 | 'n_seqs': self.batch_size, 83 | 'tokens_to_keep': self.tokens_to_keep, 84 | 'n_correct': n_correct, 85 | 'n_seqs_correct': n_seqs_correct, 86 | 'loss': loss 87 | } 88 | 89 | return outputs 90 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from sgd_optimizer import SGDOptimizer 19 | from radam_optimizer import RadamOptimizer -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/optimizers/radam_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.optimizers.base_optimizer import BaseOptimizer 25 | 26 | #*************************************************************** 27 | class RadamOptimizer(BaseOptimizer): 28 | """""" 29 | 30 | #============================================================= 31 | def _init_acc(self, var_list, grads): 32 | """""" 33 | 34 | super(RadamOptimizer, self)._init_acc(var_list, grads) 35 | for x_tm1, g_t in zip(var_list, grads): 36 | if self.mu > 0: 37 | self.get_accumulator(x_tm1, 'm') 38 | shape = self.get_variable_shape(x_tm1) 39 | if isinstance(g_t, tf.Tensor): 40 | self.get_accumulator(x_tm1, 'm/tm1', []) 41 | else: 42 | self.get_accumulator(x_tm1, 'm/tm1', [shape[0]]+[1]*(len(shape)-1)) 43 | if self.nu > 0: 44 | self.get_accumulator(x_tm1, 'v') 45 | shape = self.get_variable_shape(x_tm1) 46 | if isinstance(g_t, tf.Tensor): 47 | self.get_accumulator(x_tm1, 'v/tm1', []) 48 | else: 49 | self.get_accumulator(x_tm1, 'v/tm1', [shape[0]]+[1]*(len(shape)-1)) 50 | return 51 | 52 | #============================================================= 53 | def _apply_dense(self, cache): 54 | """""" 55 | 56 | x_tm1, g_t = cache['x_tm1'], cache['g_t'] 57 | updates = cache['updates'] 58 | 59 | if self.mu > 0: 60 | m_t, t_m = self._dense_moving_average(x_tm1, g_t, 'm', beta=self.mu) 61 | m_bar_t = (1-self.gamma) * m_t + self.gamma * g_t 62 | updates.extend([m_t, t_m]) 63 | else: 64 | m_bar_t = g_t 65 | 66 | if self.nu > 0: 67 | v_t, t_v = self._dense_moving_average(x_tm1, g_t**2, 'v', beta=self.nu) 68 | v_bar_t = tf.sqrt(v_t + self.epsilon) 69 | updates.extend([v_t, t_v]) 70 | else: 71 | v_bar_t = 1 72 | 73 | s_t = self.learning_rate * m_bar_t / v_bar_t 74 | cache['s_t'] = tf.where(tf.is_finite(s_t), s_t, tf.zeros_like(s_t)) 75 | return cache 76 | 77 | #============================================================= 78 | def _apply_sparse(self, cache): 79 | """""" 80 | 81 | x_tm1, g_t, idxs = cache['x_tm1'], cache['g_t'], cache['idxs'] 82 | idxs, idxs_ = tf.unique(idxs) 83 | g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs)) 84 | updates = cache['updates'] 85 | 86 | if self.mu > 0: 87 | m_t, t_m = self._sparse_moving_average(x_tm1, idxs, g_t_, 'm', beta=self.mu) 88 | m_t_ = tf.gather(m_t, idxs) 89 | m_bar_t_ = 
(1-self.gamma) * m_t_ + self.gamma * g_t_ 90 | updates.extend([m_t, t_m]) 91 | else: 92 | m_bar_t_ = g_t_ 93 | 94 | if self.nu > 0: 95 | v_t, t_v = self._sparse_moving_average(x_tm1, idxs, g_t_**2, 'v', beta=self.nu) 96 | v_t_ = tf.gather(v_t, idxs) 97 | v_bar_t_ = tf.sqrt(v_t_ + self.epsilon) 98 | updates.extend([v_t, t_v]) 99 | else: 100 | v_bar_t_ = 1 101 | 102 | s_t_ = self.learning_rate * m_bar_t_ / v_bar_t_ 103 | cache['s_t'] = tf.where(tf.is_finite(s_t_), s_t_, tf.zeros_like(s_t_)) 104 | cache['g_t'] = g_t_ 105 | cache['idxs'] = idxs 106 | return cache 107 | 108 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/optimizers/sgd_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.optimizers.base_optimizer import BaseOptimizer 25 | 26 | #*************************************************************** 27 | class SGDOptimizer(BaseOptimizer): 28 | """""" 29 | 30 | #============================================================= 31 | def _apply_dense(self, cache): 32 | """""" 33 | 34 | g_t = cache['g_t'] 35 | cache['s_t'] = self.learning_rate * g_t 36 | return cache 37 | 38 | #============================================================= 39 | def _apply_sparse(self, cache): 40 | """""" 41 | 42 | g_t, idxs = cache['g_t'], cache['idxs'] 43 | idxs, idxs_ = tf.unique(idxs) 44 | g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs)) 45 | 46 | cache['g_t'] = g_t_ 47 | cache['idxs'] = idxs 48 | cache['s_t'] = self.learning_rate * g_t_ 49 | 50 | return cache 51 | 52 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/.directory: -------------------------------------------------------------------------------- 1 | [Dolphin] 2 | Timestamp=2016,10,21,3,50,28 3 | Version=3 4 | ViewMode=1 5 | VisibleRoles=Details_text,Details_size,Details_date,Details_wordCount,Details_lineCount,CustomizedDetails 6 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
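Before the recurrent cells, it helps to spell out the update that RadamOptimizer above applies to each dense parameter: running first and second moments of the gradient are kept, the applied moment is the Nesterov-style interpolation (1 - gamma) * m + gamma * g, and the step is the learning rate times that quantity divided by sqrt(v + epsilon), with non-finite steps zeroed out. The sketch below assumes plain exponential moving averages; the repo's _dense_moving_average is inherited from BaseOptimizer, whose file is not shown here and may smooth or bias-correct differently, and the default hyperparameter values are placeholders:

import numpy as np

def radam_like_step(g, m, v, lr=2e-3, mu=0.9, nu=0.9, gamma=0.1, eps=1e-12):
    # One dense update in the spirit of RadamOptimizer._apply_dense.
    m = mu * m + (1 - mu) * g                  # first-moment accumulator 'm'
    v = nu * v + (1 - nu) * g ** 2             # second-moment accumulator 'v'
    m_bar = (1 - gamma) * m + gamma * g        # Nesterov-style interpolation
    s = lr * m_bar / np.sqrt(v + eps)          # the step s_t
    s = np.where(np.isfinite(s), s, 0.0)       # drop non-finite steps, as in the code
    return s, m, v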
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from rnn_cell import RNNCell 19 | from gru_cell import GRUCell 20 | from cif_lstm_cell import CifLSTMCell 21 | from lstm_cell import LSTMCell 22 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/base_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.configurable import Configurable 25 | 26 | #*************************************************************** 27 | class BaseCell(Configurable): 28 | """""" 29 | 30 | #============================================================= 31 | def __init__(self, output_size, *args, **kwargs): 32 | """""" 33 | 34 | self._output_size = output_size 35 | input_size = kwargs.pop('input_size', self._output_size) 36 | self.moving_params = kwargs.pop('moving_params', None) 37 | super(BaseCell, self).__init__(*args, **kwargs) 38 | self._input_size = input_size if input_size is not None else self.output_size 39 | 40 | #============================================================= 41 | def __call__(self, inputs, state, scope=None): 42 | """""" 43 | 44 | raise NotImplementedError() 45 | 46 | #============================================================= 47 | def zero_state(self, batch_size, dtype): 48 | """""" 49 | 50 | zero_state = tf.get_variable('Zero_state', 51 | shape=self.state_size, 52 | dtype=dtype, 53 | initializer=tf.zeros_initializer()) 54 | state = tf.reshape(tf.tile(zero_state, tf.stack([batch_size])), tf.stack([batch_size, self.state_size])) 55 | state.set_shape([None, self.state_size]) 56 | return state 57 | 58 | #============================================================= 59 | @property 60 | def input_size(self): 61 | return self._input_size 62 | @property 63 | def output_size(self): 64 | return self._output_size 65 | @property 66 | def state_size(self): 67 | raise NotImplementedError() 68 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/cif_lstm_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 
6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | from parser.neural.functions import gate 27 | 28 | #*************************************************************** 29 | class CifLSTMCell(BaseCell): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, inputs, state, scope=None): 34 | """""" 35 | 36 | with tf.variable_scope(scope or type(self).__name__): 37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1) 38 | input_list = [inputs, hidden_tm1] 39 | lin = linear(input_list, 40 | self.output_size, 41 | add_bias=True, 42 | n_splits=3, 43 | moving_params=self.moving_params) 44 | cell_act, update_act, output_act = lin 45 | 46 | cell_tilde_t = cell_act 47 | update_gate = gate(update_act-self.forget_bias) 48 | output_gate = gate(output_act) 49 | cell_t = update_gate * cell_tilde_t + (1-update_gate) * cell_tm1 50 | hidden_tilde_t = self.recur_func(cell_t) 51 | hidden_t = hidden_tilde_t * output_gate 52 | 53 | return hidden_t, tf.concat([cell_t, hidden_t], 1) 54 | 55 | #============================================================= 56 | @property 57 | def state_size(self): 58 | return self.output_size * 2 59 | 60 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/gru_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | from parser.neural.functions import gate 27 | 28 | #*************************************************************** 29 | class GRUCell(BaseCell): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, inputs, state, scope=None): 34 | """""" 35 | 36 | with tf.variable_scope(scope or type(self).__name__): 37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1) 38 | input_list = [inputs, hidden_tm1] 39 | with tf.variable_scope('Gates'): 40 | gates = linear(inputs_list, 41 | self.output_size, 42 | add_bias=True, 43 | n_splits=2, 44 | moving_params=self.moving_params) 45 | update_act, reset_act = gates 46 | update_gate = gate(update_act-self.forget_bias) 47 | reset_gate = gate(reset_act) 48 | reset_state = reset_gate * hidden_tm1 49 | input_list = [inputs, reset_state] 50 | with tf.variable_scope('Candidate'): 51 | hidden_act = linear(input_list, 52 | self.output_size, 53 | add_bias=True, 54 | moving_params=self.moving_params) 55 | hidden_tilde = self.recur_func(hidden_act) 56 | cell_t = update_gate * cell_tm1 + (1-update_gate) * hidden_tilde 57 | return cell_t, tf.concat([cell_t, cell_t], 1) 58 | 59 | #============================================================= 60 | @property 61 | def state_size(self): 62 | return self.output_size * 2 63 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/lstm_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | from parser.neural.functions import gate, tanh 27 | 28 | #*************************************************************** 29 | class LSTMCell(BaseCell): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, inputs, state, scope=None): 34 | """""" 35 | 36 | with tf.variable_scope(scope or type(self).__name__): 37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1) 38 | input_list = [inputs, hidden_tm1] 39 | lin = linear(input_list, 40 | self.output_size, 41 | add_bias=True, 42 | n_splits=4, 43 | moving_params=self.moving_params) 44 | cell_act, input_act, forget_act, output_act = lin 45 | 46 | cell_tilde_t = tanh(cell_act) 47 | input_gate = gate(input_act) 48 | forget_gate = gate(forget_act-self.forget_bias) 49 | output_gate = gate(output_act) 50 | cell_t = input_gate * cell_tilde_t + (1-forget_gate) * cell_tm1 51 | hidden_tilde_t = self.recur_func(cell_t) 52 | hidden_t = hidden_tilde_t * output_gate 53 | 54 | return hidden_t, tf.concat([cell_t, hidden_t], 1) 55 | 56 | #============================================================= 57 | @property 58 | def state_size(self): 59 | return self.output_size * 2 60 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/rnn_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
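The recurrent cells above share one template: a single linear layer over [inputs, hidden_tm1] is split into gate and candidate activations, the gate activations are squashed, and the new (cell, hidden) pair is packed into one state tensor of width 2 * output_size. CifLSTMCell is the coupled input-forget variant, where one update gate interpolates between the unsquashed candidate and the previous cell. A NumPy sketch of that step, assuming gate from parser.neural.functions is a logistic sigmoid and recur_func is tanh (both are configurable in the repo):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def cif_lstm_step(cell_act, update_act, output_act, cell_tm1,
                  forget_bias=0.0, recur_func=np.tanh):
    # Mirrors CifLSTMCell.__call__ after the joint linear layer has been split
    # into three activations; the forget_bias default here is an assumption.
    update_gate = sigmoid(update_act - forget_bias)
    output_gate = sigmoid(output_act)
    cell_t = update_gate * cell_act + (1 - update_gate) * cell_tm1
    hidden_t = recur_func(cell_t) * output_gate
    return hidden_t, np.concatenate([cell_t, hidden_t], axis=-1)

lstm_cell.py above follows the same pattern with four activations (note that it pairs an input gate with (1 - forget_gate) on the previous cell), and the plain RNNCell that follows drops the gating entirely.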
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | 27 | #*************************************************************** 28 | class RNNCell(BaseCell): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, inputs, state, scope=None): 33 | """""" 34 | 35 | with tf.variable_scope(scope or type(self).__name__): 36 | inputs_list = [inputs, state] 37 | hidden_act = linear(inputs_list, 38 | self.output_size, 39 | add_bias=True, 40 | moving_params=self.moving_params) 41 | hidden = self.recur_func(hidden_act) 42 | return hidden, hidden 43 | 44 | #============================================================= 45 | @property 46 | def state_size(self): 47 | return self.output_size 48 | 49 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/rnn.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
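rnn.py below is a hand-rolled dynamic RNN (adapted from TensorFlow's) whose main addition is variational-style dropout: the feed-forward inputs are dropped with noise_shape [1, batch_size, depth], so one mask is shared across all time steps, and a single recurrent dropout mask is sampled once and multiplied into the hidden part of the state at every step of the while-loop. A small NumPy sketch of the shared-mask idea (illustrative, not the graph code):

import numpy as np

def shared_timestep_dropout(inputs, keep_prob, rng=np.random):
    # inputs: (T x B x D); one Bernoulli mask per (batch, feature) position,
    # reused at every time step, matching noise_shape=[1, batch_size, depth].
    mask = (rng.uniform(size=(1,) + inputs.shape[1:]) < keep_prob) / keep_prob
    return inputs * mask

x = np.ones((10, 2, 4))                  # T=10, B=2, D=4
y = shared_timestep_dropout(x, keep_prob=0.75)
assert np.all(y[0] == y[5])              # the same positions are dropped at every time step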
14 | # ============================================================================== 15 | 16 | """RNN helpers for TensorFlow models.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | import parser.neural.linalg as linalg 26 | 27 | #=============================================================== 28 | def birnn(cell, inputs, sequence_length, initial_state_fw=None, initial_state_bw=None, ff_keep_prob=1., recur_keep_prob=1., dtype=tf.float32, scope=None): 29 | """""" 30 | 31 | # Forward direction 32 | with tf.variable_scope(scope or 'BiRNN_FW') as fw_scope: 33 | output_fw, output_state_fw = rnn(cell, inputs, sequence_length, initial_state_fw, ff_keep_prob, recur_keep_prob, dtype, scope=fw_scope) 34 | 35 | # Backward direction 36 | rev_inputs = tf.reverse_sequence(inputs, sequence_length, 1, 0) 37 | with tf.variable_scope(scope or 'BiRNN_BW') as bw_scope: 38 | output_bw, output_state_bw = rnn(cell, rev_inputs, sequence_length, initial_state_bw, ff_keep_prob, recur_keep_prob, dtype, scope=bw_scope) 39 | output_bw = tf.reverse_sequence(output_bw, sequence_length, 1, 0) 40 | # Concat each of the forward/backward outputs 41 | outputs = tf.concat([output_fw, output_bw], 2) 42 | 43 | return outputs, tf.tuple([output_state_fw, output_state_bw]) 44 | 45 | #=============================================================== 46 | def rnn(cell, inputs, sequence_length=None, initial_state=None, ff_keep_prob=1., recur_keep_prob=1., dtype=tf.float32, scope=None): 47 | """""" 48 | 49 | inputs = tf.transpose(inputs, [1, 0, 2]) # (B,T,D) => (T,B,D) 50 | 51 | parallel_iterations = 32 52 | if sequence_length is not None: 53 | sequence_length = tf.to_int32(sequence_length) 54 | 55 | with tf.variable_scope(scope or 'RNN') as varscope: 56 | #if varscope.caching_device is None: 57 | # varscope.set_caching_device(lambda op: op.device) 58 | input_shape = tf.shape(inputs) 59 | time_steps, batch_size, _ = tf.unstack(input_shape, 3) 60 | const_time_steps, const_batch_size, const_depth = inputs.get_shape().as_list() 61 | 62 | if initial_state is not None: 63 | state = initial_state 64 | else: 65 | if not dtype: 66 | raise ValueError('If no initial_state is provided, dtype must be.') 67 | state = cell.zero_state(batch_size, dtype) 68 | 69 | zero_output = tf.zeros(tf.stack([batch_size, cell.output_size]), inputs.dtype) 70 | if sequence_length is not None: 71 | min_sequence_length = tf.reduce_min(sequence_length) 72 | max_sequence_length = tf.reduce_max(sequence_length) 73 | 74 | time = tf.constant(0, dtype=tf.int32, name='time') 75 | 76 | output_ta = tf.TensorArray(dtype=inputs.dtype, 77 | size=time_steps, 78 | tensor_array_name='dynamic_rnn_output') 79 | 80 | input_ta = tf.TensorArray(dtype=inputs.dtype, 81 | size=time_steps, 82 | tensor_array_name='dynamic_rnn_input') 83 | 84 | if ff_keep_prob < 1: 85 | noise_shape = tf.stack([1, batch_size, const_depth]) 86 | inputs = tf.nn.dropout(inputs, ff_keep_prob, noise_shape=noise_shape) 87 | 88 | if recur_keep_prob < 1: 89 | ones = tf.ones(tf.stack([batch_size, cell.output_size])) 90 | state_dropout = tf.nn.dropout(ones, recur_keep_prob) 91 | state_dropout = tf.concat([ones] * (cell.state_size // cell.output_size - 1) + [state_dropout], 1) 92 | else: 93 | state_dropout = 1 94 | 95 | input_ta = input_ta.unstack(inputs) 96 | 97 | #----------------------------------------------------------- 98 | def _time_step(time, state, output_ta_t): 99 
| """""" 100 | 101 | input_t = input_ta.read(time) 102 | 103 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 104 | def _empty_update(): 105 | return zero_output, state 106 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 107 | def _call_cell(): 108 | return cell(input_t, state * state_dropout) 109 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 110 | def _maybe_copy_some_through(): 111 | new_output, new_state = _call_cell() 112 | 113 | return tf.cond( 114 | time < min_sequence_length, 115 | lambda: (new_output, new_state), 116 | lambda: (tf.where(time >= sequence_length, zero_output, new_output), 117 | tf.where(time >= sequence_length, state, new_state))) 118 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 119 | 120 | if sequence_length is not None: 121 | output, new_state = tf.cond( 122 | time >= max_sequence_length, 123 | _empty_update, 124 | _maybe_copy_some_through) 125 | else: 126 | (output, new_state) = _call_cell() 127 | 128 | output_ta_t = output_ta_t.write(time, output) 129 | 130 | return (time + 1, new_state, output_ta_t) 131 | #----------------------------------------------------------- 132 | 133 | _, final_state, output_final_ta = tf.while_loop( 134 | cond=lambda time, _1, _2: time < time_steps, 135 | body=_time_step, 136 | loop_vars=(time, state, output_ta), 137 | parallel_iterations=parallel_iterations) 138 | 139 | final_outputs = output_final_ta.stack() 140 | 141 | outputs = tf.transpose(final_outputs, [1, 0, 2]) # (T,B,D) => (B,T,D) 142 | return outputs, final_state 143 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/compression_ratio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import argparse 25 | import codecs 26 | from backports import lzma 27 | 28 | import numpy as np 29 | from numpy.linalg import inv 30 | import matplotlib.pyplot as plt 31 | from collections import Counter 32 | 33 | #*************************************************************** 34 | if __name__ == '__main__': 35 | """""" 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('-k', '--k_trials', type=int, default=100) 39 | parser.add_argument('-n', '--n_words', type=int, default=5000) 40 | parser.add_argument('files', nargs='+') 41 | 42 | args = parser.parse_args() 43 | type_counter = Counter() 44 | for filename in args.files: 45 | with codecs.open(filename, encoding='utf-8', errors='ignore') as f: 46 | for line in f: 47 | line = line.strip() 48 | if line: 49 | if not re.match('#|[0-9]+[-.][0-9]+', line): 50 | type_counter[line.split('\t')[1]] += 1 51 | 52 | types = type_counter.keys() 53 | total = sum(type_counter.values()) 54 | probs = [type_counter[type_] / total for type_ in types] 55 | 56 | trials = [] 57 | n_words = min(args.n_words, len(types)) or len(types) 58 | for _ in xrange(args.k_trials): 59 | chosen_types = np.random.choice(types, size=n_words, replace=False, p=probs) 60 | with codecs.open('uncompressed.txt', 'w', encoding='utf-8', errors='ignore') as f: 61 | f.write('\n'.join(chosen_types)) 62 | with lzma.open('compressed.txt.xz', 'wb') as f: 63 | f.write('\n'.join(chosen_types).encode('utf-8', 'ignore')) 64 | trials.append(os.path.getsize('compressed.txt.xz')/os.path.getsize('uncompressed.txt')) 65 | os.remove('uncompressed.txt') 66 | os.remove('compressed.txt.xz') 67 | print(np.mean(trials)) 68 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/count_nonprojective.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
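count_nonprojective.py below reads CoNLL-U trees and reports, per file, the percentage of dependencies that are non-projective: an arc between a dependent and its head is flagged when some token strictly inside their span is itself headed outside that span. The same test, restated compactly (dep2head maps token index to head index, as in the DepTree class):

def is_nonprojective(dep, dep2head):
    # True if the arc between dep and its head is crossed by some other arc.
    head = dep2head[dep]
    lo, hi = min(dep, head), max(dep, head)
    return any(dep2head[mid] < lo or dep2head[mid] > hi
               for mid in range(lo + 1, hi))

dep2head = {1: 0, 2: 4, 3: 1, 4: 1}      # the arc 4->2 crosses the arc 1->3
print([d for d in dep2head if is_nonprojective(d, dep2head)])   # [2, 3]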
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import argparse 25 | 26 | import numpy as np 27 | from collections import defaultdict 28 | 29 | #*************************************************************** 30 | class DepTree: 31 | """""" 32 | 33 | #============================================================= 34 | def __init__(self, buff): 35 | """""" 36 | 37 | self._head2deps = defaultdict(list) 38 | self._dep2head = dict() 39 | self._str = [] 40 | for line in buff: 41 | dep_idx = int(line[0]) 42 | head_idx = int(line[6]) 43 | self.head2deps[head_idx].append(dep_idx) 44 | self.dep2head[dep_idx] = head_idx 45 | self._str.append(line[1]) 46 | return 47 | 48 | #============================================================= 49 | def count_nonprojective(self): 50 | """""" 51 | 52 | nonproj = [] 53 | for dep in self: 54 | head = self.dep2head[dep] 55 | span_min = min(dep, head) 56 | span_max = max(dep, head) 57 | for mid_dep in xrange(span_min+1, span_max): 58 | mid_head = self.dep2head[mid_dep] 59 | if mid_head < span_min or mid_head > span_max: 60 | crossing = True 61 | break 62 | else: 63 | crossing = False 64 | nonproj.append(int(crossing)) 65 | return nonproj 66 | 67 | #============================================================= 68 | @property 69 | def head2deps(self): 70 | return self._head2deps 71 | @property 72 | def dep2head(self): 73 | return self._dep2head 74 | 75 | #============================================================= 76 | def __iter__(self): 77 | return (dep for dep in self.dep2head) 78 | def __len__(self): 79 | return len(self.dep2head) 80 | def __str__(self): 81 | return ' '.join(self._str)+'\n' 82 | 83 | #*************************************************************** 84 | if __name__ == '__main__': 85 | """""" 86 | 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('files', nargs='+') 89 | 90 | args = parser.parse_args() 91 | for filename in args.files: 92 | lang = re.search('([-\w]*)-ud', filename).group(1) 93 | nonproj = [] 94 | with open(filename) as f: 95 | buff = [] 96 | for line in f: 97 | line = line.strip() 98 | if line: 99 | if not re.match('#|[0-9]+[-.][0-9]+', line): 100 | buff.append(line.split('\t')) 101 | else: 102 | tree = DepTree(buff) 103 | nonproj.extend(tree.count_nonprojective()) 104 | buff = [] 105 | print(lang, np.mean(nonproj)*100) 106 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/heaps_law.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
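heaps_law.py below estimates the Heaps'-law exponent of a treebank: it shuffles the tokens, tracks the number of distinct types V(n) seen after n tokens, and fits log V(n) ~ K + b log n, printing the exponent b. Because that objective is quadratic in (K, b), the single Newton step in the script already lands on the exact least-squares solution, so an ordinary log-log polyfit gives the same estimate; a short sanity-check version:

import numpy as np

def heaps_exponent(tokens):
    # V(n): number of distinct types after n shuffled tokens; fit log V = K + b log n.
    tokens = list(tokens)
    np.random.shuffle(tokens)
    seen, v_of_n = set(), []
    for tok in tokens:
        seen.add(tok)
        v_of_n.append(len(seen))
    n = np.arange(1, len(tokens) + 1)
    b, K = np.polyfit(np.log(n), np.log(v_of_n), 1)    # slope = Heaps exponent b
    return b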
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import argparse 25 | 26 | import numpy as np 27 | from numpy.linalg import inv 28 | import matplotlib.pyplot as plt 29 | from collections import defaultdict 30 | 31 | #*************************************************************** 32 | if __name__ == '__main__': 33 | """""" 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('files', nargs='+') 37 | 38 | args = parser.parse_args() 39 | words = [] 40 | types = set() 41 | n_types = [] 42 | for filename in args.files: 43 | with open(filename) as f: 44 | for line in f: 45 | line = line.strip() 46 | if line: 47 | if not re.match('#|[0-9]+[-.][0-9]+', line): 48 | words.append(line.split('\t')[1]) 49 | np.random.shuffle(words) 50 | for word in words: 51 | types.add(word) 52 | n_types.append(len(types)) 53 | 54 | K = 1 55 | b = .75 56 | y = n_types 57 | logy = np.log(y) 58 | x = np.arange(len(n_types))+1 59 | logx = np.log(x) 60 | d2ell = np.array([[1, np.mean(logx)],[np.mean(logx), np.mean(logx**2)]]) 61 | d2ellinv = inv(d2ell) 62 | ell = np.mean((logy - b*logx-K)**2 / 2) 63 | dell = np.array([np.mean(K+b*logx-logy), np.mean((K+b*logx-logy)*logx)]) 64 | updates = d2ellinv.dot(dell) 65 | K -= updates[0] 66 | b -= updates[1] 67 | print(b) 68 | #K_ = 5 69 | #b_ = .74 70 | #for i in xrange(20): 71 | # ell = np.mean((y - K_*x**b_)**2 / 2) 72 | # K_ -= 2*np.mean((K_*x**b_-y)*x**b_) / np.mean(x**(2*b_)) 73 | # b_ -= 2*np.mean((K_*x**b_-y)*K_*x**b_*logx) / np.mean((2*K_*x**b_ - y)*K_*x**b_*logx**2) 74 | # print(ell, K_, b_) 75 | #plt.figure() 76 | #plt.grid() 77 | #plt.plot(x, y) 78 | #plt.plot(x, np.exp(b*logx+K)) 79 | #plt.show() 80 | #plt.figure() 81 | #plt.grid() 82 | #plt.plot(x, logy - b*logx-K) 83 | #plt.show() 84 | #plt.figure() 85 | #plt.grid() 86 | #plt.plot(x, y - K_*x**b_) 87 | #plt.show() 88 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/reinsert_compounds.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import sys 7 | import codecs 8 | 9 | input_file = sys.argv[2] 10 | output_file = sys.argv[1] 11 | 12 | lines = [] 13 | 14 | with codecs.open(output_file, encoding='utf-8') as f: 15 | for line in f: 16 | lines.append(line) 17 | 18 | with codecs.open(input_file, encoding='utf-8') as f: 19 | with codecs.open(output_file, 'w', encoding='utf-8') as fout: 20 | i = 0 21 | for line in f: 22 | line = line.strip() 23 | 24 | if len(line) == 0: 25 | fout.write(lines[i]) 26 | i += 1 27 | continue 28 | 29 | if line[0] == '#': 30 | continue 31 | 32 | line = line.split('\t') 33 | if '.' in line[0]: 34 | continue 35 | 36 | if '-' in line[0]: 37 | fout.write('%s\n' % ('\t'.join(line))) 38 | continue 39 | 40 | fout.write(lines[i]) 41 | i += 1 42 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/trash/retrained_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import codecs 24 | from collections import Counter 25 | 26 | import numpy as np 27 | import scipy.linalg as la 28 | import tensorflow as tf 29 | 30 | from parser.vocabs.base_vocab import BaseVocab 31 | 32 | #*************************************************************** 33 | class RetrainedVocab(BaseVocab): 34 | """""" 35 | 36 | #============================================================= 37 | def __init__(self, pretrained_vocab, *args, **kwargs): 38 | """""" 39 | 40 | super(RetrainedVocab, self).__init__(*args, **kwargs) 41 | 42 | self._pretrained_vocab = pretrained_vocab 43 | return 44 | 45 | #============================================================= 46 | def __call__(self): 47 | """""" 48 | 49 | embed_size = self.embed_size 50 | row_idxs = tf.placeholder(tf.int32, shape=(None,), name='row_idxs') 51 | col_idxs = tf.placeholder(tf.int32, shape=(None,), name='col_idxs') 52 | S, U, _ = tf.svd(self.pretrained_vocab.embeddings) 53 | self.embeddings = U[:,:embed_size] * S[:embed_size] 54 | 55 | old_rows = tf.gather(self.pretrained_vocab.embeddings, row_idxs) 56 | old_cols = tf.gather(self.pretrained_vocab.embeddings, col_idxs) 57 | new_rows = tf.gather(self.embeddings, row_idxs) 58 | new_cols = tf.gather(self.embeddings, col_idxs) 59 | old_matmul = tf.matmul(old_rows, old_cols, transpose_b=True) 60 | new_matmul = tf.matmul(new_rows, new_cols, transpose_b=True) 61 | 62 | if self.embed_loss == 'cross_entropy': 63 | old_matmul = tf.expand_dims(tf.nn.softmax(old_matmul), axis=1) 64 | new_matmul = tf.expand_dims(tf.nn.softmax(new_matmul), axis=2) 65 | loss = -tf.reduce_sum(tf.matmul(old_matmul, tf.log(new_matmul))) / tf.to_float(tf.shape(row_idxs)[0]) 66 | elif self.embed_loss == 'l2_loss': 67 | loss = tf.reduce_sum((old_matmul - new_matmul)**2 / 2) / tf.to_float(tf.shape(row_idxs)[0]) 68 | else: 69 | raise ValueError('embed_loss must be in "(cross_entropy, l2_loss)"') 70 | 71 | return {'row_idxs': row_idxs, 72 | 'col_idxs': col_idxs, 73 | 'loss': loss} 74 | 75 | #============================================================= 76 | def dump(self): 77 | """""" 78 | 79 | matrix = self.embeddings.eval() 80 | with codecs.open(self.name+'.txt', 'w') as f: 81 | for idx in xrange(self.START_IDX, len(self)): 82 | f.write('%s %s\n' % (self[idx], ' '.join(matrix[idx]))) 83 | return 84 | 85 | #============================================================= 86 | @property 87 | def pretrained_vocab(self): 88 | return self._pretrained_vocab 89 | 90 | #============================================================= 91 | def __setattr__(self, name, value): 92 | if name == '_pretrained_vocab': 93 | self._str2idx = value._str2idx 94 | self._idx2str = value._idx2str 95 | self._counts = value._counts 96 | super(RetrainedVocab, self).__setattr__(name, value) 97 | 98 | #*************************************************************** 99 | if __name__ == '__main__': 100 | """""" 101 | 102 | from parser import Configurable 103 | 
from parser.vocabs import PretrainedVocab 104 | configurable = Configurable(retrained_vocab={'embed_loss':'cross_entropy', 'retrained_embed_size':50}) 105 | pretrained_vocab = PretrainedVocab.from_configurable(configurable) 106 | retrained_vocab = RetrainedVocab.from_vocab(pretrained_vocab) 107 | retrain_loss = retrained_vocab(pretrained_vocab) 108 | print('RetrainedVocab passes') 109 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/trash/weighted_mean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models import NN 26 | 27 | #*************************************************************** 28 | class WeightedMean(NN): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocab, output_size, moving_params=None): 33 | """""" 34 | 35 | inputs = tf.placeholder(tf.int32, shape=(None,None), name='inputs-%s' % self.name) 36 | 37 | self.tokens_to_keep = tf.to_float(tf.greater(inputs, vocab.PAD)) 38 | self.sequence_lengths = tf.reduce_sum(self.tokens_to_keep, axis=1, keep_dims=True) 39 | self.n_tokens = tf.reduce_sum(self.sequence_lengths) 40 | self.batch_size = tf.shape(inputs)[0] 41 | self.bucket_size = tf.shape(inputs)[1] 42 | self.moving_params = moving_params 43 | 44 | embeddings = vocab.embedding_lookup(inputs, moving_params=self.moving_params) 45 | weighted_embeddings = self.linear_attention(embeddings) 46 | mlp = self.MLP(weighted_embeddings, self.mlp_size) 47 | lin = self.linear(mlp, output_size) 48 | 49 | return {'output': lin, 'inputs': inputs} 50 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/__init__.py: -------------------------------------------------------------------------------- 1 | from index_vocab import IndexVocab, DepVocab, HeadVocab 2 | from pretrained_vocab import PretrainedVocab 3 | from token_vocab import TokenVocab, WordVocab, LemmaVocab, TagVocab, XTagVocab, RelVocab 4 | from subtoken_vocab import SubtokenVocab, CharVocab 5 | from ngram_vocab import NgramVocab 6 | from multivocab import Multivocab 7 | from ngram_multivocab import NgramMultivocab 8 | 9 | __all__ = [ 10 | 'DepVocab', 11 | 'HeadVocab', 12 | 'PretrainedVocab', 13 | 'WordVocab', 14 | 'LemmaVocab', 15 | 'TagVocab', 16 | 'XTagVocab', 17 | 'RelVocab', 18 | 'CharVocab', 19 | 'NgramVocab', 20 | 'Multivocab', 21 | 'NgramMultivocab' 22 | ] 23 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/base_vocab.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | from collections import Counter 25 | 26 | import numpy as np 27 | import tensorflow as tf 28 | 29 | import parser.neural.linalg as linalg 30 | from parser import Configurable 31 | 32 | #*************************************************************** 33 | class BaseVocab(Configurable): 34 | """""" 35 | 36 | #============================================================= 37 | def __init__(self, *args, **kwargs): 38 | """""" 39 | 40 | super(BaseVocab, self).__init__(*args, **kwargs) 41 | 42 | self._cased = super(BaseVocab, self).cased 43 | self._special_tokens = super(BaseVocab, self).special_tokens 44 | self._special_tokens_set = set(self._special_tokens) 45 | self._set_special_tokens() 46 | # NOTE: __setattr__ turns these into dicts 47 | self._str2idx = zip(self.special_tokens, range(len(self.special_tokens))) 48 | self._idx2str = zip(range(len(self.special_tokens)), self.special_tokens) 49 | self._tok2idx = self._str2idx 50 | self._counts = None 51 | self._embeddings = None 52 | # NOTE this placeholder stores the token data indices 53 | # I.e. 
the token's index in the word/tag/glove embedding matrix 54 | # CharVocab will by default be "char" 55 | self.placeholder = None 56 | 57 | #============================================================= 58 | def _set_special_tokens(self): 59 | pattern = re.compile('\W+', re.UNICODE) 60 | for i, token in enumerate(self.special_tokens): 61 | token = token.lstrip('<') 62 | token = token.rstrip('>') 63 | token = token.upper() 64 | token = pattern.sub('', token) 65 | assert token not in self.__dict__ 66 | self.__dict__[token] = i 67 | return 68 | 69 | #============================================================= 70 | @classmethod 71 | def from_vocab(cls, vocab, *args, **kwargs): 72 | """""" 73 | 74 | args += (vocab,) 75 | return cls.from_configurable(vocab, *args, **kwargs) 76 | 77 | #============================================================= 78 | def generate_placeholder(self): 79 | """""" 80 | 81 | if self.placeholder is None: 82 | self.placeholder = tf.placeholder(tf.int32, shape=[None, None], name=self.name) 83 | return self.placeholder 84 | 85 | #============================================================= 86 | def __call__(self, placeholder=None, moving_params=None): 87 | """""" 88 | 89 | placeholder = self.generate_placeholder() if placeholder is None else placeholder 90 | embeddings = self.embeddings if moving_params is None else moving_params.average(self.embeddings) 91 | return tf.nn.embedding_lookup(embeddings, placeholder) 92 | 93 | #============================================================= 94 | def setup(self): 95 | """""" 96 | 97 | self.placeholder = None 98 | return 99 | 100 | #============================================================= 101 | def set_feed_dict(self, data, feed_dict): 102 | """""" 103 | 104 | feed_dict[self.placeholder] = data 105 | return 106 | 107 | #============================================================= 108 | def load(self): 109 | raise NotImplementedError() 110 | def dump(self): 111 | raise NotImplementedError() 112 | def count(self): 113 | raise NotImplementedError() 114 | 115 | #============================================================= 116 | def strings(self): 117 | return self._str2idx.keys() 118 | def indices(self): 119 | return self._str2idx.values() 120 | def iteritems(self): 121 | return self._str2idx.iteritems() 122 | def most_common(self, n=None): 123 | return self._counts.most_common(n) 124 | def index(self, token): 125 | if not self.cased and token not in self._special_tokens_set: 126 | token = token.lower() 127 | return self._tok2idx.get(token, self.UNK) 128 | 129 | #============================================================= 130 | @property 131 | def depth(self): 132 | return None 133 | @property 134 | def special_tokens(self): 135 | return self._special_tokens 136 | @property 137 | def cased(self): 138 | return self._cased 139 | @property 140 | def counts(self): 141 | return self._counts 142 | @property 143 | def embeddings(self): 144 | return self._embeddings 145 | #@embeddings.setter 146 | #def embeddings(self, matrix): 147 | # if matrix.shape[1] != self.embed_size: 148 | # raise ValueError("Matrix shape[1] of %d doesn't match expected shape of %d" % (matrix.shape[1], self.embed_size)) 149 | # with tf.device('/cpu:0'): 150 | # with tf.variable_scope(self.name.title()): 151 | # self._embeddings = tf.Variable(matrix, name='Embeddings', dtype=tf.float32, trainable=True) 152 | # return 153 | 154 | #============================================================= 155 | def __getitem__(self, key): 156 | if isinstance(key, 
basestring): 157 | if not self.cased and key not in self._special_tokens_set: 158 | key = key.lower() 159 | return self._str2idx.get(key, self.UNK) 160 | elif isinstance(key, (int, long, np.int32, np.int64)): 161 | return self._idx2str.get(key, self.special_tokens[self.UNK]) 162 | elif hasattr(key, '__iter__'): 163 | return [self[k] for k in key] 164 | else: 165 | raise ValueError('key to BaseVocab.__getitem__ must be (iterable of) string or integer') 166 | return 167 | 168 | def __setitem__(self, key, value): 169 | if isinstance(key, basestring): 170 | if not self.cased and key not in self._special_tokens_set: 171 | key = key.lower() 172 | self._str2idx[key] = value 173 | self._idx2str[value] = key 174 | elif isinstance(key, (int, long)): 175 | if not self.cased and value not in self._special_tokens_set: 176 | value = value.lower() 177 | self._idx2str[key] = value 178 | self._str2idx[value] = key 179 | elif hasattr(key, '__iter__') and hasattr(value, '__iter__'): 180 | for k, v in zip(key, value): 181 | self[k] = v 182 | else: 183 | raise ValueError('keys and values to BaseVocab.__setitem__ must be (iterable of) string or integer') 184 | 185 | def __contains__(self, key): 186 | if isinstance(key, basestring): 187 | if not self.cased and key not in self._special_tokens_set: 188 | key = key.lower() 189 | return key in self._str2idx 190 | elif isinstance(key, (int, long)): 191 | return key in self._idx2str 192 | else: 193 | raise ValueError('key to BaseVocab.__contains__ must be string or integer') 194 | return 195 | 196 | def __len__(self): 197 | return len(self._str2idx) 198 | 199 | def __iter__(self): 200 | return (key for key in sorted(self._str2idx, key=self._str2idx.get)) 201 | 202 | def __setattr__(self, name, value): 203 | if name in ('_str2idx', '_idx2str', '_str2idxs'): 204 | value = dict(value) 205 | elif name == '_counts': 206 | value = Counter(value) 207 | super(BaseVocab, self).__setattr__(name, value) 208 | return 209 | 210 | #*************************************************************** 211 | if __name__ == '__main__': 212 | """""" 213 | 214 | base_vocab = BaseVocab() 215 | print('BaseVocab passes') 216 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/index_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import sys 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser import Configurable 31 | 32 | __all__ = ['DepVocab', 'HeadVocab'] 33 | 34 | #*************************************************************** 35 | class IndexVocab(Configurable): 36 | """""" 37 | 38 | ROOT = 0 39 | 40 | #============================================================= 41 | def __init__(self, *args, **kwargs): 42 | """""" 43 | 44 | super(IndexVocab, self).__init__(*args, **kwargs) 45 | self.placeholder = None 46 | 47 | #============================================================= 48 | def generate_placeholder(self): 49 | """""" 50 | 51 | if self.placeholder is None: 52 | self.placeholder = tf.placeholder(tf.int32, shape=[None, None], name=self.name) 53 | return self.placeholder 54 | 55 | #============================================================= 56 | def set_feed_dict(self, data, feed_dict): 57 | """""" 58 | 59 | feed_dict[self.placeholder] = data 60 | return 61 | 62 | #============================================================= 63 | def setup(self): 64 | self.placeholder = None 65 | return 66 | 67 | #============================================================= 68 | def index(self, token): 69 | return 0 if token == '_' else int(token) 70 | 71 | #============================================================= 72 | @property 73 | def depth(self): 74 | return None 75 | @property 76 | def conll_idx(self): 77 | return self._conll_idx 78 | 79 | #============================================================= 80 | def __getitem__(self, key): 81 | if isinstance(key, basestring): 82 | return int(key) 83 | elif isinstance(key, (int, long, np.int32, np.int64)): 84 | return str(key) 85 | elif hasattr(key, '__iter__'): 86 | return [self[k] for k in key] 87 | else: 88 | raise ValueError('key to BaseVocab.__getitem__ must be (iterable of) string or integer') 89 | return 90 | 91 | #*************************************************************** 92 | class DepVocab(IndexVocab): 93 | _conll_idx = 0 94 | class HeadVocab(IndexVocab): 95 | _conll_idx = 6 96 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/multivocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import codecs 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser import Configurable 31 | from parser.neural import linalg 32 | from parser.vocabs import TokenVocab, SubtokenVocab 33 | 34 | __all__ = ['Multivocab'] 35 | 36 | #*************************************************************** 37 | class Multivocab(Configurable): 38 | """""" 39 | 40 | #============================================================= 41 | def __init__(self, vocabs, *args, **kwargs): 42 | """""" 43 | 44 | super(Multivocab, self).__init__(*args, **kwargs) 45 | 46 | self._vocabs = vocabs 47 | self._set_special_tokens() 48 | # NOTE Don't forget to run index_tokens() after adding test/validation files! 49 | self.placeholder = None 50 | return 51 | 52 | #============================================================= 53 | def __call__(self, placeholder=None, moving_params=None): 54 | """""" 55 | # TODO check to see if a word is all unk, and if so, replace it with a random vector 56 | 57 | embeddings = [vocab(moving_params=moving_params) for vocab in self] 58 | return tf.add_n(embeddings) 59 | 60 | #============================================================= 61 | def setup(self): 62 | """""" 63 | 64 | self.placeholder = None 65 | for vocab in self: 66 | vocab.setup() 67 | return 68 | 69 | #============================================================= 70 | def generate_placeholder(self): 71 | """""" 72 | 73 | if self.placeholder is None: 74 | self.placeholder = tf.stack([vocab.generate_placeholder() for vocab in self], axis=2) 75 | return self.placeholder 76 | 77 | #============================================================= 78 | def _set_special_tokens(self): 79 | pattern = re.compile('\W+', re.UNICODE) 80 | self._special_tokens = zip(*[vocab.special_tokens for vocab in self]) 81 | for i, token in enumerate(self.special_tokens): 82 | n = len(token) 83 | assert len(set(token)) == 1 84 | token = token[0] 85 | token = token.lstrip('<') 86 | token = token.rstrip('>') 87 | token = token.upper() 88 | token = pattern.sub('', token) 89 | assert token not in self.__dict__ 90 | self.__dict__[token] = tuple(i for _ in xrange(n)) 91 | return 92 | 93 | #============================================================= 94 | def add_files(self, conll_files): 95 | """""" 96 | 97 | conll_files = list(conll_files) 98 | token_vocabs = [] 99 | for vocab in self: 100 | if hasattr(vocab, 'token_vocab'): 101 | if vocab.token_vocab not in token_vocabs: 102 | vocab.token_vocab.count(conll_files) 103 | token_vocabs.append(vocab.token_vocab) 104 | return 105 | 106 | #============================================================= 107 | def index_tokens(self): 108 | """""" 109 | 110 | for vocab in self: 111 | if hasattr(vocab, 'index_tokens'): 112 | vocab.index_tokens() 113 | return 114 | 115 | #============================================================= 116 | def set_feed_dict(self, data, feed_dict): 117 | """""" 118 | 119 | for i, vocab in enumerate(self): 120 | vocab.set_feed_dict(data[:,:,i], feed_dict) 121 | return 122 | 123 | #============================================================= 124 | def index(self, token): 125 | return tuple(vocab.index(token) for vocab in self) 126 | 127 | #============================================================= 128 | @property 129 | def depth(self): 130 | return len(self) 131 | 
@property 132 | def special_tokens(self): 133 | return self._special_tokens 134 | @property 135 | def conll_idx(self): 136 | return self._conll_idx 137 | 138 | #============================================================= 139 | def __iter__(self): 140 | return (vocab for vocab in self._vocabs) 141 | def __getitem__(self, key): 142 | return self._vocabs[key] 143 | def __len__(self): 144 | return len(self._vocabs) 145 | def __setattr__(self, key, value): 146 | if key == '_vocabs': 147 | conll_idxs = set([vocab.conll_idx for vocab in value if hasattr(vocab, 'conll_idx')]) 148 | assert len(conll_idxs) == 1 149 | self._conll_idx = list(conll_idxs)[0] 150 | super(Multivocab, self).__setattr__(key, value) 151 | 152 | #*************************************************************** 153 | if __name__ == '__main__': 154 | """""" 155 | 156 | from parser.vocabs import PretrainedVocab, WordVocab, CharVocab, Multivocab 157 | 158 | configurable = Configurable() 159 | token_vocab = WordVocab.from_configurable(configurable) 160 | pretrained_vocab = PretrainedVocab.from_vocab(token_vocab) 161 | subtoken_vocab = CharVocab.from_vocab(token_vocab) 162 | multivocab = Multivocab.from_configurable(configurable, [pretrained_vocab, token_vocab, subtoken_vocab]) 163 | multivocab.add_files(configurable.valid_files) 164 | multivocab.index_tokens() 165 | print("Indices for '': %s" % str(multivocab.index(''))) 166 | print("Indices for 'the': %s" % str(multivocab.index('the'))) 167 | print("Indices for 'The': %s" % str(multivocab.index('The'))) 168 | print('Multivocab passes') 169 | 170 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/ngram_multivocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import sys 24 | import codecs 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser import Configurable, Multibucket 31 | from parser.vocabs.base_vocab import BaseVocab 32 | from parser.vocabs import SubtokenVocab, NgramVocab, Multivocab 33 | from parser.misc.bucketer import Bucketer 34 | 35 | __all__ = ['NgramMultivocab'] 36 | 37 | #*************************************************************** 38 | class NgramMultivocab(Multivocab, SubtokenVocab): 39 | """""" 40 | 41 | #============================================================= 42 | def __init__(self, token_vocab, *args, **kwargs): 43 | """""" 44 | 45 | super(BaseVocab, self).__init__(*args, **kwargs) 46 | self._cased = super(BaseVocab, self).cased 47 | 48 | SubtokenVocab.__setattr__(self, '_token_vocab', token_vocab) 49 | self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name) 50 | self._vocabs = [NgramVocab.from_vocab(self.token_vocab, i+1, cased=self.cased) for i in xrange(self.max_n)] 51 | self._special_tokens = super(BaseVocab, self).special_tokens 52 | self._special_tokens_set = set(self._special_tokens) 53 | SubtokenVocab._set_special_tokens(self) 54 | self._tok2idx = {} 55 | 56 | for vocab in self: 57 | assert vocab.token_vocab is self.token_vocab 58 | return 59 | 60 | #============================================================= 61 | def add_files(self, conll_files): 62 | """""" 63 | 64 | self.token_vocab.count(conll_files) 65 | return 66 | 67 | #============================================================= 68 | def index_tokens(self): 69 | """""" 70 | 71 | n_buckets = self.n_buckets 72 | tok2idxs = {token: [vocab.subtoken_indices(token) for vocab in self] for token in self.token_vocab.counts} 73 | with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s'%self.name) as bucketer: 74 | splits = bucketer.compute_splits(len(indices[0]) for indices in tok2idxs.values()) 75 | bucketer.plot() 76 | with self.multibucket.open(splits, depth=len(self)): 77 | for index, special_token in enumerate(self.special_tokens): 78 | self.tok2idx[special_token] = self.multibucket.add([[index]*len(self)]) 79 | for token, _ in self.sorted_counts(self.token_vocab.counts): 80 | indices = tok2idxs[token] 81 | sequence = [[indices[i][j] for i in xrange(len(indices)) if j < len(indices[i])] for j in xrange(len(indices[0]))] 82 | self.tok2idx[token] = self.multibucket.add(sequence) 83 | return 84 | 85 | #============================================================= 86 | def __call__(self, placeholder, keep_prob=None, moving_params=None): 87 | return SubtokenVocab.__call__(self, placeholder, keep_prob=keep_prob, moving_params=moving_params) 88 | 89 | def index(self, token): 90 | return SubtokenVocab.index(self, token) 91 | 92 | def generate_placeholder(self): 93 | return SubtokenVocab.generate_placeholder(self) 94 | 95 | #============================================================= 96 | def embedding_lookup(self, placeholders, embed_keep_prob=None, moving_params=None): 97 | """""" 98 | 99 | if moving_params is None: 100 | shape = tf.shape(placeholders) 101 | shape = tf.stack([shape[0], 1, shape[2]]) 102 | placeholders = la.random_where(embed_keep_prob, placeholders, self.UNK, shape=shape) 103 | embeddings = [vocab.embedding_lookup(placeholders[:,:,i], embed_keep_prob=1, 
moving_params=moving_params) for i, vocab in enumerate(self)] 104 | return tf.stack(embeddings, axis=2) 105 | 106 | #============================================================= 107 | def __iter__(self): 108 | return (vocab for vocab in self._vocabs) 109 | def __getitem__(self, key): 110 | return self._vocabs[key] 111 | def __len__(self): 112 | return len(self._vocabs) 113 | 114 | #*************************************************************** 115 | if __name__ == '__main__': 116 | """""" 117 | 118 | from parser import Configurable 119 | from parser.vocabs import WordVocab, NgramMultivocab 120 | 121 | configurable = Configurable() 122 | token_vocab = WordVocab.from_configurable(configurable) 123 | ngram_multivocab = NgramMultivocab.from_vocab(token_vocab) 124 | ngram_multivocab.add_files(configurable.valid_files) 125 | ngram_multivocab.index_tokens() 126 | print("Indices for '': %s" % str(ngram_multivocab.index(''))) 127 | print("Indices for 'the': %s" % str(ngram_multivocab.index('the'))) 128 | print("Indices for 'The': %s" % str(ngram_multivocab.index('The'))) 129 | print('NgramMultivocab passes') 130 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/ngram_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import codecs 24 | from collections import Counter 25 | 26 | import numpy as np 27 | import tensorflow as tf 28 | 29 | from parser.vocabs import TokenVocab, SubtokenVocab, CharVocab 30 | from parser import Multibucket 31 | 32 | __all__ = ['NgramVocab'] 33 | 34 | #*************************************************************** 35 | class NgramVocab(SubtokenVocab): 36 | """""" 37 | 38 | #============================================================= 39 | def __init__(self, n, token_vocab, *args, **kwargs): 40 | """""" 41 | 42 | recount = kwargs.pop('recount', False) 43 | initialize_zero = kwargs.pop('initialize_zero', False) 44 | super(TokenVocab, self).__init__(*args, **kwargs) 45 | 46 | self._n = n 47 | self._token_vocab = token_vocab 48 | self._token_counts = Counter() 49 | self._subtoken_vocab = CharVocab.from_vocab(self.token_vocab) 50 | self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name) 51 | 52 | if recount: 53 | self.count() 54 | else: 55 | if os.path.isfile(self.filename): 56 | self.load() 57 | else: 58 | self.count() 59 | self.dump() 60 | self.index_vocab() 61 | 62 | embed_dims = [len(self), self.embed_size] 63 | if initialize_zero: 64 | self.embeddings = np.zeros(embed_dims) 65 | else: 66 | self.embeddings = np.random.randn(*embed_dims) 67 | return 68 | 69 | #============================================================= 70 | def count(self): 71 | """""" 72 | 73 | special_tokens = set(self.token_vocab.special_tokens) 74 | for token in self.token_vocab: 75 | if token not in special_tokens: 76 | idxs = self.subtoken_vocab.subtoken_indices(token) 77 | idxs = [self.subtoken_vocab.START] + idxs + [self.subtoken_vocab.STOP] 78 | if len(idxs) > self.n: 79 | for i in xrange(len(idxs) - self.n): 80 | subtoken = ''.join(self.subtoken_vocab[idxs[i:i+self.n]]) 81 | self.counts[subtoken] += 1 82 | self.token_counts[subtoken] += self.token_vocab.counts[token] 83 | return 84 | 85 | #============================================================= 86 | def subtoken_indices(self, token): 87 | """""" 88 | 89 | idxs = self.subtoken_vocab.subtoken_indices(token) 90 | idxs = [self.subtoken_vocab.START] + idxs + [self.subtoken_vocab.STOP] 91 | if len(idxs) <= self.n: 92 | return [self.PAD] 93 | else: 94 | subtokens = [] 95 | for i in xrange(len(idxs) - self.n): 96 | subtokens.append(''.join(self.subtoken_vocab[idxs[i:i+self.n]])) 97 | return self[subtokens] 98 | 99 | #============================================================= 100 | @property 101 | def n(self): 102 | return self._n 103 | @property 104 | def subtoken_vocab(self): 105 | return self._subtoken_vocab 106 | @property 107 | def name(self): 108 | return '%d-%s' % (self.n, super(NgramVocab, self).name) 109 | 110 | #============================================================= 111 | def __setattr__(self, name, value): 112 | if name == '_subtoken_vocab': 113 | self._conll_idx = value.conll_idx 114 | if self.cased is None: 115 | self._cased = value.cased 116 | elif self.cased != value.cased: 117 | cls = value.__class__ 118 | value = cls.from_configurable(value, value.token_vocab, 119 | cased=self.cased, 120 | recount=True) 121 | super(NgramVocab, self).__setattr__(name, value) 122 | return 123 | 124 | #*************************************************************** 125 | if __name__ == '__main__': 126 | """""" 127 | 128 | from parser import 
Configurable 129 | from parser.vocabs import WordVocab, CharVocab, NgramVocab 130 | 131 | configurable = Configurable() 132 | token_vocab = WordVocab.from_configurable(configurable, 1) 133 | if os.path.isfile('saves/defaults/2-ngrams.txt'): 134 | os.remove('saves/defaults/2-ngrams.txt') 135 | ngram_vocab = NgramVocab.from_vocab(token_vocab, 2) 136 | ngram_vocab = NgramVocab.from_vocab(token_vocab, 2) 137 | ngram_vocab.token_vocab.count(conll_files = configurable.valid_files) 138 | ngram_vocab.index_tokens() 139 | ngram_vocab.fit_to_zipf() 140 | print('NgramVocab passes') -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/pretrained_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import codecs 24 | import gzip 25 | import warnings 26 | try: 27 | from backports import lzma 28 | except: 29 | warnings.warn('Install backports.lzma for xz support') 30 | from collections import Counter 31 | 32 | import numpy as np 33 | import tensorflow as tf 34 | 35 | import parser.neural.linalg as linalg 36 | from parser.vocabs.base_vocab import BaseVocab 37 | 38 | #*************************************************************** 39 | class PretrainedVocab(BaseVocab): 40 | """""" 41 | 42 | #============================================================= 43 | def __init__(self, token_vocab, *args, **kwargs): 44 | """""" 45 | 46 | super(PretrainedVocab, self).__init__(*args, **kwargs) 47 | 48 | self._token_vocab = token_vocab 49 | 50 | self.load() 51 | self.count() 52 | return 53 | 54 | #============================================================= 55 | def __call__(self, placeholder=None, moving_params=None): 56 | """""" 57 | 58 | embeddings = super(PretrainedVocab, self).__call__(placeholder, moving_params=moving_params) 59 | # (n x b x d') -> (n x b x d) 60 | with tf.variable_scope(self.name.title()): 61 | matrix = linalg.linear(embeddings, self.token_embed_size, moving_params=moving_params) 62 | if moving_params is None: 63 | with tf.variable_scope('Linear', reuse=True): 64 | weights = tf.get_variable('Weights') 65 | tf.losses.add_loss(tf.nn.l2_loss(tf.matmul(tf.transpose(weights), weights) - tf.eye(self.token_embed_size))) 66 | return matrix 67 | #return embeddings # changed in saves2/test8 68 | 69 | #============================================================= 70 | def setup(self): 71 | """""" 72 | 73 | self.placeholder = None 74 | with tf.device('/cpu:0'): 75 | with tf.variable_scope(self.name.title()): 76 | self._embeddings = tf.Variable(self._embeddings_array, name='Embeddings', dtype=tf.float32, trainable=False) 77 | return 78 | 79 | 
#============================================================= 80 | def load(self): 81 | """""" 82 | 83 | embeddings = [] 84 | cur_idx = len(self.special_tokens) 85 | max_rank = self.max_rank 86 | if self.filename.endswith('.xz'): 87 | open_func = lzma.open 88 | else: 89 | open_func = codecs.open 90 | with open_func(self.filename, 'rb') as f: 91 | reader = codecs.getreader('utf-8')(f, errors='ignore') 92 | if self.skip_header == True: 93 | reader.readline() 94 | for line_num, line in enumerate(reader): 95 | if (not max_rank) or line_num < max_rank: 96 | line = line.rstrip().split(' ') 97 | if len(line) > 1: 98 | embeddings.append(np.array(line[1:], dtype=np.float32)) 99 | self[line[0]] = cur_idx 100 | cur_idx += 1 101 | else: 102 | break 103 | try: 104 | embeddings = np.stack(embeddings) 105 | embeddings = np.pad(embeddings, ( (len(self.special_tokens),0), (0,0) ), 'constant') 106 | self._embeddings_array = np.stack(embeddings) 107 | self._embed_size = self._embeddings_array.shape[1] 108 | except: 109 | shapes = set([embedding.shape for embedding in embeddings]) 110 | raise ValueError("Couldn't stack embeddings with shapes in %s" % shapes) 111 | return 112 | 113 | #============================================================= 114 | def count(self): 115 | """""" 116 | 117 | if self.token_vocab is not None: 118 | zipf = self.token_vocab.fit_to_zipf(plot=False) 119 | zipf_freqs = zipf.predict(np.arange(len(self))+1) 120 | else: 121 | zipf_freqs = -np.log(np.arange(len(self))+1) 122 | zipf_counts = zipf_freqs / np.min(zipf_freqs) 123 | for count, token in zip(zipf_counts, self.strings()): 124 | self.counts[token] = int(count) 125 | return 126 | 127 | #============================================================= 128 | @property 129 | def token_vocab(self): 130 | return self._token_vocab 131 | @property 132 | def token_embed_size(self): 133 | return (self.token_vocab or self).embed_size 134 | @property 135 | def embeddings(self): 136 | return super(PretrainedVocab, self).embeddings 137 | #@embeddings.setter 138 | #def embeddings(self, matrix): 139 | # self._embed_size = matrix.shape[1] 140 | # with tf.device('/cpu:0'): 141 | # with tf.variable_scope(self.name.title()): 142 | # self._embeddings = tf.Variable(matrix, name='Embeddings', trainable=False) 143 | # return 144 | 145 | #*************************************************************** 146 | if __name__ == '__main__': 147 | """""" 148 | 149 | pretrained_vocab = PretrainedVocab(None) 150 | print('PretrainedVocab passes') 151 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/token_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import codecs 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser.vocabs.base_vocab import BaseVocab 31 | from parser.misc.zipf import Zipf 32 | 33 | __all__ = ['WordVocab', 'LemmaVocab', 'TagVocab', 'XTagVocab', 'RelVocab'] 34 | 35 | #*************************************************************** 36 | class TokenVocab(BaseVocab): 37 | """""" 38 | 39 | #============================================================= 40 | def __init__(self, *args, **kwargs): 41 | """""" 42 | 43 | recount = kwargs.pop('recount', False) 44 | initialize_zero = kwargs.pop('initialize_zero', True) 45 | super(TokenVocab, self).__init__(*args, **kwargs) 46 | 47 | if recount: 48 | self.count() 49 | else: 50 | if os.path.isfile(self.filename): 51 | self.load() 52 | else: 53 | self.count() 54 | self.dump() 55 | self.index_vocab() 56 | 57 | embed_dims = [len(self), self.embed_size] 58 | if initialize_zero: 59 | self._embeddings_array = np.zeros(embed_dims) 60 | else: 61 | self._embeddings_array = np.random.randn(*embed_dims) 62 | return 63 | 64 | #============================================================= 65 | def setup(self): 66 | """""" 67 | 68 | self.placeholder = None 69 | del self._embeddings 70 | with tf.device('/cpu:0'): 71 | with tf.variable_scope(self.name.title()): 72 | self._embeddings = tf.Variable(self._embeddings_array, name='Embeddings', dtype=tf.float32, trainable=True) 73 | return 74 | 75 | 76 | #============================================================= 77 | def count(self, conll_files=None): 78 | """""" 79 | 80 | if conll_files is None: 81 | conll_files = self.train_files 82 | 83 | for conll_file in conll_files: 84 | with codecs.open(conll_file, encoding='utf-8', errors='ignore') as f: 85 | for line_num, line in enumerate(f): 86 | try: 87 | line = line.strip() 88 | if line and not line.startswith('#'): 89 | line = line.split('\t') 90 | assert len(line) == 10 91 | token = line[self.conll_idx] 92 | if not self.cased: 93 | token = token.lower() 94 | self.counts[token] += 1 95 | except: 96 | raise ValueError('File %s is misformatted at line %d' % (conll_file, line_num+1)) 97 | return 98 | 99 | #============================================================= 100 | def load(self): 101 | """""" 102 | 103 | with codecs.open(self.filename, encoding='utf-8') as f: 104 | for line_num, line in enumerate(f): 105 | try: 106 | line = line.rstrip() 107 | if line: 108 | line = line.split('\t') 109 | token, count = line 110 | self.counts[token] = int(count) 111 | except: 112 | raise ValueError('File %s is misformatted at line %d' % (train_file, line_num+1)) 113 | return 114 | 115 | #============================================================= 116 | def dump(self): 117 | """""" 118 | 119 | with codecs.open(self.filename, 'w', encoding='utf-8') as f: 120 | for word, count in self.sorted_counts(self.counts): 121 | f.write('%s\t%d\n' % (word, count)) 122 | return 123 | 124 | #============================================================= 125 | def index_vocab(self): 126 | """""" 127 | 128 | for token, count in self.sorted_counts(self.counts): 129 | if ((count >= self.min_occur_count) and 130 | token not in self and 131 | (not self.max_rank or len(self) < self.max_rank)): 132 | self[token] = len(self) 133 | return 134 | 135 | #============================================================= 
136 | def fit_to_zipf(self, plot=True): 137 | """""" 138 | 139 | zipf = Zipf.from_configurable(self, self.counts, name='zipf-%s'%self.name) 140 | if plot: 141 | zipf.plot() 142 | return zipf 143 | 144 | #============================================================= 145 | @staticmethod 146 | def sorted_counts(counts): 147 | return sorted(counts.most_common(), key=lambda x: (-x[1], x[0])) 148 | 149 | #============================================================= 150 | @property 151 | def conll_idx(self): 152 | return self._conll_idx 153 | 154 | #*************************************************************** 155 | class WordVocab(TokenVocab): 156 | _conll_idx = 1 157 | class LemmaVocab(WordVocab): 158 | _conll_idx = 2 159 | class TagVocab(TokenVocab): 160 | _conll_idx = 3 161 | class XTagVocab(TagVocab): 162 | _conll_idx = 4 163 | class RelVocab(TokenVocab): 164 | _conll_idx = 7 165 | 166 | #*************************************************************** 167 | if __name__ == '__main__': 168 | """""" 169 | 170 | from parser import Configurable 171 | from parser.vocabs import PretrainedVocab, TokenVocab, WordVocab 172 | 173 | configurable = Configurable() 174 | if os.path.isfile('saves/defaults/words.txt'): 175 | os.remove('saves/defaults/words.txt') 176 | token_vocab = WordVocab.from_configurable(configurable, 1) 177 | token_vocab = WordVocab.from_configurable(configurable, 1) 178 | token_vocab.fit_to_zipf() 179 | #pretrained_vocab = PretrainedVocab.from_vocab(token_vocab) 180 | #assert min(pretrained_vocab.counts.values()) == 1 181 | print('TokenVocab passed') 182 | -------------------------------------------------------------------------------- /convert_NLP4J_to_CoNLL.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import sys 4 | 5 | #Convert NLP4J's 9-column output into CoNLL's 10-column format 6 | def convert(inputFile): 7 | writer = open(inputFile + ".conll", "w") 8 | for line in open(inputFile, "r").readlines(): 9 | eles = line.strip().split() 10 | if len(eles) == 0: 11 | writer.write("\n") 12 | else: 13 | eles[4] = "_" 14 | eles.insert(4, eles[3]) 15 | eles[8] = "_" 16 | eles[9] = "_" 17 | writer.write("\t".join(eles) + "\n") 18 | 19 | writer.close() 20 | 21 | if __name__ == "__main__": 22 | convert(sys.argv[1]) -------------------------------------------------------------------------------- /data/sentence_segmented.txt: -------------------------------------------------------------------------------- 1 | Induction of tyrosine phosphorylation and T-cell activation by vanadate peroxide, an inhibitor of protein tyrosine phosphatases. 2 | Rapid tyrosine phosphorylation of key cellular proteins is a crucial event in the transduction of activation signals to T-lymphocytes. 3 | The regulatory role of protein tyrosine phosphatases (PTPases) in this process was explored by studying the effects of a powerful PTPase inhibitor, vanadate peroxide (pervanadate), on the activation cascade of Jurkat human leukaemic T-cells. -------------------------------------------------------------------------------- /data/tokenized_sentence_segmented.txt: -------------------------------------------------------------------------------- 1 | Induction of tyrosine phosphorylation and T-cell activation by vanadate peroxide , an inhibitor of protein tyrosine phosphatases . 2 | Rapid tyrosine phosphorylation of key cellular proteins is a crucial event in the transduction of activation signals to T-lymphocytes . 
3 | The regulatory role of protein tyrosine phosphatases ( PTPases ) in this process was explored by studying the effects of a powerful PTPase inhibitor , vanadate peroxide ( pervanadate ) , on the activation cascade of Jurkat human leukaemic T-cells . -------------------------------------------------------------------------------- /data/tokenized_sentence_segmented.txt.column: -------------------------------------------------------------------------------- 1 | 1 Induction _ _ _ _ _ _ _ _ 2 | 2 of _ _ _ _ _ _ _ _ 3 | 3 tyrosine _ _ _ _ _ _ _ _ 4 | 4 phosphorylation _ _ _ _ _ _ _ _ 5 | 5 and _ _ _ _ _ _ _ _ 6 | 6 T-cell _ _ _ _ _ _ _ _ 7 | 7 activation _ _ _ _ _ _ _ _ 8 | 8 by _ _ _ _ _ _ _ _ 9 | 9 vanadate _ _ _ _ _ _ _ _ 10 | 10 peroxide _ _ _ _ _ _ _ _ 11 | 11 , _ _ _ _ _ _ _ _ 12 | 12 an _ _ _ _ _ _ _ _ 13 | 13 inhibitor _ _ _ _ _ _ _ _ 14 | 14 of _ _ _ _ _ _ _ _ 15 | 15 protein _ _ _ _ _ _ _ _ 16 | 16 tyrosine _ _ _ _ _ _ _ _ 17 | 17 phosphatases _ _ _ _ _ _ _ _ 18 | 18 . _ _ _ _ _ _ _ _ 19 | 20 | 1 Rapid _ _ _ _ _ _ _ _ 21 | 2 tyrosine _ _ _ _ _ _ _ _ 22 | 3 phosphorylation _ _ _ _ _ _ _ _ 23 | 4 of _ _ _ _ _ _ _ _ 24 | 5 key _ _ _ _ _ _ _ _ 25 | 6 cellular _ _ _ _ _ _ _ _ 26 | 7 proteins _ _ _ _ _ _ _ _ 27 | 8 is _ _ _ _ _ _ _ _ 28 | 9 a _ _ _ _ _ _ _ _ 29 | 10 crucial _ _ _ _ _ _ _ _ 30 | 11 event _ _ _ _ _ _ _ _ 31 | 12 in _ _ _ _ _ _ _ _ 32 | 13 the _ _ _ _ _ _ _ _ 33 | 14 transduction _ _ _ _ _ _ _ _ 34 | 15 of _ _ _ _ _ _ _ _ 35 | 16 activation _ _ _ _ _ _ _ _ 36 | 17 signals _ _ _ _ _ _ _ _ 37 | 18 to _ _ _ _ _ _ _ _ 38 | 19 T-lymphocytes _ _ _ _ _ _ _ _ 39 | 20 . _ _ _ _ _ _ _ _ 40 | 41 | 1 The _ _ _ _ _ _ _ _ 42 | 2 regulatory _ _ _ _ _ _ _ _ 43 | 3 role _ _ _ _ _ _ _ _ 44 | 4 of _ _ _ _ _ _ _ _ 45 | 5 protein _ _ _ _ _ _ _ _ 46 | 6 tyrosine _ _ _ _ _ _ _ _ 47 | 7 phosphatases _ _ _ _ _ _ _ _ 48 | 8 ( _ _ _ _ _ _ _ _ 49 | 9 PTPases _ _ _ _ _ _ _ _ 50 | 10 ) _ _ _ _ _ _ _ _ 51 | 11 in _ _ _ _ _ _ _ _ 52 | 12 this _ _ _ _ _ _ _ _ 53 | 13 process _ _ _ _ _ _ _ _ 54 | 14 was _ _ _ _ _ _ _ _ 55 | 15 explored _ _ _ _ _ _ _ _ 56 | 16 by _ _ _ _ _ _ _ _ 57 | 17 studying _ _ _ _ _ _ _ _ 58 | 18 the _ _ _ _ _ _ _ _ 59 | 19 effects _ _ _ _ _ _ _ _ 60 | 20 of _ _ _ _ _ _ _ _ 61 | 21 a _ _ _ _ _ _ _ _ 62 | 22 powerful _ _ _ _ _ _ _ _ 63 | 23 PTPase _ _ _ _ _ _ _ _ 64 | 24 inhibitor _ _ _ _ _ _ _ _ 65 | 25 , _ _ _ _ _ _ _ _ 66 | 26 vanadate _ _ _ _ _ _ _ _ 67 | 27 peroxide _ _ _ _ _ _ _ _ 68 | 28 ( _ _ _ _ _ _ _ _ 69 | 29 pervanadate _ _ _ _ _ _ _ _ 70 | 30 ) _ _ _ _ _ _ _ _ 71 | 31 , _ _ _ _ _ _ _ _ 72 | 32 on _ _ _ _ _ _ _ _ 73 | 33 the _ _ _ _ _ _ _ _ 74 | 34 activation _ _ _ _ _ _ _ _ 75 | 35 cascade _ _ _ _ _ _ _ _ 76 | 36 of _ _ _ _ _ _ _ _ 77 | 37 Jurkat _ _ _ _ _ _ _ _ 78 | 38 human _ _ _ _ _ _ _ _ 79 | 39 leukaemic _ _ _ _ _ _ _ _ 80 | 40 T-cells _ _ _ _ _ _ _ _ 81 | 41 . 
_ _ _ _ _ _ _ _ 82 | 83 | -------------------------------------------------------------------------------- /get_ColumnFormat.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import sys 4 | 5 | #Convert word-segmented corpus into 10-column format for dependency parsing 6 | def convert(inputFilePath): 7 | writer = open(inputFilePath + ".column", "w") 8 | lines = open(inputFilePath, "r").readlines() 9 | for line in lines: 10 | tok = line.strip().split() 11 | if not tok or line.strip() == '': 12 | writer.write("\n") 13 | else: 14 | count = 0 15 | for word in tok: 16 | count += 1 17 | writer.write(str(count) + "\t" + word + "\t" + '\t'.join(['_'] * 8) + "\n") 18 | writer.write("\n") 19 | writer.close() 20 | 21 | if __name__ == "__main__": 22 | convert(sys.argv[1]) 23 | pass -------------------------------------------------------------------------------- /jPTDP-v1/README.md: -------------------------------------------------------------------------------- 1 | jPTDP: Neural network models for joint POS tagging and dependency parsing 2 | 3 | See [https://github.com/datquocnguyen/jPTDP](https://github.com/datquocnguyen/jPTDP) for more details. --------------------------------------------------------------------------------
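
The two root-level helpers dumped above, `get_ColumnFormat.py` and `convert_NLP4J_to_CoNLL.py`, are normally invoked from the command line with a single file argument. As a minimal sketch — assuming the scripts are importable from the repository root, and using `output.nlp4j` purely as a hypothetical stand-in for an NLP4J output file — they could also be chained from Python like this:

```python
# Hypothetical usage sketch (not part of the repository). Assumes it is run from
# the repo root so both helper modules are importable; "output.nlp4j" is a
# placeholder name for whatever 9-column file NLP4J actually writes.
from get_ColumnFormat import convert as to_column_format
from convert_NLP4J_to_CoNLL import convert as nlp4j_to_conll

# Tokenized, sentence-segmented text -> empty 10-column parser input
# (writes data/tokenized_sentence_segmented.txt.column next to the input).
to_column_format('data/tokenized_sentence_segmented.txt')

# NLP4J's 9-column output -> CoNLL 10-column format
# (writes output.nlp4j.conll next to the hypothetical input file).
nlp4j_to_conll('output.nlp4j')
```

Running the scripts directly (e.g. `python get_ColumnFormat.py <file>`) has the same effect, since each script's `__main__` block simply calls its `convert()` function on `sys.argv[1]`.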