├── NLP4J ├── bin │ ├── nlpdecode │ ├── nlpdecode.bat │ ├── version │ └── version.bat ├── config-CRAFT.xml ├── config-GENIA.xml ├── etc │ └── log4j.properties ├── lexica │ ├── en-ambiguity-classes-simplified-lowercase.xz │ └── en-brown-clusters-simplified-lowercase.xz ├── models │ ├── CRAFT.DEP.model.xz │ ├── CRAFT.POS.model.xz │ ├── GENIA.DEP.model.xz │ └── GENIA.POS.model.xz └── repo │ ├── args4j │ └── args4j │ │ └── 2.32 │ │ └── args4j-2.32.jar │ ├── edu │ └── emory │ │ └── mathcs │ │ └── nlp │ │ ├── nlp4j-api │ │ └── 1.1.4-SNAPSHOT │ │ │ └── nlp4j-api-1.1.4-SNAPSHOT.jar │ │ └── nlp4j-cli │ │ └── 1.1.4-SNAPSHOT │ │ └── nlp4j-cli-1.1.4-SNAPSHOT.jar │ ├── it │ └── unimi │ │ └── dsi │ │ └── fastutil │ │ └── 7.0.12 │ │ └── fastutil-7.0.12.jar │ ├── log4j │ └── log4j │ │ └── 1.2.17 │ │ └── log4j-1.2.17.jar │ └── org │ ├── apache │ └── commons │ │ ├── commons-csv │ │ └── 1.2 │ │ │ └── commons-csv-1.2.jar │ │ └── commons-math3 │ │ └── 3.5 │ │ └── commons-math3-3.5.jar │ ├── magicwerk │ └── brownies-collections │ │ └── 0.9.13 │ │ └── brownies-collections-0.9.13.jar │ ├── slf4j │ ├── slf4j-api │ │ └── 1.7.21 │ │ │ └── slf4j-api-1.7.21.jar │ └── slf4j-log4j12 │ │ └── 1.7.21 │ │ └── slf4j-log4j12-1.7.21.jar │ └── tukaani │ └── xz │ └── 1.5 │ └── xz-1.5.jar ├── README.md ├── StanfordBiaffineParser-v2 ├── config │ ├── CRAFT.cfg │ ├── GENIA.cfg │ ├── defaults.cfg │ └── template.cfg ├── main.py └── parser │ ├── __init__.py │ ├── bucket.py │ ├── configurable.py │ ├── dataset.py │ ├── misc │ ├── __init__.py │ ├── bucketer.py │ ├── colors.py │ ├── get_encoding.py │ ├── mst.py │ └── zipf.py │ ├── multibucket.py │ ├── network.py │ ├── neural │ ├── __init__.py │ ├── functions.py │ ├── linalg.py │ ├── models │ │ ├── __init__.py │ │ ├── embeds │ │ │ ├── __init__.py │ │ │ ├── base_embed.py │ │ │ ├── cnn_embed.py │ │ │ ├── mlp_embed.py │ │ │ └── rnn_embed.py │ │ ├── nlp │ │ │ ├── __init__.py │ │ │ ├── parsers │ │ │ │ ├── __init__.py │ │ │ │ ├── base_parser.py │ │ │ │ ├── bin_parser.py │ │ │ │ ├── fish_parser.py │ │ │ │ ├── gama_parser.py │ │ │ │ ├── parser.py │ │ │ │ └── xbar_parser.py │ │ │ └── taggers │ │ │ │ ├── __init__.py │ │ │ │ ├── base_tagger.py │ │ │ │ ├── base_xtagger.py │ │ │ │ ├── tagger.py │ │ │ │ └── xtagger.py │ │ └── nn.py │ ├── optimizers │ │ ├── __init__.py │ │ ├── base_optimizer.py │ │ ├── radam_optimizer.py │ │ └── sgd_optimizer.py │ ├── recur_cells │ │ ├── .directory │ │ ├── __init__.py │ │ ├── base_cell.py │ │ ├── cif_lstm_cell.py │ │ ├── gru_cell.py │ │ ├── lstm_cell.py │ │ └── rnn_cell.py │ └── rnn.py │ ├── scripts │ ├── compression_ratio.py │ ├── count_nonprojective.py │ ├── heaps_law.py │ └── reinsert_compounds.py │ ├── trash │ ├── retrained_vocab.py │ └── weighted_mean.py │ └── vocabs │ ├── __init__.py │ ├── base_vocab.py │ ├── index_vocab.py │ ├── multivocab.py │ ├── ngram_multivocab.py │ ├── ngram_vocab.py │ ├── pretrained_vocab.py │ ├── subtoken_vocab.py │ └── token_vocab.py ├── convert_NLP4J_to_CoNLL.py ├── data ├── raw.txt ├── sentence_segmented.txt ├── tokenized_sentence_segmented.txt └── tokenized_sentence_segmented.txt.column ├── get_ColumnFormat.py └── jPTDP-v1 └── README.md /NLP4J/bin/nlpdecode: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 
1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar:"$REPO"/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar:"$REPO"/org/tukaani/xz/1.5/xz-1.5.jar:"$REPO"/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar:"$REPO"/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar:"$REPO"/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar:"$REPO"/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar:"$REPO"/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar:"$REPO"/log4j/log4j/1.2.17/log4j-1.2.17.jar:"$REPO"/args4j/args4j/2.32/args4j-2.32.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xmx8g -XX:+UseConcMarkSweepGC \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="nlpdecode" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | edu.emory.mathcs.nlp.bin.NLPDecode \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /NLP4J/bin/nlpdecode.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 
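@REM Windows counterpart of bin/nlpdecode: it assembles the NLP4J classpath from the
@REM bundled repo\ directory and launches edu.emory.mathcs.nlp.bin.NLPDecode with -Xmx8g.
@REM A hypothetical invocation mirroring the Unix examples in README.md (run from the
@REM NLP4J directory; adjust the relative paths to your own layout):
@REM   bin\nlpdecode.bat -c config-GENIA.xml -i ..\data\raw.txt -format raw -oe genia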
19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | set BASEDIR=%~dp0\.. 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-english\1.1.2\nlp4j-english-1.1.2.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-api\1.1.4-SNAPSHOT\nlp4j-api-1.1.4-SNAPSHOT.jar;"%REPO%"\org\slf4j\slf4j-api\1.7.21\slf4j-api-1.7.21.jar;"%REPO%"\org\tukaani\xz\1.5\xz-1.5.jar;"%REPO%"\it\unimi\dsi\fastutil\7.0.12\fastutil-7.0.12.jar;"%REPO%"\org\magicwerk\brownies-collections\0.9.13\brownies-collections-0.9.13.jar;"%REPO%"\org\apache\commons\commons-math3\3.5\commons-math3-3.5.jar;"%REPO%"\org\apache\commons\commons-csv\1.2\commons-csv-1.2.jar;"%REPO%"\org\slf4j\slf4j-log4j12\1.7.21\slf4j-log4j12-1.7.21.jar;"%REPO%"\log4j\log4j\1.2.17\log4j-1.2.17.jar;"%REPO%"\args4j\args4j\2.32\args4j-2.32.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-cli\1.1.4-SNAPSHOT\nlp4j-cli-1.1.4-SNAPSHOT.jar 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xmx8g -XX:+UseConcMarkSweepGC -classpath %CLASSPATH% -Dapp.name="nlpdecode" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.emory.mathcs.nlp.bin.NLPDecode %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 
104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /NLP4J/bin/version: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 
1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar:"$REPO"/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar:"$REPO"/org/tukaani/xz/1.5/xz-1.5.jar:"$REPO"/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar:"$REPO"/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar:"$REPO"/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar:"$REPO"/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar:"$REPO"/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar:"$REPO"/log4j/log4j/1.2.17/log4j-1.2.17.jar:"$REPO"/args4j/args4j/2.32/args4j-2.32.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xmx10g -XX:+UseConcMarkSweepGC \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="version" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | edu.emory.mathcs.nlp.bin.Version \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /NLP4J/bin/version.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 
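@REM Windows counterpart of bin/version: it builds the same classpath as nlpdecode.bat
@REM and launches edu.emory.mathcs.nlp.bin.Version, which is expected to simply report
@REM the bundled NLP4J build version; no configuration file or input data is required:
@REM   bin\version.bat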
19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | set BASEDIR=%~dp0\.. 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-english\1.1.2\nlp4j-english-1.1.2.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-api\1.1.4-SNAPSHOT\nlp4j-api-1.1.4-SNAPSHOT.jar;"%REPO%"\org\slf4j\slf4j-api\1.7.21\slf4j-api-1.7.21.jar;"%REPO%"\org\tukaani\xz\1.5\xz-1.5.jar;"%REPO%"\it\unimi\dsi\fastutil\7.0.12\fastutil-7.0.12.jar;"%REPO%"\org\magicwerk\brownies-collections\0.9.13\brownies-collections-0.9.13.jar;"%REPO%"\org\apache\commons\commons-math3\3.5\commons-math3-3.5.jar;"%REPO%"\org\apache\commons\commons-csv\1.2\commons-csv-1.2.jar;"%REPO%"\org\slf4j\slf4j-log4j12\1.7.21\slf4j-log4j12-1.7.21.jar;"%REPO%"\log4j\log4j\1.2.17\log4j-1.2.17.jar;"%REPO%"\args4j\args4j\2.32\args4j-2.32.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-cli\1.1.4-SNAPSHOT\nlp4j-cli-1.1.4-SNAPSHOT.jar 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xmx10g -XX:+UseConcMarkSweepGC -classpath %CLASSPATH% -Dapp.name="version" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.emory.mathcs.nlp.bin.Version %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 
104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /NLP4J/config-CRAFT.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | lexica/en-ambiguity-classes-simplified-lowercase.xz 11 | lexica/en-brown-clusters-simplified-lowercase.xz 12 | 13 | 14 | 15 | models/CRAFT.POS.model.xz 16 | models/CRAFT.DEP.model.xz 17 | 18 | 19 | -------------------------------------------------------------------------------- /NLP4J/config-GENIA.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | lexica/en-ambiguity-classes-simplified-lowercase.xz 11 | lexica/en-brown-clusters-simplified-lowercase.xz 12 | 13 | 14 | 15 | models/GENIA.POS.model.xz 16 | models/GENIA.DEP.model.xz 17 | 18 | 19 | -------------------------------------------------------------------------------- /NLP4J/etc/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=INFO, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.conversionPattern=%m%n 10 | -------------------------------------------------------------------------------- /NLP4J/lexica/en-ambiguity-classes-simplified-lowercase.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/lexica/en-ambiguity-classes-simplified-lowercase.xz -------------------------------------------------------------------------------- /NLP4J/lexica/en-brown-clusters-simplified-lowercase.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/lexica/en-brown-clusters-simplified-lowercase.xz -------------------------------------------------------------------------------- /NLP4J/models/CRAFT.DEP.model.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/CRAFT.DEP.model.xz -------------------------------------------------------------------------------- /NLP4J/models/CRAFT.POS.model.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/CRAFT.POS.model.xz -------------------------------------------------------------------------------- /NLP4J/models/GENIA.DEP.model.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/GENIA.DEP.model.xz -------------------------------------------------------------------------------- /NLP4J/models/GENIA.POS.model.xz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/GENIA.POS.model.xz -------------------------------------------------------------------------------- /NLP4J/repo/args4j/args4j/2.32/args4j-2.32.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/args4j/args4j/2.32/args4j-2.32.jar -------------------------------------------------------------------------------- /NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar -------------------------------------------------------------------------------- /NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar -------------------------------------------------------------------------------- /NLP4J/repo/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar -------------------------------------------------------------------------------- /NLP4J/repo/log4j/log4j/1.2.17/log4j-1.2.17.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/log4j/log4j/1.2.17/log4j-1.2.17.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar -------------------------------------------------------------------------------- /NLP4J/repo/org/tukaani/xz/1.5/xz-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/tukaani/xz/1.5/xz-1.5.jar -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Biomedical POS tagging and dependency parsing models 3 | 4 | Biomedical POS tagging and dependency parsing models are trained on [GENIA](http://www.geniaproject.org/) and [CRAFT](http://BioPosDep-corpora.sourceforge.net/CRAFT/). See [our following paper](https://arxiv.org/abs/1808.03731) for more details: 5 | 6 | @Article{NguyenK2019, 7 | author="Nguyen, Dat Quoc and Verspoor, Karin", 8 | title="From POS tagging to dependency parsing for biomedical event extraction", 9 | journal="BMC Bioinformatics", 10 | year="2019", 11 | month="Feb", 12 | day="12", 13 | volume="20", 14 | number="1", 15 | pages="72", 16 | doi="10.1186/s12859-019-2604-0", 17 | url="https://doi.org/10.1186/s12859-019-2604-0" 18 | } 19 | 20 | Our models are **free** for non-commercial use and distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International ([CC BY-NC-SA](https://creativecommons.org/licenses/by-nc-sa/4.0/)) License. 21 | 22 | pos dep 23 | 24 | # Usage 25 | 26 | #### The first step is to perform POS tagging and dependency parsing using [NLP4J](https://emorynlp.github.io/nlp4j/) models. Here, NLP4J would also perform _TOKENIZATION_ and _SENTENCE SEGMENTATION_ if input files are raw text corpora. Then, the output of NLP4J will be used as input for other dependency parsing models. 27 | 28 | ### Perform biomedical POS tagging and dependency parsing using retrained NLP4J models 29 | 30 | #### Installation 31 | 32 | Download NLP4J models from [https://github.com/datquocnguyen/BioPosDep/archive/master.zip](https://github.com/datquocnguyen/BioPosDep/archive/master.zip) (70MB) or clone these models using `git`: 33 | 34 | $ git clone https://github.com/datquocnguyen/BioPosDep.git 35 | 36 | To run the models, it is expected that `Java` is already set to run in command line or terminal. 37 | 38 | #### Command line 39 | 40 | # Using models trained on GENIA 41 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i -format [-ie -oe ] 42 | 43 | # Using models trained on CRAFT 44 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i -format [-ie -oe ] 45 | 46 | -i : input path (required) 47 | -format : format of the input data (raw|line|tsv; default: raw) 48 | -ie : input file extension (default: *) 49 | -oe : output file extension (default: nlp) 50 | 51 | - `-i` specifies the input path pointing to either a file or a directory. When the path points to a file, only the specific file is processed. 
When the path points to a directory, all files with the file extension `-ie` under the specific directory are processed. 52 | - `-format` specifies the format of the input file: `raw`, `line`, or `tsv` 53 | - `raw` accepts texts in any format 54 | - `line` expects a sentence per line 55 | - `tsv` expects columns delimited by `\t` and sentences separated by `\n` 56 | - `-ie` specifies the input file extension. The default value `*` implies files with any extension. This option is used only when the input path `-i` points to a directory. 57 | - `-oe` specifies the output file extension appended to each input filename. The corresponding output file, consisting of the NLP output, will be generated. 58 | 59 | #### Examples 60 | 61 | # For a raw corpus input 62 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/raw.txt -format raw -oe genia 63 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/raw.txt -format raw -oe craft 64 | 65 | # For a sentence-segmented corpus input (without tokenization!) 66 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/sentence_segmented.txt -format line -oe genia 67 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/sentence_segmented.txt -format line -oe craft 68 | 69 | # For a "pre-processed" tokenized and sentence-segmented corpus 70 | # Convert into a column-based format 71 | BioPosDep/NLP4J$ python ../get_ColumnFormat.py ../data/tokenized_sentence_segmented.txt 72 | # Apply models using "tsv". Here we expect word forms at the second column (i.e. column index of 1). 73 | # Adjust in config-GENIA.xml and config-CRAFT.xml if users already have a column-formated corpus with a different index of the word form column. 74 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/tokenized_sentence_segmented.txt.column -format tsv -oe genia 75 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/tokenized_sentence_segmented.txt.column -format tsv -oe craft 76 | 77 | 78 | From the examples above, output files `.genia` and `.craft ` are generated in folder `data`, containing POS and dependency annotations. 79 | 80 | 81 | #### NOTE 82 | Those NLP4J output files are in a 9-column format. To further apply other dependency parsing models, they must be converted to 10-column format: 83 | 84 | # Command line 85 | BioPosDep$ python convert_NLP4J_to_CoNLL.py 86 | 87 | # Examples 88 | BioPosDep$ python convert_NLP4J_to_CoNLL.py data/raw.txt.genia 89 | BioPosDep$ python convert_NLP4J_to_CoNLL.py data/raw.txt.craft 90 | 91 | ##### Two 10-column output files `raw.txt.genia.conll` and `raw.txt.craft.conll` are generated in folder `data`, which will be used as inputs for other models. 92 | 93 | ### Using retrained Stanford [Biaffine](https://github.com/tdozat/Parser-v2) parsing models 94 | 95 | #### Installation 96 | 97 | # Install prerequisite packages 98 | BioPosDep/StanfordBiaffineParser-v2$ virtualenv .TF1_0 99 | BioPosDep/StanfordBiaffineParser-v2$ source .TF1_0/bin/activate 100 | BioPosDep/StanfordBiaffineParser-v2$ pip install tensorflow==1.0 101 | BioPosDep/StanfordBiaffineParser-v2$ pip install numpy==1.11.0 102 | BioPosDep/StanfordBiaffineParser-v2$ pip install scipy==1.0.0 103 | BioPosDep/StanfordBiaffineParser-v2$ pip install matplotlib==2.1.2 104 | BioPosDep/StanfordBiaffineParser-v2$ pip install backports.lzma 105 | 106 | - Download file `Pre-trained-Biaffine-v2.zip` from [HERE](https://drive.google.com/file/d/18IYSJEV0uwbg468lFXejS0Wyw2_8Pjfa/view?usp=sharing). 
107 | - Unzip the file, then copy/move folder `models` and file `PubMed-shuffle-win2-500Kwords.txt` into folder `BioPosDep/StanfordBiaffineParser-v2`. 108 | 109 | 110 | 111 | #### Command line 112 | 113 | # Using model trained on GENIA 114 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/GENIA parse 115 | 116 | # Using model trained on CRAFT 117 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/CRAFT parse 118 | 119 | # Output parsed files are by default saved in the model directory with the same name as the input file. 120 | # NOTE: We can also specify the output directory with the --output_dir flag and/or the output file name with the --output_file flag. 121 | 122 | #### Examples 123 | 124 | # Activate TensorFlow 1.0 before running models: 125 | BioPosDep/StanfordBiaffineParser-v2$ source .TF1_0/bin/activate 126 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/GENIA parse ../data/raw.txt.genia.conll 127 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/CRAFT parse ../data/raw.txt.craft.conll 128 | 129 | Two output parsed files `raw.txt.genia.conll` and `raw.txt.craft.conll` are generated in folders `models/GENIA` and `models/CRAFT`, respectively. 130 | 131 | ### Using retrained jPTDP models 132 | 133 | See [https://github.com/datquocnguyen/jPTDP](https://github.com/datquocnguyen/jPTDP) for details. 134 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/CRAFT.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # Where things are located 3 | [Configurable] 4 | train_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/CRAFT.train.conll.20wayJK.txt 5 | parse_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/CRAFT.dev.conll.20wayJK.txt 6 | 7 | [Pretrained Vocab] 8 | filename = /home/ubuntu/WORKSPACE/PubMed-shuffle-win2-500Kwords.txt 9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix 10 | skip_header = True 11 | 12 | #*************************************************************** 13 | # Embedding hyperparameters 14 | [Char Vocab] 15 | # {RNNEmbed, CNNEmbed, MLPEmbed} 16 | embed_model = RNNEmbed 17 | 18 | # The aggregated word vocab, pretrained vocab, and char vocab 19 | [Multivocab] 20 | # probability of dropping a word embedding 21 | embed_keep_prob = .67 22 | 23 | [Tag Vocab] 24 | # probability of dropping a tag embedding 25 | embed_keep_prob = .67 26 | 27 | [RNN Embed] 28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 29 | recur_cell = LSTMCell 30 | # number of LSTM layers 31 | n_layers = 3 32 | # number of recurrent units 33 | recur_size = 400 34 | # probability of dropping a connection between timesteps at a single layer 35 | recur_keep_prob = .67 36 | # probability of dropping a connection between layers at a single timestep 37 | ff_keep_prob = .67 38 | 39 | #*************************************************************** 40 | # NLP model hyperparameters 41 | [Tagger] 42 | #if you only want it to produce the first column of tags, set this to just 'tags' 43 | output_vocabs = tags:xtags 44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 45 | recur_cell = LSTMCell 46 | # number of LSTM layers 47 | n_layers = 2 48 | # number of recurrent units in each direction of the BiLSTM 49 | recur_size = 400 50 | # number of units in the tag classifier 51 | 
mlp_size = 600 52 | # probability of dropping a node in the MLP or the classifier 53 | mlp_keep_prob = .67 54 | # probability of dropping a connection between timesteps at a single layer 55 | recur_keep_prob = .5 56 | # probability of dropping a connection between layers at a single timestep 57 | ff_keep_prob = .67 58 | 59 | [Parser] 60 | # if you only want it to use the first column of tags, set this to 'words:tags' 61 | input_vocabs = words:tags:xtags 62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 63 | recur_cell = LSTMCell 64 | # number of layers 65 | n_layers = 3 66 | # number of recurrent units 67 | recur_size = 400 68 | # number of units in the edge classifier 69 | arc_mlp_size = 600 70 | # number of units in the label classifier (you probably want this to be small!) 71 | rel_mlp_size = 100 72 | # probability of dropping a node in the MLP or the classifier 73 | mlp_keep_prob = .67 74 | # probability of dropping a connection between timesteps at a single layer 75 | recur_keep_prob = .67 76 | # probability of dropping a connection between layers at a single timestep 77 | ff_keep_prob = .67 78 | 79 | #*************************************************************** 80 | # Training hyperparameters 81 | [Network] 82 | # {Parser, Tagger} 83 | nlp_model = Parser 84 | quit_after_n_iters_without_improvement = 5000 85 | max_train_iters = 20001 86 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/GENIA.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # Where things are located 3 | [Configurable] 4 | train_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/GENIA.train.conll.20wayJK.txt 5 | parse_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/GENIA.dev.conll.20wayJK.txt 6 | 7 | [Pretrained Vocab] 8 | filename = /home/ubuntu/WORKSPACE/PubMed-shuffle-win2-500Kwords.txt 9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix 10 | skip_header = True 11 | 12 | #*************************************************************** 13 | # Embedding hyperparameters 14 | [Char Vocab] 15 | # {RNNEmbed, CNNEmbed, MLPEmbed} 16 | embed_model = RNNEmbed 17 | 18 | # The aggregated word vocab, pretrained vocab, and char vocab 19 | [Multivocab] 20 | # probability of dropping a word embedding 21 | embed_keep_prob = .67 22 | 23 | [Tag Vocab] 24 | # probability of dropping a tag embedding 25 | embed_keep_prob = .67 26 | 27 | [RNN Embed] 28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 29 | recur_cell = LSTMCell 30 | # number of LSTM layers 31 | n_layers = 3 32 | # number of recurrent units 33 | recur_size = 400 34 | # probability of dropping a connection between timesteps at a single layer 35 | recur_keep_prob = .67 36 | # probability of dropping a connection between layers at a single timestep 37 | ff_keep_prob = .67 38 | 39 | #*************************************************************** 40 | # NLP model hyperparameters 41 | [Tagger] 42 | #if you only want it to produce the first column of tags, set this to just 'tags' 43 | output_vocabs = tags:xtags 44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 45 | recur_cell = LSTMCell 46 | # number of LSTM layers 47 | n_layers = 2 48 | # number of recurrent units in each direction of the BiLSTM 49 | recur_size = 400 50 | # number of units in the tag classifier 51 | mlp_size = 600 52 | # probability of dropping a node 
in the MLP or the classifier 53 | mlp_keep_prob = .67 54 | # probability of dropping a connection between timesteps at a single layer 55 | recur_keep_prob = .5 56 | # probability of dropping a connection between layers at a single timestep 57 | ff_keep_prob = .67 58 | 59 | [Parser] 60 | # if you only want it to use the first column of tags, set this to 'words:tags' 61 | input_vocabs = words:tags:xtags 62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 63 | recur_cell = LSTMCell 64 | # number of layers 65 | n_layers = 3 66 | # number of recurrent units 67 | recur_size = 400 68 | # number of units in the edge classifier 69 | arc_mlp_size = 600 70 | # number of units in the label classifier (you probably want this to be small!) 71 | rel_mlp_size = 100 72 | # probability of dropping a node in the MLP or the classifier 73 | mlp_keep_prob = .67 74 | # probability of dropping a connection between timesteps at a single layer 75 | recur_keep_prob = .67 76 | # probability of dropping a connection between layers at a single timestep 77 | ff_keep_prob = .67 78 | 79 | #*************************************************************** 80 | # Training hyperparameters 81 | [Network] 82 | # {Parser, Tagger} 83 | nlp_model = Parser 84 | quit_after_n_iters_without_improvement = 5000 85 | max_train_iters = 20001 86 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/defaults.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # High level stuff 3 | [DEFAULT] 4 | save_dir = saves/defaults 5 | data_dir = data 6 | lc = en 7 | treebank = English 8 | lang = English 9 | 10 | [Configurable] 11 | train_files = %(data_dir)s/CoNLL17/UD_%(treebank)s/%(lc)s-ud-train.conllu 12 | parse_files = %(data_dir)s/CoNLL17/UD_%(treebank)s/%(lc)s-ud-dev.conllu 13 | verbose = True 14 | name = None 15 | 16 | #*************************************************************** 17 | # Vocab data structures 18 | [Base Vocab] 19 | # TODO take special_tokens out of here and put them in the classes 20 | cased = None 21 | embed_size = 100 22 | 23 | [Pretrained Vocab] 24 | special_tokens=::: 25 | skip_header = True 26 | name = pretrained 27 | filename = %(data_dir)s/embeddings/%(lang)s/%(lc)s.vectors.xz 28 | cased = False 29 | max_rank = 0 30 | 31 | [Token Vocab] 32 | name = tokens 33 | embed_keep_prob = .67 34 | min_occur_count = 2 35 | max_rank = 100000 36 | 37 | [Index Vocab] 38 | special_tokens=: 39 | 40 | [Dep Vocab] 41 | name = deps 42 | 43 | [Head Vocab] 44 | name = heads 45 | 46 | [Word Vocab] 47 | special_tokens=::: 48 | name = words 49 | filename = %(save_dir)s/%(name)s.txt 50 | cased = False 51 | 52 | [Lemma Vocab] 53 | name = lemmas 54 | filename = %(save_dir)s/%(name)s.txt 55 | 56 | [Tag Vocab] 57 | special_tokens=PAD:ROOT:DROP:UNK 58 | name = tags 59 | filename = %(save_dir)s/%(name)s.txt 60 | cased = True 61 | 62 | [X Tag Vocab] 63 | name = xtags 64 | filename = %(save_dir)s/%(name)s.txt 65 | 66 | [Rel Vocab] 67 | special_tokens=pad:root:drop:unk 68 | name = rels 69 | filename = %(save_dir)s/%(name)s.txt 70 | cased = True 71 | 72 | [Subtoken Vocab] 73 | max_rank = 0 74 | # TODO Setting this to more than 1 triggers a bug 75 | n_buckets = 2 76 | embed_model = CNNEmbed 77 | embed_keep_prob = 1 78 | 79 | [Char Vocab] 80 | special_tokens = :::::: 81 | name = chars 82 | filename = %(save_dir)s/%(name)s.txt 83 | embed_model = RNNEmbed 84 | 85 | [Ngram Vocab] 86 | 
special_tokens = :::: 87 | name = ngrams 88 | filename = %(save_dir)s/%(name)s.txt 89 | embed_model = MLPEmbed 90 | 91 | [Ngram Multivocab] 92 | special_tokens = :::: 93 | name = multi-ngram 94 | max_n = 5 95 | embed_model = MLPEmbed 96 | 97 | [Bytepair Vocab] 98 | name = bytepairs 99 | filename = %(save_dir)s/%(name)s.txt 100 | n_bytepairs = 500 101 | embed_model = MLPEmbed 102 | 103 | [Multivocab] 104 | embed_keep_prob = .67 105 | 106 | #*************************************************************** 107 | # Neural models 108 | [NN] 109 | recur_cell = LSTMCell 110 | n_layers = 3 111 | mlp_func = leaky_relu 112 | conv_func = leaky_relu 113 | # TODO make sure you add this to Base Cell 114 | recur_size = 200 115 | window_size = 5 116 | conv_size = 200 117 | mlp_size = 200 118 | rnn_func = birnn 119 | conv_keep_prob = .67 120 | mlp_keep_prob = .67 121 | recur_keep_prob = .67 122 | ff_keep_prob = .67 123 | 124 | [Base Cell] 125 | forget_bias = 0 126 | recur_func = tanh 127 | recur_size = 300 128 | 129 | [RNN Cell] 130 | recur_func = leaky_relu 131 | recur_size = 400 132 | 133 | [Base Embed] 134 | 135 | [MLP Embed] 136 | 137 | [RNN Embed] 138 | rnn_func = rnn 139 | 140 | [CNN Embed] 141 | 142 | [Base Tagger] 143 | input_vocabs = words 144 | output_vocabs = tags 145 | 146 | [Base X Tagger] 147 | input_vocabs = words 148 | output_vocabs = tags:xtags 149 | 150 | [Tagger] 151 | name = tagger 152 | n_layers = 2 153 | recur_keep_prob = .5 154 | 155 | [X Tagger] 156 | name = xtagger 157 | n_layers = 2 158 | recur_keep_prob = .5 159 | 160 | [Base Parser] 161 | # TODO take off xtags later 162 | input_vocabs = words:tags:xtags 163 | output_vocabs = rels:heads 164 | 165 | [Parser] 166 | name = parser 167 | arc_mlp_size = 400 168 | rel_mlp_size = 100 169 | 170 | [Xbar Parser] 171 | name = xbar_parser 172 | p_mlp_size = 400 173 | arc_mlp_size = 400 174 | rel_mlp_size = 100 175 | 176 | [Bin Parser] 177 | name = bin_parser 178 | p_mlp_size = 400 179 | arc_mlp_size = 400 180 | rel_mlp_size = 100 181 | 182 | [Fish Parser] 183 | name = fish_parser 184 | lambda_mlp_size = 400 185 | arc_mlp_size = 400 186 | rel_mlp_size = 100 187 | 188 | [Gama Parser] 189 | name = fish_parser 190 | p_mlp_size = 400 191 | arc_mlp_size = 400 192 | rel_mlp_size = 100 193 | 194 | [Joint Parser] 195 | tag_mlp_size = 500 196 | arc_mlp_size = 500 197 | rel_mlp_size = 100 198 | 199 | #*************************************************************** 200 | # Sequence data structures 201 | [Multibucket] 202 | n_buckets = 2 203 | name = multibucket 204 | 205 | [Bucket] 206 | name = None 207 | 208 | [Dataset] 209 | #TODO make sure you can get rid of data_files 210 | 211 | [Trainset] 212 | name = trainset 213 | data_files = train_files 214 | n_buckets = 10 215 | batch_by = tokens 216 | batch_size = 5000 217 | 218 | [Parseset] 219 | name = parseset 220 | data_files = parse_files 221 | n_buckets = 5 222 | batch_by = tokens 223 | batch_size = 50000 224 | 225 | 226 | #*************************************************************** 227 | # Training 228 | [Network] 229 | name = network 230 | subtoken_vocab = CharVocab 231 | nlp_model = Parser 232 | min_train_iters = 1000 233 | max_train_iters = 20001 234 | validate_every = 100 235 | save_every = 1 236 | quit_after_n_iters_without_improvement = 5000 237 | per_process_gpu_memory_fraction = -1 238 | 239 | #*************************************************************** 240 | # Miscellaneous 241 | [Radam Optimizer] 242 | name = radam 243 | # TODO keep adjusting lr? 
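# (learning_rate below is only the initial value: the optimizer anneals it during
#  training using decay and decay_steps -- roughly lr * decay^(step / decay_steps),
#  i.e. with these defaults the rate shrinks by a factor of .75 about every 5000
#  iterations; see parser/neural/optimizers/ for the exact schedule)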
244 | learning_rate = 2e-3 245 | decay = .75 246 | decay_steps = 5000 247 | clip = 5 248 | mu = .9 249 | nu = .9 250 | gamma = 0 251 | chi = 0 252 | epsilon = 1e-12 253 | 254 | [Zipf] 255 | n_zipfs = 3 256 | name = zipf 257 | filename = %(save_dir)s/%(name)s.txt 258 | batch_size = 500 259 | max_train_iters = 5000 260 | print_every = 500 261 | 262 | [Bucketer] 263 | name = bucketer 264 | filename = %(save_dir)s/%(name)s.txt 265 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/config/template.cfg: -------------------------------------------------------------------------------- 1 | #*************************************************************** 2 | # Where things are located 3 | [Configurable] 4 | train_files = colon/separated/list/of/files:supports/glob/* 5 | parse_files = colon/separated/list/of/files:supports/glob/* 6 | 7 | [Pretrained Vocab] 8 | filename = location/of/pretrained/embeddings 9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix 10 | skip_header = True 11 | 12 | #*************************************************************** 13 | # Embedding hyperparameters 14 | [Char Vocab] 15 | # {RNNEmbed, CNNEmbed, MLPEmbed} 16 | embed_model = RNNEmbed 17 | 18 | # The aggregated word vocab, pretrained vocab, and char vocab 19 | [Multivocab] 20 | # probability of dropping a word embedding 21 | embed_keep_prob = .67 22 | 23 | [Tag Vocab] 24 | # probability of dropping a tag embedding 25 | embed_keep_prob = .67 26 | 27 | [RNN Embed] 28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 29 | recur_cell = LSTMCell 30 | # number of LSTM layers 31 | n_layers = 3 32 | # number of recurrent units 33 | recur_size = 400 34 | # probability of dropping a connection between timesteps at a single layer 35 | recur_keep_prob = .67 36 | # probability of dropping a connection between layers at a single timestep 37 | ff_keep_prob = .67 38 | 39 | #*************************************************************** 40 | # NLP model hyperparameters 41 | [Tagger] 42 | #if you only want it to produce the first column of tags, set this to just 'tags' 43 | output_vocabs = tags:xtags 44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 45 | recur_cell = LSTMCell 46 | # number of LSTM layers 47 | n_layers = 2 48 | # number of recurrent units in each direction of the BiLSTM 49 | recur_size = 400 50 | # number of units in the tag classifier 51 | mlp_size = 600 52 | # probability of dropping a node in the MLP or the classifier 53 | mlp_keep_prob = .67 54 | # probability of dropping a connection between timesteps at a single layer 55 | recur_keep_prob = .5 56 | # probability of dropping a connection between layers at a single timestep 57 | ff_keep_prob = .67 58 | 59 | [Parser] 60 | # if you only want it to use the first column of tags, set this to 'words:tags' 61 | input_vocabs = words:tags:xtags 62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell} 63 | recur_cell = LSTMCell 64 | # number of layers 65 | n_layers = 3 66 | # number of recurrent units 67 | recur_size = 400 68 | # number of units in the edge classifier 69 | arc_mlp_size = 600 70 | # number of units in the label classifier (you probably want this to be small!) 
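# (in the Dozat & Manning biaffine design the label scorer applies a biaffine product
#  over two rel_mlp_size-dimensional head/dependent vectors for every relation label,
#  so its parameter count grows roughly with |rels| * rel_mlp_size^2 -- hence a small
#  value such as 100 is usually sufficient, while arc_mlp_size can stay large)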
71 | rel_mlp_size = 100 72 | # probability of dropping a node in the MLP or the classifier 73 | mlp_keep_prob = .67 74 | # probability of dropping a connection between timesteps at a single layer 75 | recur_keep_prob = .67 76 | # probability of dropping a connection between layers at a single timestep 77 | ff_keep_prob = .67 78 | 79 | #*************************************************************** 80 | # Training hyperparameters 81 | [Network] 82 | # {Parser, Tagger} 83 | nlp_model = Parser 84 | quit_after_n_iters_without_improvement = 5000 85 | max_train_iters = 50000 86 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import os 24 | import sys 25 | import codecs 26 | from argparse import ArgumentParser 27 | 28 | from parser import Configurable 29 | from parser import Network 30 | 31 | # TODO make the pretrained vocab names a list given to TokenVocab 32 | #*************************************************************** 33 | # Set up the argparser 34 | argparser = ArgumentParser('Network') 35 | argparser.add_argument('--save_dir', required=True) 36 | subparsers = argparser.add_subparsers() 37 | section_names = set() 38 | # --section_name opt1=value1 opt2=value2 opt3=value3 39 | with codecs.open('config/defaults.cfg') as f: 40 | section_regex = re.compile('\[(.*)\]') 41 | for line in f: 42 | match = section_regex.match(line) 43 | if match: 44 | section_names.add(match.group(1).lower().replace(' ', '_')) 45 | 46 | #=============================================================== 47 | # Train 48 | #--------------------------------------------------------------- 49 | def train(save_dir, **kwargs): 50 | """""" 51 | 52 | kwargs['config_file'] = kwargs.pop('config_file', '') 53 | load = kwargs.pop('load') 54 | try: 55 | if not load and os.path.isdir(save_dir): 56 | raw_input('Save directory already exists. 
Press to continue or to abort.') 57 | if os.path.isfile(os.path.join(save_dir, 'config.cfg')): 58 | os.remove(os.path.join(save_dir, 'config.cfg')) 59 | except KeyboardInterrupt: 60 | print() 61 | sys.exit(0) 62 | network = Network(**kwargs) 63 | network.train(load=load) 64 | return 65 | #--------------------------------------------------------------- 66 | 67 | train_parser = subparsers.add_parser('train') 68 | train_parser.set_defaults(action=train) 69 | train_parser.add_argument('--load', action='store_true') 70 | train_parser.add_argument('--config_file') 71 | for section_name in section_names: 72 | train_parser.add_argument('--'+section_name, nargs='+') 73 | 74 | #=============================================================== 75 | # Parse 76 | #--------------------------------------------------------------- 77 | def parse(save_dir, **kwargs): 78 | """""" 79 | 80 | kwargs['config_file'] = os.path.join(save_dir, 'config.cfg') 81 | files = kwargs.pop('files') 82 | output_file = kwargs.pop('output_file', None) 83 | output_dir = kwargs.pop('output_dir', None) 84 | if len(files) > 1 and output_file is not None: 85 | raise ValueError('Cannot provide a value for --output_file when parsing multiple files') 86 | kwargs['is_evaluation'] = True 87 | network = Network(**kwargs) 88 | network.parse(files, output_file=output_file, output_dir=output_dir) 89 | return 90 | #--------------------------------------------------------------- 91 | 92 | parse_parser = subparsers.add_parser('parse') 93 | parse_parser.set_defaults(action=parse) 94 | parse_parser.add_argument('files', nargs='+') 95 | for section_name in section_names: 96 | parse_parser.add_argument('--'+section_name, nargs='+') 97 | parse_parser.add_argument('--output_file') 98 | parse_parser.add_argument('--output_dir') 99 | 100 | #*************************************************************** 101 | # Parse the arguments 102 | kwargs = vars(argparser.parse_args()) 103 | action = kwargs.pop('action') 104 | save_dir = kwargs.pop('save_dir') 105 | kwargs = {key: value for key, value in kwargs.iteritems() if value is not None} 106 | for section, values in kwargs.iteritems(): 107 | if section in section_names: 108 | values = [value.split('=', 1) for value in values] 109 | kwargs[section] = {opt: value for opt, value in values} 110 | if 'default' not in kwargs: 111 | kwargs['default'] = {} 112 | kwargs['default']['save_dir'] = save_dir 113 | action(save_dir, **kwargs) 114 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | 4 | from configurable import Configurable 5 | from bucket import Bucket 6 | from multibucket import Multibucket 7 | from network import Network -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/bucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.configurable import Configurable 26 | 27 | #*************************************************************** 28 | class Bucket(Configurable): 29 | """""" 30 | 31 | #============================================================= 32 | def __init__(self, *args, **kwargs): 33 | """""" 34 | 35 | embed_model = kwargs.pop('embed_model', None) 36 | super(Bucket, self).__init__(*args, **kwargs) 37 | 38 | self._indices = [] 39 | self._maxlen = 0 40 | self._depth = 1 41 | self._tokens = [] 42 | if embed_model is not None: 43 | self._embed_model = embed_model.from_configurable(self, name=self.name) 44 | else: 45 | self._embed_model = None 46 | return 47 | 48 | #============================================================= 49 | def __call__(self, vocab, keep_prob=None, moving_params=None): 50 | """""" 51 | 52 | return self.embed_model(vocab, keep_prob=keep_prob, moving_params=moving_params) 53 | 54 | #============================================================= 55 | def open(self, maxlen, depth=None): 56 | """""" 57 | 58 | if depth is None: 59 | self._indices = [[0]] 60 | else: 61 | self._indices = [[[0]*depth]] 62 | self._tokens = [['']] 63 | self._maxlen = maxlen 64 | self._depth = depth 65 | return self 66 | 67 | #============================================================= 68 | def add(self, idxs, tokens=None): 69 | """""" 70 | 71 | if isinstance(self.indices, np.ndarray): 72 | raise TypeError("The bucket has already been closed, you can't add to it") 73 | if len(idxs) > len(self) and len(self) != -1: 74 | raise ValueError('Bucket of max len %d received sequence of len %d' % (len(self), len(idxs))) 75 | 76 | self.indices.append(idxs) 77 | if tokens is not None: 78 | self.tokens.append(tokens) 79 | return len(self.indices) - 1 80 | 81 | #============================================================= 82 | def get_tokens(self, batch): 83 | """""" 84 | 85 | return [self.tokens[sent_idx] for sent_idx in batch] 86 | 87 | #============================================================= 88 | def close(self): 89 | """""" 90 | 91 | if self.depth is None: 92 | indices = np.zeros((len(self.indices), len(self)), dtype=np.int32) 93 | for i, sequence in enumerate(self.indices): 94 | indices[i,0:len(sequence)] = sequence 95 | else: 96 | indices = np.zeros((len(self.indices), len(self), self.depth), dtype=np.int32) 97 | for i, sequence in enumerate(self.indices): 98 | for j, index in enumerate(sequence): 99 | indices[i,j,0:len(index)] = index 100 | self._indices = indices 101 | 102 | #============================================================= 103 | @classmethod 104 | def from_dataset(cls, dataset, bkt_idx, *args, **kwargs): 105 | """""" 106 | 107 | kwargs = dict(kwargs) 108 | kwargs['name'] = '{name}-{bkt_idx}'.format(name=dataset.name, bkt_idx=bkt_idx) 109 | bucket = cls.from_configurable(dataset, *args, **kwargs) 110 | indices = [] 111 | for multibucket in dataset: 
112 | indices.append(multibucket[bkt_idx].indices) 113 | for i in xrange(len(indices)): 114 | if len(indices[i].shape) == 2: 115 | indices[i] = indices[i][:,:,None] 116 | bucket._indices = np.concatenate(indices, axis=2) 117 | bucket._maxlen = bucket.indices.shape[1] 118 | bucket._depth = bucket.indices.shape[2] 119 | return bucket 120 | 121 | #============================================================= 122 | def reset_placeholders(self): 123 | self.embed_model.reset_placeholders() 124 | return 125 | #============================================================= 126 | @property 127 | def tokens(self): 128 | return self._tokens 129 | @property 130 | def indices(self): 131 | return self._indices 132 | @property 133 | def embed_model(self): 134 | return self._embed_model 135 | @property 136 | def depth(self): 137 | return self._depth 138 | @property 139 | def placeholder(self): 140 | return self.embed_model.placeholder 141 | 142 | #============================================================= 143 | def __len__(self): 144 | return self._maxlen 145 | def __enter__(self): 146 | return self 147 | def __exit__(self, exception_type, exception_value, trace): 148 | if exception_type is not None: 149 | raise exception_type(exception_value) 150 | self.close() 151 | return 152 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/misc/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/misc/colors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
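The module that follows, parser/misc/colors.py, is just a table of raw ANSI escape codes plus two helpers, ctext and color_pattern, that the parsers use for console logging (see print_accuracy in base_parser.py further down). A hypothetical usage sketch, with invented strings, assuming the parser package is importable:

    # Hypothetical usage of the helpers defined below; the strings are invented.
    from parser.misc.colors import ctext, color_pattern

    print(ctext('LAS: 91.2%', 'bold', 'bright_cyan'))     # bold bright-cyan text, reset at the end
    print(color_pattern('UAS:', '93.0%', 'bright_cyan'))  # bold label followed by an underlined value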
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | colors = { 23 | None: '\033[0m', 24 | 'bold': '\033[1m', 25 | 'italic': '\033[3m', 26 | 'uline': '\033[4m', 27 | 'blink': '\033[5m', 28 | 'hlight': '\033[7m', 29 | 30 | 'black': '\033[30m', 31 | 'red': '\033[31m', 32 | 'green': '\033[32m', 33 | 'yellow': '\033[33m', 34 | 'blue': '\033[34m', 35 | 'magenta': '\033[35m', 36 | 'cyan': '\033[36m', 37 | 'white': '\033[37m', 38 | 39 | 'black_hlight': '\033[40m', 40 | 'red_hlight': '\033[41m', 41 | 'green_hlight': '\033[42m', 42 | 'yellow_hlight': '\033[43m', 43 | 'blue_hlight': '\033[44m', 44 | 'magenta_hlight': '\033[45m', 45 | 'cyan_hlight': '\033[46m', 46 | 'white_hlight': '\033[47m', 47 | 48 | 'bright_black': '\033[90m', 49 | 'bright_red': '\033[91m', 50 | 'bright_green': '\033[92m', 51 | 'bright_yellow': '\033[93m', 52 | 'bright_blue': '\033[94m', 53 | 'bright_magenta': '\033[95m', 54 | 'bright_cyan': '\033[96m', 55 | 'bright_white': '\033[97m', 56 | 57 | 'bright_black_hlight': '\033[100m', 58 | 'bright_red_hlight': '\033[101m', 59 | 'bright_green_hlight': '\033[102m', 60 | 'bright_orange_hlight': '\033[103m', 61 | 'bright_blue_hlight': '\033[104m', 62 | 'bright_magenta_hlight': '\033[105m', 63 | 'bright_cyan_hlight': '\033[106m', 64 | 'bright_white_hlight': '\033[107m', 65 | } 66 | 67 | def ctext(text, *color_list): 68 | return ''.join(colors[color] for color in color_list) + text + colors[None] 69 | def color_pattern(text1, text2, *color_list): 70 | multicolor = ''.join(colors[color] for color in color_list) 71 | return multicolor + colors['bold'] + text1 + colors[None] + ' ' + multicolor + colors['uline'] + text2 + colors[None] 72 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/misc/get_encoding.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License.
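get_encoding below simply tries each candidate encoding by reading the whole file and returns the first one that decodes cleanly, raising ValueError if none does. A sketch of the intended call pattern, with a placeholder file name:

    # Placeholder file name; assumes the function is imported from parser.misc.get_encoding.
    import codecs
    from parser.misc.get_encoding import get_encoding

    enc = get_encoding('some_treebank.conllu')        # 'utf-8' or 'ascii', else ValueError
    with codecs.open('some_treebank.conllu', encoding=enc) as f:
        n_lines = sum(1 for _ in f)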
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import codecs 23 | 24 | #*************************************************************** 25 | encodings = ['utf-8', 'ascii'] 26 | 27 | def get_encoding(filename): 28 | """""" 29 | 30 | success = False 31 | for encoding in encodings: 32 | with codecs.open(filename, encoding=encoding) as f: 33 | try: 34 | for i, line in enumerate(f): 35 | pass 36 | success = True 37 | break 38 | except ValueError as e: 39 | print('Encoding {0} failed for file {1} at line {2}: {3}\n{4}'.format(encoding, filename, i, line, e)) 40 | continue 41 | 42 | if success: 43 | return encoding 44 | else: 45 | raise ValueError('No valid encoding found for file {0}'.format(filename)) 46 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/multibucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser import Configurable 26 | from parser import Bucket 27 | from parser.misc.colors import ctext 28 | 29 | #*************************************************************** 30 | class Multibucket(Configurable): 31 | """""" 32 | 33 | #============================================================= 34 | def __init__(self, *args, **kwargs): 35 | """""" 36 | 37 | self._embed_model = kwargs.pop('embed_model', None) 38 | super(Multibucket, self).__init__(*args, **kwargs) 39 | 40 | self._indices = [] 41 | self._buckets = [] 42 | self._len2idx = {} 43 | self.placeholder = None 44 | return 45 | 46 | #============================================================= 47 | def __call__(self, vocab, keep_prob=None, moving_params=None): 48 | """""" 49 | 50 | # This placeholder is used to ensure the bucket data is in the right order 51 | reuse = None if moving_params is None else True 52 | self.generate_placeholder() 53 | embeddings = [] 54 | for i, bucket in enumerate(self): 55 | if i > 0: 56 | reuse = True 57 | with tf.variable_scope(self.name+'-multibucket', reuse=reuse): 58 | embeddings.append(bucket(vocab, keep_prob=keep_prob, moving_params=moving_params)) 59 | return tf.nn.embedding_lookup(tf.concat(embeddings, axis=0), self.placeholder) 60 | 61 | #============================================================= 62 | def reset_placeholders(self): 63 | self.placeholder = None 64 | for bucket in self: 65 | bucket.reset_placeholders() 66 | return 67 | 68 | #============================================================= 69 | def generate_placeholder(self): 70 | """""" 71 | 72 | if self.placeholder is None: 73 | self.placeholder = 
tf.placeholder(tf.int32, shape=(None,), name=self.name+'-multibucket') 74 | return self.placeholder 75 | 76 | #============================================================= 77 | def open(self, maxlens, depth=None): 78 | """""" 79 | 80 | self._indices = [(0,0)] 81 | self._buckets = [] 82 | self._len2idx = {} 83 | prevlen = -1 84 | for idx, maxlen in enumerate(maxlens): 85 | self._buckets.append(Bucket.from_configurable(self, embed_model=self.embed_model, name='%s-%d' % (self.name, idx)).open(maxlen, depth=depth)) 86 | self._len2idx.update(zip(range(prevlen+1, maxlen+1), [idx]*(maxlen-prevlen))) 87 | prevlen = maxlen 88 | return self 89 | 90 | #============================================================= 91 | def add(self, idxs, tokens=None): 92 | """""" 93 | 94 | if isinstance(self.indices, np.ndarray): 95 | raise TypeError("The buckets have already been closed, you can't add to them") 96 | 97 | idx = self._len2idx.get(len(idxs), len(self)-1) 98 | bkt_idx = self[idx].add(idxs, tokens=tokens) 99 | self.indices.append( (idx, bkt_idx) ) 100 | return len(self.indices) - 1 101 | 102 | #============================================================= 103 | def close(self): 104 | """""" 105 | 106 | for bucket in self: 107 | bucket.close() 108 | 109 | self._indices = np.array(self.indices, dtype=[('bkt_idx', 'i4'), ('idx', 'i4')]) 110 | return 111 | 112 | #============================================================= 113 | def inv_idxs(self): 114 | """""" 115 | 116 | return np.argsort(np.concatenate([np.where(self.indices['bkt_idx'][1:] == bkt_idx)[0] for bkt_idx in xrange(len(self))])) 117 | 118 | #============================================================= 119 | def get_tokens(self, bkt_idx, batch): 120 | """""" 121 | 122 | return self[bkt_idx].get_tokens(batch) 123 | 124 | #============================================================= 125 | @classmethod 126 | def from_dataset(cls, dataset, *args, **kwargs): 127 | """""" 128 | 129 | multibucket = cls.from_configurable(dataset, *args, **kwargs) 130 | indices = [] 131 | for multibucket_ in dataset: 132 | indices.append(multibucket_.indices) 133 | for i in xrange(1, len(indices)): 134 | assert np.equal(indices[0].astype(int), indices[i].astype(int)).all() 135 | multibucket._indices = np.array(multibucket_.indices) 136 | buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in xrange(len(multibucket_))] 137 | multibucket._buckets = buckets 138 | if dataset.verbose: 139 | for bucket in multibucket: 140 | print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue'))) 141 | return multibucket 142 | 143 | #============================================================= 144 | @property 145 | def indices(self): 146 | return self._indices 147 | @property 148 | def embed_model(self): 149 | return self._embed_model 150 | 151 | #============================================================= 152 | def __str__(self): 153 | return str(self._buckets) 154 | def __iter__(self): 155 | return (bucket for bucket in self._buckets) 156 | def __getitem__(self, key): 157 | return self._buckets[key] 158 | def __len__(self): 159 | return len(self._buckets) 160 | def __enter__(self): 161 | return self 162 | def __exit__(self, exception_type, exception_value, trace): 163 | if exception_type is not None: 164 | raise exception_type(exception_value) 165 | self.close() 166 | return 167 | -------------------------------------------------------------------------------- 
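Taken together, Bucket and Multibucket above implement length-bucketed batching: each sentence's index sequence is routed to the smallest bucket whose maxlen fits it, close() packs every bucket into a padded np.int32 array, and inv_idxs() gives the permutation that restores the original sentence order after bucketed processing. (Note that the code base is Python 2 throughout: xrange, dict.iteritems and raw_input appear in these files.) A minimal lifecycle sketch, assuming a configured Multibucket instance and pre-indexed sentences built elsewhere (e.g. in dataset.py):

    # Minimal sketch of the open/add/close lifecycle above; `multibucket` and
    # `indexed_sents` are assumed to already exist.
    with multibucket.open([10, 20, 40], depth=None):
        positions = [multibucket.add(idxs) for idxs in indexed_sents]
    # Leaving the with-block calls close(): every bucket now holds a padded np.int32
    # matrix and multibucket.indices is a structured array of (bkt_idx, idx) pairs.
    order = multibucket.inv_idxs()  # permutation back to the original sentence order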
/StanfordBiaffineParser-v2/parser/neural/__init__.py: -------------------------------------------------------------------------------- 1 | import models 2 | import optimizers 3 | import recur_cells -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | 11 | #*************************************************************** 12 | sig_const = np.arctanh(1/3) 13 | tanh_const = np.arctanh(np.sqrt(1/3)) 14 | 15 | def gate(x): 16 | return tf.nn.sigmoid(2*x) 17 | 18 | def tanh(x): 19 | return tf.nn.tanh(x) 20 | 21 | def gated_tanh(x): 22 | dim = len(x.get_shape().as_list())-1 23 | cell_act, gate_act = tf.split(x, 2, dim) 24 | return gate(gate_act) * tanh(cell_act) 25 | 26 | def identity(x): 27 | return tf.identity(x) 28 | 29 | def gated_identity(x): 30 | dim = len(x.get_shape().as_list())-1 31 | cell_act, gate_act = tf.split(x, 2, dim) 32 | return gate(gate_act) * identity(cell_act) 33 | 34 | def softplus(x): 35 | return tf.nn.softplus(2*x)/2 36 | 37 | def elu(x): 38 | return tf.nn.elu(x) 39 | 40 | def relu(x): 41 | return tf.nn.relu(x) 42 | 43 | def leaky_relu(x): 44 | return tf.maximum(.1*x, x) -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/__init__.py: -------------------------------------------------------------------------------- 1 | from nn import NN -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.vocabs import TokenVocab, Multivocab 26 | from parser.neural.models import NN 27 | 28 | #*************************************************************** 29 | class BaseEmbed(NN): 30 | """""" 31 | 32 | #============================================================= 33 | def __init__(self, *args, **kwargs): 34 | """""" 35 | 36 | super(BaseEmbed, self).__init__(*args, **kwargs) 37 | # This placeholder represents the data in the bucket that called BaseEmbed.__init__ 38 | self.placeholder = None 39 | return 40 | 41 | #============================================================= 42 | def reset_placeholders(self): 43 | self.placeholder = None 44 | return 45 | 46 | #============================================================= 47 | def __call__(self, vocab, keep_prob=None, moving_params=None): 48 | """""" 49 | 50 | self.moving_params = moving_params 51 | if isinstance(vocab, Multivocab): 52 | multivocab = vocab 53 | self.generate_placeholder([None,None,None]) 54 | embeddings = [TokenVocab.__call__(vocab, self.placeholder[:,:,i]) for i, vocab in enumerate(multivocab)] 55 | embeddings = tf.stack(embeddings, axis=2) 56 | # (n x b x g x d) -> (n x b x d) 57 | with tf.variable_scope('Pre-Attn'): 58 | embeddings = self.linear_attention(embeddings) 59 | self._tokens_to_keep = tf.to_float(tf.greater(self.placeholder[:,:,0], vocab.PAD)) 60 | else: 61 | self.generate_placeholder([None,None]) 62 | # (n x b x d) 63 | embeddings = TokenVocab.__call__(vocab, self.placeholder) 64 | self._tokens_to_keep = tf.to_float(tf.greater(self.placeholder, vocab.PAD)) 65 | self._batch_size = tf.shape(self.placeholder)[0] 66 | self._bucket_size = tf.shape(self.placeholder)[1] 67 | self._sequence_lengths = tf.to_int32(tf.reduce_sum(self.tokens_to_keep, axis=1)) 68 | self._n_tokens = tf.reduce_sum(self.sequence_lengths) 69 | return embeddings 70 | 71 | #============================================================= 72 | def generate_placeholder(self, shape): 73 | if self.placeholder is None: 74 | self.placeholder = tf.placeholder(tf.int32, shape=shape, name='%s-bkt' % self.name) 75 | return self.placeholder 76 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/cnn_embed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.embeds.base_embed import BaseEmbed 26 | 27 | #*************************************************************** 28 | class CNNEmbed(BaseEmbed): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocab, **kwargs): 33 | """""" 34 | 35 | # (n x b x d) 36 | embeddings = super(CNNEmbed, self).__call__(vocab, **kwargs) 37 | # (n x b x d) -> (n x b x h) 38 | with tf.variable_scope('CNN'): 39 | conv = self.CNN(embeddings, self.window_size, self.conv_size) 40 | # (n x b x h) -> (n x h) 41 | hidden = tf.reduce_max(conv, axis=1) 42 | # (n x h) -> (n x o) 43 | linear = self.linear(hidden, vocab.token_embed_size) 44 | return linear 45 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/mlp_embed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
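CNNEmbed above and the MLPEmbed/RNNEmbed variants below all honour the same contract inherited from BaseEmbed: take an (n x b x d) batch of subtoken embeddings and return one vector of size vocab.token_embed_size per token. They differ only in how they reduce the subtoken axis; for CNNEmbed that reduction is a max-pool, which the following numpy toy (made-up sizes, purely illustrative) mirrors:

    # Mirrors hidden = tf.reduce_max(conv, axis=1) in CNNEmbed; sizes are invented.
    import numpy as np

    n, b, h = 4, 9, 50                # 4 tokens, up to 9 subtokens each, 50 conv filters
    conv = np.random.randn(n, b, h)   # stand-in for the convolution output
    hidden = conv.max(axis=1)         # collapse the subtoken axis
    assert hidden.shape == (n, h)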
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.embeds.base_embed import BaseEmbed 26 | 27 | #*************************************************************** 28 | class MLPEmbed(BaseEmbed): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocab, **kwargs): 33 | """""" 34 | 35 | # (n x b x d) 36 | embeddings = super(MLPEmbed, self).__call__(vocab, **kwargs) 37 | # (n x b x d) -> (n x d) 38 | with tf.variable_scope('Attn'): 39 | attn = self.linear_attention(embeddings) 40 | # (n x d) -> (n x h) 41 | with tf.variable_scope('MLP'): 42 | hidden = self.MLP(attn, self.mlp_size) 43 | # (n x h) -> (n x o) 44 | linear = self.linear(hidden, vocab.token_embed_size) 45 | return linear -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/embeds/rnn_embed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
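RNNEmbed below keeps that same interface but reduces the subtoken sequence with the configured recurrent layer: it attends over the hidden states and concatenates the attention summary with the final state (splitting and re-joining the forward and backward states when rnn_func is birnn) before the output projection to token_embed_size.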
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | import parser.neural.rnn as rnn 26 | from parser.neural.models.embeds.base_embed import BaseEmbed 27 | 28 | #*************************************************************** 29 | class RNNEmbed(BaseEmbed): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, vocab, **kwargs): 34 | """""" 35 | 36 | # (n x b x d) 37 | embeddings = super(RNNEmbed, self).__call__(vocab, **kwargs) 38 | # (n x b x d) -> (n x b x h) 39 | with tf.variable_scope('RNN'): 40 | recur, state = self.RNN(embeddings, self.recur_size) 41 | if self.rnn_func == rnn.birnn: 42 | state_fw, state_bw = tf.unstack(state) 43 | state_fw = tf.split(state_fw, 2, axis=1)[0] 44 | state_bw = tf.split(state_bw, 2, axis=1)[0] 45 | state = tf.concat([state_fw, state_bw], 1) 46 | elif self.rnn_func == rnn.rnn: 47 | state = tf.split(state, 2, axis=1)[0] 48 | # (n x b x h) -> (n x h) 49 | with tf.variable_scope('MLP'): 50 | hidden = self.linear_attention(recur) 51 | # (n x h) -> (n x o) 52 | linear = self.linear(tf.concat([hidden, state], axis=1), vocab.token_embed_size) 53 | return linear 54 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/__init__.py: -------------------------------------------------------------------------------- 1 | from parsers import * 2 | from taggers import * 3 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | from parser import Parser 2 | from fish_parser import FishParser 3 | from gama_parser import GamaParser 4 | from xbar_parser import XbarParser 5 | from bin_parser import BinParser 6 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/base_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
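BaseParser below is the shared trunk of every parser variant in this directory: its __call__ concatenates the configured input vocab embeddings, stacks n_layers of RNN over them, and leaves the biaffine scoring to the subclasses; at parse time write_probs decodes the arc probabilities with the nonprojective MST routine and writes CoNLL-style columns. process_accumulators turns raw counts into the usual attachment scores, for example (counts invented):

    # Invented counts, only to show how process_accumulators converts them to percentages.
    n_tokens, arc_corr, rel_corr, corr = 1000.0, 930.0, 950.0, 910.0
    UAS = arc_corr / n_tokens * 100  # 93.0, head correct
    LS = rel_corr / n_tokens * 100   # 95.0, label correct
    LAS = corr / n_tokens * 100      # 91.0, head and label both correct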
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import codecs 24 | import numpy as np 25 | import tensorflow as tf 26 | import matplotlib.pyplot as plt 27 | 28 | from parser.misc.colors import ctext, color_pattern 29 | from parser.misc.mst import nonprojective, argmax 30 | from parser.neural.models.nn import NN 31 | 32 | #*************************************************************** 33 | class BaseParser(NN): 34 | """""" 35 | 36 | PAD = 0 37 | ROOT = 1 38 | 39 | #============================================================= 40 | def __call__(self, vocabs, moving_params=None): 41 | """""" 42 | 43 | self.moving_params = moving_params 44 | if isinstance(vocabs, dict): 45 | self.vocabs = vocabs 46 | else: 47 | self.vocabs = {vocab.name: vocab for vocab in vocabs} 48 | 49 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs] 50 | #embed = tf.concat([vocab(moving_params=self.moving_params) for vocab in input_vocabs], 2) 51 | embed = self.embed_concat(input_vocabs) 52 | for vocab in self.vocabs.values(): 53 | if vocab not in input_vocabs: 54 | vocab.generate_placeholder() 55 | placeholder = self.vocabs['words'].placeholder 56 | if len(placeholder.get_shape().as_list()) == 3: 57 | placeholder = placeholder[:,:,0] 58 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT)) 59 | self._batch_size = tf.shape(placeholder)[0] 60 | self._bucket_size = tf.shape(placeholder)[1] 61 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1) 62 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep)) 63 | 64 | top_recur = embed 65 | for i in xrange(self.n_layers): 66 | with tf.variable_scope('RNN%d' % i): 67 | top_recur, _ = self.RNN(top_recur, self.recur_size) 68 | return top_recur 69 | 70 | #============================================================= 71 | def process_accumulators(self, accumulators, time=None): 72 | """""" 73 | 74 | n_tokens, n_seqs, loss, rel_corr, arc_corr, corr, seq_corr = accumulators 75 | acc_dict = { 76 | 'Loss': loss, 77 | 'LS': rel_corr/n_tokens*100, 78 | 'UAS': arc_corr/n_tokens*100, 79 | 'LAS': corr/n_tokens*100, 80 | 'SS': seq_corr/n_seqs*100, 81 | } 82 | if time is not None: 83 | acc_dict.update({ 84 | 'Token_rate': n_tokens / time, 85 | 'Seq_rate': n_seqs / time, 86 | }) 87 | return acc_dict 88 | 89 | #============================================================= 90 | def update_history(self, history, accumulators): 91 | """""" 92 | 93 | acc_dict = self.process_accumulators(accumulators) 94 | for key, value in acc_dict.iteritems(): 95 | history[key].append(value) 96 | return history['LAS'][-1] 97 | 98 | #============================================================= 99 | def print_accuracy(self, accumulators, time, prefix='Train'): 100 | """""" 101 | 102 | acc_dict = self.process_accumulators(accumulators, time=time) 103 | strings = [] 104 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red')) 105 | strings.append(color_pattern('LS:', '{LS:5.2f}%', 'bright_cyan')) 106 | strings.append(color_pattern('UAS:', '{UAS:5.2f}%', 'bright_cyan')) 107 | strings.append(color_pattern('LAS:', '{LAS:5.2f}%', 'bright_cyan')) 108 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green')) 109 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta')) 110 | string = ctext('{0} ', 'bold') + ' | '.join(strings) 111 | print(string.format(prefix, **acc_dict)) 112 | 
return 113 | 114 | #============================================================= 115 | def plot(self, history, prefix='Train'): 116 | """""" 117 | 118 | pass 119 | 120 | #============================================================= 121 | def check(self, preds, sents, fileobj): 122 | """""" 123 | 124 | for tokens, arc_preds, rel_preds in zip(sents, preds[0], preds[1]): 125 | for token, arc_pred, rel_pred in zip(zip(*tokens), arc_preds, rel_preds): 126 | arc = self.vocabs['heads'][arc_pred] 127 | rel = self.vocabs['rels'][rel_pred] 128 | fileobj.write('\t'.join(token+(arc, rel))+'\n') 129 | fileobj.write('\n') 130 | return 131 | 132 | #============================================================= 133 | def write_probs(self, sents, output_file, probs, inv_idxs): 134 | """""" 135 | 136 | #parse_algorithm = self.parse_algorithm 137 | 138 | # Turns list of tuples of tensors into list of matrices 139 | arc_probs = [arc_prob for batch in probs for arc_prob in batch[0]] 140 | rel_probs = [rel_prob for batch in probs for rel_prob in batch[1]] 141 | tokens_to_keep = [weight for batch in probs for weight in batch[2]] 142 | tokens = [sent for batch in sents for sent in batch] 143 | 144 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f: 145 | j = 0 146 | for i in inv_idxs: 147 | sent, arc_prob, rel_prob, weights = tokens[i], arc_probs[i], rel_probs[i], tokens_to_keep[i] 148 | sent = zip(*sent) 149 | sequence_length = int(np.sum(weights))+1 150 | arc_prob = arc_prob[:sequence_length][:,:sequence_length] 151 | #arc_preds = np.argmax(arc_prob, axis=1) 152 | arc_preds = nonprojective(arc_prob) 153 | arc_preds_one_hot = np.zeros([rel_prob.shape[0], rel_prob.shape[2]]) 154 | arc_preds_one_hot[np.arange(len(arc_preds)), arc_preds] = 1. 155 | rel_preds = np.argmax(np.einsum('nrb,nb->nr', rel_prob, arc_preds_one_hot), axis=1) 156 | for token, arc_pred, rel_pred, weight in zip(sent, arc_preds[1:], rel_preds[1:], weights[1:]): 157 | token = list(token) 158 | token.insert(5, '_') 159 | token.append('_') 160 | token.append('_') 161 | token[6] = self.vocabs['heads'][arc_pred] 162 | token[7] = self.vocabs['rels'][rel_pred] 163 | f.write('\t'.join(token)+'\n') 164 | j += 1 165 | if j < len(inv_idxs): 166 | f.write('\n') 167 | return 168 | 169 | #============================================================= 170 | @property 171 | def train_keys(self): 172 | return ('n_tokens', 'n_seqs', 'loss', 'n_rel_correct', 'n_arc_correct', 'n_correct', 'n_seqs_correct') 173 | 174 | #============================================================= 175 | @property 176 | def valid_keys(self): 177 | return ('arc_preds', 'rel_preds') 178 | 179 | #============================================================= 180 | @property 181 | def parse_keys(self): 182 | return ('arc_probs', 'rel_probs', 'tokens_to_keep') 183 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/bin_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class BinParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(BinParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.p_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, p_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, p_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('p'): 45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 46 | arc_ps = self.bilinear(p_dep_mlp, p_head_mlp, 1) 47 | # (b x 1) 48 | arc_logits = -tf.nn.softplus(arc_ps) 49 | 50 | with tf.variable_scope('Arc'): 51 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 52 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 53 | # (n x b x b) 54 | arc_probs = tf.nn.softmax(arc_logits) 55 | # (n x b) 56 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 57 | # (n x b) 58 | arc_targets = self.vocabs['heads'].placeholder 59 | # (n x b) 60 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 61 | # () 62 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 63 | 64 | with tf.variable_scope('Rel'): 65 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 66 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 67 | # (n x b x r x b) 68 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 69 | # (n x b x b) 70 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 71 | # (n x b x b) -> (n x b x b x 1) 72 | one_hot = tf.expand_dims(one_hot, axis=3) 73 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 74 | select_rel_logits = tf.matmul(rel_logits, one_hot) 75 | # (n x b x r x 1) -> (n x b x r) 76 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 77 | # (n x b) 78 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 79 | # (n x b) 80 | rel_targets = self.vocabs['rels'].placeholder 81 | # (n x b) 82 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 83 | # () 84 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 85 | 86 | n_arc_correct = tf.reduce_sum(arc_correct) 87 | n_rel_correct = tf.reduce_sum(rel_correct) 88 | correct = arc_correct * rel_correct 89 
| n_correct = tf.reduce_sum(correct) 90 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 91 | loss = arc_loss + rel_loss 92 | 93 | outputs = { 94 | 'arc_logits': arc_logits, 95 | 'arc_probs': arc_probs, 96 | 'arc_preds': arc_preds, 97 | 'arc_targets': arc_targets, 98 | 'arc_correct': arc_correct, 99 | 'arc_loss': arc_loss, 100 | 'n_arc_correct': n_arc_correct, 101 | 102 | 'rel_logits': rel_logits, 103 | 'rel_probs': rel_probs, 104 | 'rel_preds': rel_preds, 105 | 'rel_targets': rel_targets, 106 | 'rel_correct': rel_correct, 107 | 'rel_loss': rel_loss, 108 | 'n_rel_correct': n_rel_correct, 109 | 110 | 'n_tokens': self.n_tokens, 111 | 'n_seqs': self.batch_size, 112 | 'tokens_to_keep': self.tokens_to_keep, 113 | 'n_correct': n_correct, 114 | 'n_seqs_correct': n_seqs_correct, 115 | 'loss': loss 116 | } 117 | 118 | return outputs 119 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/fish_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
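The extra 'Lambda' term that FishParser below adds to the plain bilinear arc score reads like a learned Poisson prior over the linear distance k = |i - j| between a word and a candidate head: taking lam (the bilinear output plus 5) as the log-rate, k*lam - exp(lam) - lgamma(k+1) is exactly the Poisson log-pmf. A numeric check of that identity with invented values (this is my reading of the code, not a claim from the source):

    # Identity check only; k and lam are arbitrary.
    import numpy as np
    from scipy.special import gammaln
    from scipy.stats import poisson

    k, lam = 3, 1.2
    by_hand = k * lam - np.exp(lam) - gammaln(k + 1)
    assert np.isclose(by_hand, poisson.logpmf(k, np.exp(lam)))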
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class FishParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(FishParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.lambda_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, lambda_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.lambda_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, lambda_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.lambda_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('Lambda'): 45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 46 | arc_lambdas = self.bilinear(lambda_dep_mlp, lambda_head_mlp, 1) + 5 47 | # (b x 1) 48 | i_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 1), 0) 49 | # (1 x b) 50 | j_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 0), 0) 51 | # (b x 1) - (1 x b) -> (b x b) 52 | k_mat = tf.abs(i_mat - j_mat) 53 | # (b x 1) 54 | n_mat = tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1) - 1 - i_mat 55 | # (b x b) * (n x b x b) - (n x b x b) - (b x b) -> (n x b x b) 56 | arc_logits = tf.to_float(k_mat)*arc_lambdas - tf.exp(arc_lambdas) - tf.lgamma(tf.to_float(k_mat+1)) 57 | 58 | with tf.variable_scope('Arc'): 59 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 60 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 61 | # (n x b x b) 62 | arc_probs = tf.nn.softmax(arc_logits) 63 | # (n x b) 64 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 65 | # (n x b) 66 | arc_targets = self.vocabs['heads'].placeholder 67 | # (n x b) 68 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 69 | # () 70 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 71 | 72 | with tf.variable_scope('Rel'): 73 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 74 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 75 | # (n x b x r x b) 76 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 77 | # (n x b x b) 78 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 79 | # (n x b x b) -> (n x b x b x 1) 80 | one_hot = tf.expand_dims(one_hot, axis=3) 81 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 82 | select_rel_logits = tf.matmul(rel_logits, one_hot) 83 | # (n x b x r x 1) -> (n x b x r) 84 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 85 | # (n x b) 86 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 87 | # (n x b) 88 | rel_targets = self.vocabs['rels'].placeholder 89 | # (n x b) 90 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 91 | # () 92 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 93 | 94 | n_arc_correct = tf.reduce_sum(arc_correct) 95 | n_rel_correct 
= tf.reduce_sum(rel_correct) 96 | correct = arc_correct * rel_correct 97 | n_correct = tf.reduce_sum(correct) 98 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 99 | loss = arc_loss + rel_loss 100 | 101 | outputs = { 102 | 'arc_logits': arc_logits, 103 | 'arc_lambdas': arc_lambdas, 104 | 'arc_probs': arc_probs, 105 | 'arc_preds': arc_preds, 106 | 'arc_targets': arc_targets, 107 | 'arc_correct': arc_correct, 108 | 'arc_loss': arc_loss, 109 | 'n_arc_correct': n_arc_correct, 110 | 111 | 'rel_logits': rel_logits, 112 | 'rel_probs': rel_probs, 113 | 'rel_preds': rel_preds, 114 | 'rel_targets': rel_targets, 115 | 'rel_correct': rel_correct, 116 | 'rel_loss': rel_loss, 117 | 'n_rel_correct': n_rel_correct, 118 | 119 | 'n_tokens': self.n_tokens, 120 | 'n_seqs': self.batch_size, 121 | 'tokens_to_keep': self.tokens_to_keep, 122 | 'n_correct': n_correct, 123 | 'n_seqs_correct': n_seqs_correct, 124 | 'loss': loss 125 | } 126 | 127 | return outputs 128 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/gama_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
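GamaParser below plays the same game with a Gaussian instead of a Poisson: the squared bilinear outputs arc_mus and arc_sigmas (the latter floored at 0.1) act as the mean and, on my reading, the variance of the head-dependent distance k, and the 'dist' term is the corresponding Gaussian log-density. Another identity check with invented numbers:

    # Identity check only; distance k, mean mu and variance sig2 are invented.
    import numpy as np
    from scipy.stats import norm

    k, mu, sig2 = 4.0, 2.5, 1.3
    by_hand = -0.5 * np.log(2 * np.pi * sig2) - 0.5 * (k - mu) ** 2 / sig2
    assert np.isclose(by_hand, norm.logpdf(k, loc=mu, scale=np.sqrt(sig2)))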
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class GamaParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(GamaParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + 2*self.p_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, mu_dep_mlp, sigma_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size, self.p_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, mu_head_mlp, sigma_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size, self.p_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('dist'): 45 | with tf.variable_scope('mu'): 46 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 47 | arc_mus = self.bilinear(mu_dep_mlp, mu_head_mlp, 1)**2 48 | with tf.variable_scope('sigma'): 49 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 50 | arc_sigmas = self.bilinear(sigma_dep_mlp, sigma_head_mlp, 1, initializer=None)**2 + .1 51 | # (b x 1) 52 | i_mat = tf.expand_dims(tf.range(self.bucket_size), 1) 53 | # (1 x b) 54 | j_mat = tf.expand_dims(tf.range(self.bucket_size), 0) 55 | # (b x 1) - (1 x b) -> (b x b) 56 | k_mat = tf.to_float(tf.abs(i_mat - j_mat)) 57 | 58 | arc_logits = -.5*tf.log(2*np.pi * arc_sigmas) - .5*(k_mat-arc_mus)**2 / arc_sigmas 59 | #arc_rs += tf.to_float(k_mat)#tf.to_float(tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1)) 60 | # (b x 1) 61 | #n_mat = tf.expand_dims(self.sequence_lengths, 1) - 1 - i_mat 62 | # (b x b) * (n x b x b) - (n x b x b) - (b x b) -> (n x b x b) 63 | #arc_logits = (tf.lgamma(arc_rs+1) - tf.lgamma(k_mat) - tf.lgamma(arc_rs-k_mat+2) + 64 | # k_mat * tf.log(arc_ps) + (arc_rs-k_mat+1)*tf.log(1-arc_ps) ) 65 | with tf.variable_scope('Arc'): 66 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 67 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 68 | # (n x b x b) 69 | arc_probs = tf.nn.softmax(arc_logits) 70 | # (n x b) 71 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 72 | # (n x b) 73 | arc_targets = self.vocabs['heads'].placeholder 74 | # (n x b) 75 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 76 | # () 77 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 78 | 79 | with tf.variable_scope('Rel'): 80 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 81 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 82 | # (n x b x r x b) 83 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 84 | # (n x b x b) 85 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 86 | # (n x b x b) -> (n x b x b x 1) 87 | one_hot = tf.expand_dims(one_hot, axis=3) 88 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 89 | select_rel_logits = tf.matmul(rel_logits, one_hot) 90 | # (n x b x r x 1) -> (n x b x r) 91 | select_rel_logits = 
tf.squeeze(select_rel_logits, axis=3) 92 | # (n x b) 93 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 94 | # (n x b) 95 | rel_targets = self.vocabs['rels'].placeholder 96 | # (n x b) 97 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 98 | # () 99 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 100 | 101 | n_arc_correct = tf.reduce_sum(arc_correct) 102 | n_rel_correct = tf.reduce_sum(rel_correct) 103 | correct = arc_correct * rel_correct 104 | n_correct = tf.reduce_sum(correct) 105 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 106 | loss = arc_loss + rel_loss 107 | 108 | outputs = { 109 | 'arc_logits': arc_logits, 110 | 'arc_mus': arc_mus, 111 | 'arc_sigmas': arc_sigmas, 112 | 'arc_probs': arc_probs, 113 | 'arc_preds': arc_preds, 114 | 'arc_targets': arc_targets, 115 | 'arc_correct': arc_correct, 116 | 'arc_loss': arc_loss, 117 | 'n_arc_correct': n_arc_correct, 118 | 119 | 'rel_logits': rel_logits, 120 | 'rel_probs': rel_probs, 121 | 'rel_preds': rel_preds, 122 | 'rel_targets': rel_targets, 123 | 'rel_correct': rel_correct, 124 | 'rel_loss': rel_loss, 125 | 'n_rel_correct': n_rel_correct, 126 | 127 | 'n_tokens': self.n_tokens, 128 | 'n_seqs': self.batch_size, 129 | 'tokens_to_keep': self.tokens_to_keep, 130 | 'n_correct': n_correct, 131 | 'n_seqs_correct': n_seqs_correct, 132 | 'loss': loss 133 | } 134 | 135 | return outputs 136 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
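Parser below is the plain biaffine variant: bilinear arc scores with no extra distance term, then relation scores evaluated only at the selected head. That 'Rel' selection step (one-hot, matmul, squeeze) is identical in every variant above; here is a numpy rendering of it for a single sentence, with invented sizes:

    # Mirrors the one_hot / tf.matmul / tf.squeeze selection in the 'Rel' scope; sizes invented.
    import numpy as np

    b, r = 6, 40                                  # bucket size, number of relation labels
    rel_logits = np.random.randn(b, r, b)         # scores indexed (dependent, relation, head)
    arc_preds = np.random.randint(0, b, size=b)   # stand-in for argmax over the arc scores
    one_hot = np.eye(b)[arc_preds]                # (b x b)
    select = np.einsum('drh,dh->dr', rel_logits, one_hot)
    rel_preds = select.argmax(axis=1)
    assert np.allclose(select[0], rel_logits[0, :, arc_preds[0]])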
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class Parser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(Parser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('Arc'): 45 | # (n x b x d) * (d x 1 x d) * (n x b x d).T -> (n x b x b) 46 | arc_logits = self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 47 | # (n x b x b) 48 | arc_probs = tf.nn.softmax(arc_logits) 49 | # (n x b) 50 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 51 | # (n x b) 52 | arc_targets = self.vocabs['heads'].placeholder 53 | # (n x b) 54 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 55 | # () 56 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 57 | 58 | with tf.variable_scope('Rel'): 59 | # (n x b x d) * (d x r x d) * (n x b x d).T -> (n x b x r x b) 60 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 61 | # (n x b x r x b) 62 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 63 | # (n x b x b) 64 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 65 | # (n x b x b) -> (n x b x b x 1) 66 | one_hot = tf.expand_dims(one_hot, axis=3) 67 | # (n x b x r x b) * (n x b x b x 1) -> (n x b x r x 1) 68 | select_rel_logits = tf.matmul(rel_logits, one_hot) 69 | # (n x b x r x 1) -> (n x b x r) 70 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 71 | # (n x b) 72 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 73 | # (n x b) 74 | rel_targets = self.vocabs['rels'].placeholder 75 | # (n x b) 76 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 77 | # () 78 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 79 | 80 | n_arc_correct = tf.reduce_sum(arc_correct) 81 | n_rel_correct = tf.reduce_sum(rel_correct) 82 | correct = arc_correct * rel_correct 83 | n_correct = tf.reduce_sum(correct) 84 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 85 | loss = arc_loss + rel_loss 86 | 87 | outputs = { 88 | 'arc_logits': arc_logits, 89 | 'arc_probs': arc_probs, 90 | 'arc_preds': arc_preds, 91 | 'arc_targets': arc_targets, 92 | 'arc_correct': arc_correct, 93 | 'arc_loss': arc_loss, 94 | 'n_arc_correct': n_arc_correct, 95 | 96 | 'rel_logits': rel_logits, 97 | 'rel_probs': rel_probs, 98 | 'rel_preds': rel_preds, 99 | 'rel_targets': rel_targets, 100 | 'rel_correct': rel_correct, 101 | 'rel_loss': rel_loss, 102 | 'n_rel_correct': n_rel_correct, 103 | 104 | 'n_tokens': self.n_tokens, 105 | 'n_seqs': self.batch_size, 106 | 
'tokens_to_keep': self.tokens_to_keep, 107 | 'n_correct': n_correct, 108 | 'n_seqs_correct': n_seqs_correct, 109 | 'loss': loss 110 | } 111 | 112 | return outputs 113 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/xbar_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser 26 | 27 | #*************************************************************** 28 | class XbarParser(BaseParser): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(XbarParser, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.p_mlp_size, 40 | n_splits=2) 41 | arc_dep_mlp, rel_dep_mlp, p_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 42 | arc_head_mlp, rel_head_mlp, p_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2) 43 | 44 | with tf.variable_scope('p'): 45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 46 | arc_ps = self.bilinear(p_dep_mlp, p_head_mlp, 1, add_bias2=False) 47 | # (b x 1) 48 | i_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 1), 0) 49 | # (1 x b) 50 | j_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 0), 0) 51 | # (b x 1) > (1 x b) -> (b x b) 52 | k_mat = tf.tile(j_mat > i_mat, [self.batch_size,1,1]) 53 | # (b x 1) 54 | n_mat = tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1) - 1 - i_mat 55 | # (n x b x b) + (b x b) * (n x b x b) + (b x b) * (n x b x b) -> (n x b x b) 56 | arc_logits = -tf.nn.softplus(tf.where(k_mat, arc_ps, -arc_ps)) 57 | # (n x b x b) - (b x b) * (b x b) -> (n x b x b) 58 | 59 | with tf.variable_scope('Arc'): 60 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b) 61 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False) 62 | # (n x b x b) 63 | arc_probs = tf.nn.softmax(arc_logits) 64 | # (n x b) 65 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1)) 66 | # (n x b) 67 | arc_targets = self.vocabs['heads'].placeholder 68 | # (n x b) 69 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep 70 | # () 71 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep) 72 | 
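As in parser.py above, the Rel block that follows first scores every relation label for every (dependent, head) pair, giving an (n x b x r x b) tensor, and then picks out, for each dependent, the label scores at a single head: the gold head during training, the predicted head at decode time. The selection is a batched matmul against a one-hot head vector; a per-sentence NumPy sketch of the same trick:

import numpy as np

def select_rel_logits(rel_logits, heads):
    # rel_logits: (b x r x b) label scores for each (dependent, label, head) triple
    # heads: (b,) chosen head index for every dependent
    b = rel_logits.shape[0]
    one_hot = np.eye(b)[heads]                                   # (b x b)
    # (b x r x b) . (b x b x 1) -> (b x r x 1) -> (b x r)
    return np.matmul(rel_logits, one_hot[:, :, None]).squeeze(-1)

rel_logits = np.random.randn(4, 3, 4)                            # 4 tokens, 3 labels
heads = np.array([0, 0, 1, 1])
rel_preds = select_rel_logits(rel_logits, heads).argmax(axis=-1)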
73 | with tf.variable_scope('Rel'): 74 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b) 75 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels'])) 76 | # (n x b x r x b) 77 | rel_probs = tf.nn.softmax(rel_logits, dim=2) 78 | # (n x b x b) 79 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size) 80 | # (n x b x b) -> (n x b x b x 1) 81 | one_hot = tf.expand_dims(one_hot, axis=3) 82 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1) 83 | select_rel_logits = tf.matmul(rel_logits, one_hot) 84 | # (n x b x r x 1) -> (n x b x r) 85 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3) 86 | # (n x b) 87 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1)) 88 | # (n x b) 89 | rel_targets = self.vocabs['rels'].placeholder 90 | # (n x b) 91 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep 92 | # () 93 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep) 94 | 95 | n_arc_correct = tf.reduce_sum(arc_correct) 96 | n_rel_correct = tf.reduce_sum(rel_correct) 97 | correct = arc_correct * rel_correct 98 | n_correct = tf.reduce_sum(correct) 99 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 100 | loss = arc_loss + rel_loss 101 | 102 | outputs = { 103 | 'arc_logits': arc_logits, 104 | 'arc_probs': arc_probs, 105 | 'arc_preds': arc_preds, 106 | 'arc_targets': arc_targets, 107 | 'arc_correct': arc_correct, 108 | 'arc_loss': arc_loss, 109 | 'n_arc_correct': n_arc_correct, 110 | 111 | 'rel_logits': rel_logits, 112 | 'rel_probs': rel_probs, 113 | 'rel_preds': rel_preds, 114 | 'rel_targets': rel_targets, 115 | 'rel_correct': rel_correct, 116 | 'rel_loss': rel_loss, 117 | 'n_rel_correct': n_rel_correct, 118 | 119 | 'n_tokens': self.n_tokens, 120 | 'n_seqs': self.batch_size, 121 | 'tokens_to_keep': self.tokens_to_keep, 122 | 'n_correct': n_correct, 123 | 'n_seqs_correct': n_seqs_correct, 124 | 'loss': loss 125 | } 126 | 127 | return outputs 128 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/__init__.py: -------------------------------------------------------------------------------- 1 | from tagger import Tagger 2 | from xtagger import XTagger 3 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/base_tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
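base_tagger.py, whose body follows, derives all of its bookkeeping from the word-id placeholder: ids greater than ROOT mark the real tokens to score (tokens_to_keep), ids greater than PAD count toward sequence_lengths, and n_tokens is the number of kept positions. A tiny NumPy illustration of those three lines (the ids are invented; only PAD = 0 and ROOT = 1 match the class constants):

import numpy as np

PAD, ROOT = 0, 1
words = np.array([[1, 6, 9, 3, 0, 0]])                 # ROOT, three tokens, two pads
tokens_to_keep = (words > ROOT).astype(np.float32)     # [[0, 1, 1, 1, 0, 0]]
sequence_lengths = (words > PAD).sum(axis=1)           # [4]  (ROOT plus three tokens)
n_tokens = int(tokens_to_keep.sum())                   # 3

sequence_lengths counts the ROOT token while tokens_to_keep does not, which is why the sequence-accuracy checks in the model heads compare the number of correct tokens against sequence_lengths - 1.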
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import codecs 24 | import numpy as np 25 | import tensorflow as tf 26 | import matplotlib.pyplot as plt 27 | 28 | from parser.misc.colors import ctext, color_pattern 29 | from parser.neural.models.nn import NN 30 | 31 | #*************************************************************** 32 | class BaseTagger(NN): 33 | """""" 34 | 35 | PAD = 0 36 | ROOT = 1 37 | 38 | #============================================================= 39 | def __call__(self, vocabs, moving_params=None): 40 | """""" 41 | 42 | self.moving_params = moving_params 43 | if isinstance(vocabs, dict): 44 | self.vocabs = vocabs 45 | else: 46 | self.vocabs = {vocab.name: vocab for vocab in vocabs} 47 | 48 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs] 49 | embed = self.embed_concat(input_vocabs) 50 | for vocab in self.vocabs.values(): 51 | if vocab not in input_vocabs: 52 | vocab.generate_placeholder() 53 | placeholder = self.vocabs['words'].placeholder 54 | if len(placeholder.get_shape().as_list()) == 3: 55 | placeholder = placeholder[:,:,0] 56 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT)) 57 | self._batch_size = tf.shape(placeholder)[0] 58 | self._bucket_size = tf.shape(placeholder)[1] 59 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1) 60 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep)) 61 | 62 | top_recur = embed 63 | for i in xrange(self.n_layers): 64 | with tf.variable_scope('RNN%d' % i): 65 | top_recur, _ = self.RNN(top_recur, self.recur_size) 66 | return top_recur 67 | 68 | #============================================================= 69 | def process_accumulators(self, accumulators, time=None): 70 | """""" 71 | 72 | n_tokens, n_seqs, loss, corr, seq_corr = accumulators 73 | acc_dict = { 74 | 'Loss': loss, 75 | 'TS': corr/n_tokens*100, 76 | 'SS': seq_corr/n_seqs*100, 77 | } 78 | if time is not None: 79 | acc_dict.update({ 80 | 'Token_rate': n_tokens / time, 81 | 'Seq_rate': n_seqs / time, 82 | }) 83 | return acc_dict 84 | 85 | #============================================================= 86 | def update_history(self, history, accumulators): 87 | """""" 88 | 89 | acc_dict = self.process_accumulators(accumulators) 90 | for key, value in acc_dict.iteritems(): 91 | history[key].append(value) 92 | return history['TS'][-1] 93 | 94 | #============================================================= 95 | def print_accuracy(self, accumulators, time, prefix='Train'): 96 | """""" 97 | 98 | acc_dict = self.process_accumulators(accumulators, time=time) 99 | strings = [] 100 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red')) 101 | strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan')) 102 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green')) 103 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta')) 104 | string = ctext('{0} ', 'bold') + ' | '.join(strings) 105 | print(string.format(prefix, **acc_dict)) 106 | return 107 | 108 | #============================================================= 109 | def plot(self, history, prefix='Train'): 110 | """""" 111 | 112 | pass 113 | 114 | #============================================================= 115 | def check(self, preds, sents, fileobj): 116 | """""" 117 | 118 | for tokens, preds in zip(sents, preds[0]): 119 | for token, pred in zip(zip(*tokens), 
preds): 120 | tag = self.vocabs['tags'][pred] 121 | fileobj.write('\t'.join(token+(tag, ))+'\n') 122 | fileobj.write('\n') 123 | return 124 | 125 | #============================================================= 126 | def write_probs(self, sents, output_file, probs, inv_idxs): 127 | """""" 128 | 129 | # Turns list of tuples of tensors into list of matrices 130 | tag_probs = [tag_prob for batch in probs for tag_prob in batch[0]] 131 | tokens_to_keep = [weight for batch in probs for weight in batch[1]] 132 | tokens = [sent for batch in sents for sent in batch] 133 | 134 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f: 135 | for i in inv_idxs: 136 | sent, tag_prob, weights = tokens[i], tag_probs[i], tokens_to_keep[i] 137 | sent = zip(*sent) 138 | tag_preds = np.argmax(tag_prob, axis=1) 139 | for token, tag_pred, weight in zip(sent, tag_preds[1:], weights[1:]): 140 | token = list(token) 141 | token.insert(5, '_') 142 | token.append('_') 143 | token.append('_') 144 | token[3] = self.vocabs['tags'][tag_pred] 145 | f.write('\t'.join(token)+'\n') 146 | f.write('\n') 147 | return 148 | 149 | #============================================================= 150 | @property 151 | def train_keys(self): 152 | return ('n_tokens', 'n_seqs', 'loss', 'n_correct', 'n_seqs_correct') 153 | 154 | #============================================================= 155 | @property 156 | def valid_keys(self): 157 | return ('preds', ) 158 | 159 | #============================================================= 160 | @property 161 | def parse_keys(self): 162 | return ('probs', 'tokens_to_keep') 163 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/base_xtagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import re 23 | import codecs 24 | import numpy as np 25 | import tensorflow as tf 26 | import matplotlib.pyplot as plt 27 | 28 | from parser.misc.colors import ctext, color_pattern 29 | from parser.neural.models.nn import NN 30 | 31 | #*************************************************************** 32 | class BaseXTagger(NN): 33 | """""" 34 | 35 | PAD = 0 36 | ROOT = 1 37 | 38 | #============================================================= 39 | def __call__(self, vocabs, moving_params=None): 40 | """""" 41 | 42 | self.moving_params = moving_params 43 | if isinstance(vocabs, dict): 44 | self.vocabs = vocabs 45 | else: 46 | self.vocabs = {vocab.name: vocab for vocab in vocabs} 47 | 48 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs] 49 | embed = self.embed_concat(input_vocabs) 50 | for vocab in self.vocabs.values(): 51 | if vocab not in input_vocabs: 52 | vocab.generate_placeholder() 53 | placeholder = self.vocabs['words'].placeholder 54 | if len(placeholder.get_shape().as_list()) == 3: 55 | placeholder = placeholder[:,:,0] 56 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT)) 57 | self._batch_size = tf.shape(placeholder)[0] 58 | self._bucket_size = tf.shape(placeholder)[1] 59 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1) 60 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep)) 61 | 62 | top_recur = embed 63 | for i in xrange(self.n_layers): 64 | with tf.variable_scope('RNN%d' % i): 65 | top_recur, _ = self.RNN(top_recur, self.recur_size) 66 | return top_recur 67 | 68 | #============================================================= 69 | def process_accumulators(self, accumulators, time=None): 70 | """""" 71 | 72 | n_tokens, n_seqs, loss, corr, xcorr, seq_corr = accumulators 73 | acc_dict = { 74 | 'Loss': loss, 75 | 'TS': corr/n_tokens*100, 76 | 'XTS': xcorr/n_tokens*100, 77 | 'SS': seq_corr/n_seqs*100, 78 | } 79 | if time is not None: 80 | acc_dict.update({ 81 | 'Token_rate': n_tokens / time, 82 | 'Seq_rate': n_seqs / time, 83 | }) 84 | return acc_dict 85 | 86 | #============================================================= 87 | def update_history(self, history, accumulators): 88 | """""" 89 | 90 | acc_dict = self.process_accumulators(accumulators) 91 | for key, value in acc_dict.iteritems(): 92 | history[key].append(value) 93 | return history['TS'][-1] 94 | 95 | #============================================================= 96 | def print_accuracy(self, accumulators, time, prefix='Train'): 97 | """""" 98 | 99 | acc_dict = self.process_accumulators(accumulators, time=time) 100 | strings = [] 101 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red')) 102 | strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan')) 103 | strings.append(color_pattern('XTS:', '{XTS:5.2f}%', 'bright_cyan')) 104 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green')) 105 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta')) 106 | string = ctext('{0} ', 'bold') + ' | '.join(strings) 107 | print(string.format(prefix, **acc_dict)) 108 | return 109 | 110 | #============================================================= 111 | def plot(self, history, prefix='Train'): 112 | """""" 113 | 114 | pass 115 | 116 | #============================================================= 117 | def check(self, preds, sents, 
fileobj): 118 | """""" 119 | 120 | for tokens, preds, xpreds in zip(sents, preds[0], preds[1]): 121 | for token, pred, xpred in zip(zip(*tokens), preds, xpreds): 122 | tag = self.vocabs['tags'][pred] 123 | xtag = self.vocabs['xtags'][xpred] 124 | fileobj.write('\t'.join(token+(tag, xtag))+'\n') 125 | fileobj.write('\n') 126 | return 127 | 128 | #============================================================= 129 | def write_probs(self, sents, output_file, probs, inv_idxs): 130 | """""" 131 | 132 | # Turns list of tuples of tensors into list of matrices 133 | tag_probs = [tag_prob for batch in probs for tag_prob in batch[0]] 134 | xtag_probs = [xtag_prob for batch in probs for xtag_prob in batch[1]] 135 | tokens_to_keep = [weight for batch in probs for weight in batch[2]] 136 | tokens = [sent for batch in sents for sent in batch] 137 | 138 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f: 139 | for i in inv_idxs: 140 | sent, tag_prob, xtag_prob, weights = tokens[i], tag_probs[i], xtag_probs[i], tokens_to_keep[i] 141 | sent = zip(*sent) 142 | tag_preds = np.argmax(tag_prob, axis=1) 143 | xtag_preds = np.argmax(xtag_prob, axis=1) 144 | for token, tag_pred, xtag_pred, weight in zip(sent, tag_preds[1:], xtag_preds[1:], weights[1:]): 145 | token = list(token) 146 | token.insert(5, '_') 147 | token.append('_') 148 | token.append('_') 149 | token[3] = self.vocabs['tags'][tag_pred] 150 | token[4] = self.vocabs['xtags'][xtag_pred] 151 | f.write('\t'.join(token)+'\n') 152 | f.write('\n') 153 | return 154 | 155 | #============================================================= 156 | @property 157 | def train_keys(self): 158 | return ('n_tokens', 'n_seqs', 'loss', 'n_tag_correct', 'n_xtag_correct', 'n_seqs_correct') 159 | 160 | #============================================================= 161 | @property 162 | def valid_keys(self): 163 | return ('tag_preds', 'xtag_preds') 164 | 165 | #============================================================= 166 | @property 167 | def parse_keys(self): 168 | return ('tag_probs', 'xtag_probs', 'tokens_to_keep') 169 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
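tagger.py below is the simplest head: one MLP, a linear layer over the tag vocabulary, a cross-entropy loss weighted by tokens_to_keep, and correctness indicators masked the same way. Its accumulators become the TS (token score) and SS (sentence score) percentages printed by the base class; a small worked example with invented numbers:

import numpy as np

correct = np.array([[0, 1, 1, 1, 0, 0],                # masked per-token correctness
                    [0, 1, 0, 1, 1, 0]])               # (ROOT and pad positions are 0)
sequence_lengths = np.array([4, 5])                    # ROOT plus real tokens
n_tokens, n_seqs = 7, 2
n_correct = correct.sum()                                              # 6
n_seqs_correct = (correct.sum(axis=1) == sequence_lengths - 1).sum()   # 1
TS = n_correct / float(n_tokens) * 100                                 # ~85.7
SS = n_seqs_correct / float(n_seqs) * 100                              # 50.0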
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.taggers.base_tagger import BaseTagger 26 | 27 | #*************************************************************** 28 | class Tagger(BaseTagger): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(Tagger, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | mlp = self.MLP(top_recur, self.mlp_size) 40 | 41 | with tf.variable_scope('Tag'): 42 | logits = self.linear(mlp, len(self.vocabs['tags'])) 43 | probs = tf.nn.softmax(logits) 44 | preds = tf.to_int32(tf.argmax(logits, axis=-1)) 45 | targets = self.vocabs['tags'].placeholder 46 | correct = tf.to_int32(tf.equal(preds, targets))*int_tokens_to_keep 47 | loss = tf.losses.sparse_softmax_cross_entropy(targets, logits, self.tokens_to_keep) 48 | 49 | 50 | n_correct = tf.reduce_sum(correct) 51 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 52 | 53 | outputs = { 54 | 'logits': logits, 55 | 'probs': probs, 56 | 'preds': preds, 57 | 'targets': targets, 58 | 'correct': correct, 59 | 'loss': loss, 60 | 'n_correct': n_correct, 61 | 62 | 'n_tokens': self.n_tokens, 63 | 'n_seqs': self.batch_size, 64 | 'tokens_to_keep': self.tokens_to_keep, 65 | 'n_correct': n_correct, 66 | 'n_seqs_correct': n_seqs_correct, 67 | 'loss': loss 68 | } 69 | 70 | return outputs 71 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/xtagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models.nlp.taggers.base_xtagger import BaseXTagger 26 | 27 | #*************************************************************** 28 | class XTagger(BaseXTagger): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocabs, moving_params=None): 33 | """""" 34 | 35 | top_recur = super(XTagger, self).__call__(vocabs, moving_params=moving_params) 36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep) 37 | 38 | with tf.variable_scope('MLP'): 39 | tag_mlp, xtag_mlp = self.MLP(top_recur, self.mlp_size, n_splits=2) 40 | 41 | with tf.variable_scope('Tag'): 42 | tag_logits = self.linear(tag_mlp, len(self.vocabs['tags'])) 43 | tag_probs = tf.nn.softmax(tag_logits) 44 | tag_preds = tf.to_int32(tf.argmax(tag_logits, axis=-1)) 45 | tag_targets = self.vocabs['tags'].placeholder 46 | tag_correct = tf.to_int32(tf.equal(tag_preds, tag_targets))*int_tokens_to_keep 47 | tag_loss = tf.losses.sparse_softmax_cross_entropy(tag_targets, tag_logits, self.tokens_to_keep) 48 | 49 | with tf.variable_scope('XTag'): 50 | xtag_logits = self.linear(xtag_mlp, len(self.vocabs['xtags'])) 51 | xtag_probs = tf.nn.softmax(xtag_logits) 52 | xtag_preds = tf.to_int32(tf.argmax(xtag_logits, axis=-1)) 53 | xtag_targets = self.vocabs['xtags'].placeholder 54 | xtag_correct = tf.to_int32(tf.equal(xtag_preds, xtag_targets))*int_tokens_to_keep 55 | xtag_loss = tf.losses.sparse_softmax_cross_entropy(xtag_targets, xtag_logits, self.tokens_to_keep) 56 | 57 | correct = tag_correct * xtag_correct 58 | n_correct = tf.reduce_sum(correct) 59 | n_tag_correct = tf.reduce_sum(tag_correct) 60 | n_xtag_correct = tf.reduce_sum(xtag_correct) 61 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1))) 62 | loss = tag_loss + xtag_loss 63 | 64 | outputs = { 65 | 'tag_logits': tag_logits, 66 | 'tag_probs': tag_probs, 67 | 'tag_preds': tag_preds, 68 | 'tag_targets': tag_targets, 69 | 'tag_correct': tag_correct, 70 | 'tag_loss': tag_loss, 71 | 'n_tag_correct': n_tag_correct, 72 | 73 | 'xtag_logits': xtag_logits, 74 | 'xtag_probs': xtag_probs, 75 | 'xtag_preds': xtag_preds, 76 | 'xtag_targets': xtag_targets, 77 | 'xtag_correct': xtag_correct, 78 | 'xtag_loss': xtag_loss, 79 | 'n_xtag_correct': n_xtag_correct, 80 | 81 | 'n_tokens': self.n_tokens, 82 | 'n_seqs': self.batch_size, 83 | 'tokens_to_keep': self.tokens_to_keep, 84 | 'n_correct': n_correct, 85 | 'n_seqs_correct': n_seqs_correct, 86 | 'loss': loss 87 | } 88 | 89 | return outputs 90 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from sgd_optimizer import SGDOptimizer 19 | from radam_optimizer import RadamOptimizer -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/optimizers/radam_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.optimizers.base_optimizer import BaseOptimizer 25 | 26 | #*************************************************************** 27 | class RadamOptimizer(BaseOptimizer): 28 | """""" 29 | 30 | #============================================================= 31 | def _init_acc(self, var_list, grads): 32 | """""" 33 | 34 | super(RadamOptimizer, self)._init_acc(var_list, grads) 35 | for x_tm1, g_t in zip(var_list, grads): 36 | if self.mu > 0: 37 | self.get_accumulator(x_tm1, 'm') 38 | shape = self.get_variable_shape(x_tm1) 39 | if isinstance(g_t, tf.Tensor): 40 | self.get_accumulator(x_tm1, 'm/tm1', []) 41 | else: 42 | self.get_accumulator(x_tm1, 'm/tm1', [shape[0]]+[1]*(len(shape)-1)) 43 | if self.nu > 0: 44 | self.get_accumulator(x_tm1, 'v') 45 | shape = self.get_variable_shape(x_tm1) 46 | if isinstance(g_t, tf.Tensor): 47 | self.get_accumulator(x_tm1, 'v/tm1', []) 48 | else: 49 | self.get_accumulator(x_tm1, 'v/tm1', [shape[0]]+[1]*(len(shape)-1)) 50 | return 51 | 52 | #============================================================= 53 | def _apply_dense(self, cache): 54 | """""" 55 | 56 | x_tm1, g_t = cache['x_tm1'], cache['g_t'] 57 | updates = cache['updates'] 58 | 59 | if self.mu > 0: 60 | m_t, t_m = self._dense_moving_average(x_tm1, g_t, 'm', beta=self.mu) 61 | m_bar_t = (1-self.gamma) * m_t + self.gamma * g_t 62 | updates.extend([m_t, t_m]) 63 | else: 64 | m_bar_t = g_t 65 | 66 | if self.nu > 0: 67 | v_t, t_v = self._dense_moving_average(x_tm1, g_t**2, 'v', beta=self.nu) 68 | v_bar_t = tf.sqrt(v_t + self.epsilon) 69 | updates.extend([v_t, t_v]) 70 | else: 71 | v_bar_t = 1 72 | 73 | s_t = self.learning_rate * m_bar_t / v_bar_t 74 | cache['s_t'] = tf.where(tf.is_finite(s_t), s_t, tf.zeros_like(s_t)) 75 | return cache 76 | 77 | #============================================================= 78 | def _apply_sparse(self, cache): 79 | """""" 80 | 81 | x_tm1, g_t, idxs = cache['x_tm1'], cache['g_t'], cache['idxs'] 82 | idxs, idxs_ = tf.unique(idxs) 83 | g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs)) 84 | updates = cache['updates'] 85 | 86 | if self.mu > 0: 87 | m_t, t_m = self._sparse_moving_average(x_tm1, idxs, g_t_, 'm', beta=self.mu) 88 | m_t_ = tf.gather(m_t, idxs) 89 | m_bar_t_ = 
(1-self.gamma) * m_t_ + self.gamma * g_t_ 90 | updates.extend([m_t, t_m]) 91 | else: 92 | m_bar_t_ = g_t_ 93 | 94 | if self.nu > 0: 95 | v_t, t_v = self._sparse_moving_average(x_tm1, idxs, g_t_**2, 'v', beta=self.nu) 96 | v_t_ = tf.gather(v_t, idxs) 97 | v_bar_t_ = tf.sqrt(v_t_ + self.epsilon) 98 | updates.extend([v_t, t_v]) 99 | else: 100 | v_bar_t_ = 1 101 | 102 | s_t_ = self.learning_rate * m_bar_t_ / v_bar_t_ 103 | cache['s_t'] = tf.where(tf.is_finite(s_t_), s_t_, tf.zeros_like(s_t_)) 104 | cache['g_t'] = g_t_ 105 | cache['idxs'] = idxs 106 | return cache 107 | 108 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/optimizers/sgd_optimizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.optimizers.base_optimizer import BaseOptimizer 25 | 26 | #*************************************************************** 27 | class SGDOptimizer(BaseOptimizer): 28 | """""" 29 | 30 | #============================================================= 31 | def _apply_dense(self, cache): 32 | """""" 33 | 34 | g_t = cache['g_t'] 35 | cache['s_t'] = self.learning_rate * g_t 36 | return cache 37 | 38 | #============================================================= 39 | def _apply_sparse(self, cache): 40 | """""" 41 | 42 | g_t, idxs = cache['g_t'], cache['idxs'] 43 | idxs, idxs_ = tf.unique(idxs) 44 | g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs)) 45 | 46 | cache['g_t'] = g_t_ 47 | cache['idxs'] = idxs 48 | cache['s_t'] = self.learning_rate * g_t_ 49 | 50 | return cache 51 | 52 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/.directory: -------------------------------------------------------------------------------- 1 | [Dolphin] 2 | Timestamp=2016,10,21,3,50,28 3 | Version=3 4 | ViewMode=1 5 | VisibleRoles=Details_text,Details_size,Details_date,Details_wordCount,Details_lineCount,CustomizedDetails 6 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
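Before the recurrent cells, it helps to spell out the update that RadamOptimizer above applies to each dense parameter: running first and second moments of the gradient are kept, the applied moment is the Nesterov-style interpolation (1 - gamma) * m + gamma * g, and the step is the learning rate times that quantity divided by sqrt(v + epsilon), with non-finite steps zeroed out. The sketch below assumes plain exponential moving averages; the repo's _dense_moving_average is inherited from BaseOptimizer, whose file is not shown here and may smooth or bias-correct differently, and the default hyperparameter values are placeholders:

import numpy as np

def radam_like_step(g, m, v, lr=2e-3, mu=0.9, nu=0.9, gamma=0.1, eps=1e-12):
    # One dense update in the spirit of RadamOptimizer._apply_dense.
    m = mu * m + (1 - mu) * g                  # first-moment accumulator 'm'
    v = nu * v + (1 - nu) * g ** 2             # second-moment accumulator 'v'
    m_bar = (1 - gamma) * m + gamma * g        # Nesterov-style interpolation
    s = lr * m_bar / np.sqrt(v + eps)          # the step s_t
    s = np.where(np.isfinite(s), s, 0.0)       # drop non-finite steps, as in the code
    return s, m, v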
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from rnn_cell import RNNCell 19 | from gru_cell import GRUCell 20 | from cif_lstm_cell import CifLSTMCell 21 | from lstm_cell import LSTMCell 22 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/base_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.configurable import Configurable 25 | 26 | #*************************************************************** 27 | class BaseCell(Configurable): 28 | """""" 29 | 30 | #============================================================= 31 | def __init__(self, output_size, *args, **kwargs): 32 | """""" 33 | 34 | self._output_size = output_size 35 | input_size = kwargs.pop('input_size', self._output_size) 36 | self.moving_params = kwargs.pop('moving_params', None) 37 | super(BaseCell, self).__init__(*args, **kwargs) 38 | self._input_size = input_size if input_size is not None else self.output_size 39 | 40 | #============================================================= 41 | def __call__(self, inputs, state, scope=None): 42 | """""" 43 | 44 | raise NotImplementedError() 45 | 46 | #============================================================= 47 | def zero_state(self, batch_size, dtype): 48 | """""" 49 | 50 | zero_state = tf.get_variable('Zero_state', 51 | shape=self.state_size, 52 | dtype=dtype, 53 | initializer=tf.zeros_initializer()) 54 | state = tf.reshape(tf.tile(zero_state, tf.stack([batch_size])), tf.stack([batch_size, self.state_size])) 55 | state.set_shape([None, self.state_size]) 56 | return state 57 | 58 | #============================================================= 59 | @property 60 | def input_size(self): 61 | return self._input_size 62 | @property 63 | def output_size(self): 64 | return self._output_size 65 | @property 66 | def state_size(self): 67 | raise NotImplementedError() 68 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/cif_lstm_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 
6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | from parser.neural.functions import gate 27 | 28 | #*************************************************************** 29 | class CifLSTMCell(BaseCell): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, inputs, state, scope=None): 34 | """""" 35 | 36 | with tf.variable_scope(scope or type(self).__name__): 37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1) 38 | input_list = [inputs, hidden_tm1] 39 | lin = linear(input_list, 40 | self.output_size, 41 | add_bias=True, 42 | n_splits=3, 43 | moving_params=self.moving_params) 44 | cell_act, update_act, output_act = lin 45 | 46 | cell_tilde_t = cell_act 47 | update_gate = gate(update_act-self.forget_bias) 48 | output_gate = gate(output_act) 49 | cell_t = update_gate * cell_tilde_t + (1-update_gate) * cell_tm1 50 | hidden_tilde_t = self.recur_func(cell_t) 51 | hidden_t = hidden_tilde_t * output_gate 52 | 53 | return hidden_t, tf.concat([cell_t, hidden_t], 1) 54 | 55 | #============================================================= 56 | @property 57 | def state_size(self): 58 | return self.output_size * 2 59 | 60 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/gru_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | from parser.neural.functions import gate 27 | 28 | #*************************************************************** 29 | class GRUCell(BaseCell): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, inputs, state, scope=None): 34 | """""" 35 | 36 | with tf.variable_scope(scope or type(self).__name__): 37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1) 38 | input_list = [inputs, hidden_tm1] 39 | with tf.variable_scope('Gates'): 40 | gates = linear(inputs_list, 41 | self.output_size, 42 | add_bias=True, 43 | n_splits=2, 44 | moving_params=self.moving_params) 45 | update_act, reset_act = gates 46 | update_gate = gate(update_act-self.forget_bias) 47 | reset_gate = gate(reset_act) 48 | reset_state = reset_gate * hidden_tm1 49 | input_list = [inputs, reset_state] 50 | with tf.variable_scope('Candidate'): 51 | hidden_act = linear(input_list, 52 | self.output_size, 53 | add_bias=True, 54 | moving_params=self.moving_params) 55 | hidden_tilde = self.recur_func(hidden_act) 56 | cell_t = update_gate * cell_tm1 + (1-update_gate) * hidden_tilde 57 | return cell_t, tf.concat([cell_t, cell_t], 1) 58 | 59 | #============================================================= 60 | @property 61 | def state_size(self): 62 | return self.output_size * 2 63 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/lstm_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | from parser.neural.functions import gate, tanh 27 | 28 | #*************************************************************** 29 | class LSTMCell(BaseCell): 30 | """""" 31 | 32 | #============================================================= 33 | def __call__(self, inputs, state, scope=None): 34 | """""" 35 | 36 | with tf.variable_scope(scope or type(self).__name__): 37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1) 38 | input_list = [inputs, hidden_tm1] 39 | lin = linear(input_list, 40 | self.output_size, 41 | add_bias=True, 42 | n_splits=4, 43 | moving_params=self.moving_params) 44 | cell_act, input_act, forget_act, output_act = lin 45 | 46 | cell_tilde_t = tanh(cell_act) 47 | input_gate = gate(input_act) 48 | forget_gate = gate(forget_act-self.forget_bias) 49 | output_gate = gate(output_act) 50 | cell_t = input_gate * cell_tilde_t + (1-forget_gate) * cell_tm1 51 | hidden_tilde_t = self.recur_func(cell_t) 52 | hidden_t = hidden_tilde_t * output_gate 53 | 54 | return hidden_t, tf.concat([cell_t, hidden_t], 1) 55 | 56 | #============================================================= 57 | @property 58 | def state_size(self): 59 | return self.output_size * 2 60 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/recur_cells/rnn_cell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
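The recurrent cells above share one template: a single linear layer over [inputs, hidden_tm1] is split into gate and candidate activations, the gate activations are squashed, and the new (cell, hidden) pair is packed into one state tensor of width 2 * output_size. CifLSTMCell is the coupled input-forget variant, where one update gate interpolates between the unsquashed candidate and the previous cell. A NumPy sketch of that step, assuming gate from parser.neural.functions is a logistic sigmoid and recur_func is tanh (both are configurable in the repo):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def cif_lstm_step(cell_act, update_act, output_act, cell_tm1,
                  forget_bias=0.0, recur_func=np.tanh):
    # Mirrors CifLSTMCell.__call__ after the joint linear layer has been split
    # into three activations; the forget_bias default here is an assumption.
    update_gate = sigmoid(update_act - forget_bias)
    output_gate = sigmoid(output_act)
    cell_t = update_gate * cell_act + (1 - update_gate) * cell_tm1
    hidden_t = recur_func(cell_t) * output_gate
    return hidden_t, np.concatenate([cell_t, hidden_t], axis=-1)

lstm_cell.py above follows the same pattern with four activations (note that it pairs an input gate with (1 - forget_gate) on the previous cell), and the plain RNNCell that follows drops the gating entirely.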
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import tensorflow as tf 23 | 24 | from parser.neural.recur_cells.base_cell import BaseCell 25 | from parser.neural.linalg import linear 26 | 27 | #*************************************************************** 28 | class RNNCell(BaseCell): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, inputs, state, scope=None): 33 | """""" 34 | 35 | with tf.variable_scope(scope or type(self).__name__): 36 | inputs_list = [inputs, state] 37 | hidden_act = linear(inputs_list, 38 | self.output_size, 39 | add_bias=True, 40 | moving_params=self.moving_params) 41 | hidden = self.recur_func(hidden_act) 42 | return hidden, hidden 43 | 44 | #============================================================= 45 | @property 46 | def state_size(self): 47 | return self.output_size 48 | 49 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/neural/rnn.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
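rnn.py below is a hand-rolled dynamic RNN (adapted from TensorFlow's) whose main addition is variational-style dropout: the feed-forward inputs are dropped with noise_shape [1, batch_size, depth], so one mask is shared across all time steps, and a single recurrent dropout mask is sampled once and multiplied into the hidden part of the state at every step of the while-loop. A small NumPy sketch of the shared-mask idea (illustrative, not the graph code):

import numpy as np

def shared_timestep_dropout(inputs, keep_prob, rng=np.random):
    # inputs: (T x B x D); one Bernoulli mask per (batch, feature) position,
    # reused at every time step, matching noise_shape=[1, batch_size, depth].
    mask = (rng.uniform(size=(1,) + inputs.shape[1:]) < keep_prob) / keep_prob
    return inputs * mask

x = np.ones((10, 2, 4))                  # T=10, B=2, D=4
y = shared_timestep_dropout(x, keep_prob=0.75)
assert np.all(y[0] == y[5])              # the same positions are dropped at every time step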
14 | # ============================================================================== 15 | 16 | """RNN helpers for TensorFlow models.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | import parser.neural.linalg as linalg 26 | 27 | #=============================================================== 28 | def birnn(cell, inputs, sequence_length, initial_state_fw=None, initial_state_bw=None, ff_keep_prob=1., recur_keep_prob=1., dtype=tf.float32, scope=None): 29 | """""" 30 | 31 | # Forward direction 32 | with tf.variable_scope(scope or 'BiRNN_FW') as fw_scope: 33 | output_fw, output_state_fw = rnn(cell, inputs, sequence_length, initial_state_fw, ff_keep_prob, recur_keep_prob, dtype, scope=fw_scope) 34 | 35 | # Backward direction 36 | rev_inputs = tf.reverse_sequence(inputs, sequence_length, 1, 0) 37 | with tf.variable_scope(scope or 'BiRNN_BW') as bw_scope: 38 | output_bw, output_state_bw = rnn(cell, rev_inputs, sequence_length, initial_state_bw, ff_keep_prob, recur_keep_prob, dtype, scope=bw_scope) 39 | output_bw = tf.reverse_sequence(output_bw, sequence_length, 1, 0) 40 | # Concat each of the forward/backward outputs 41 | outputs = tf.concat([output_fw, output_bw], 2) 42 | 43 | return outputs, tf.tuple([output_state_fw, output_state_bw]) 44 | 45 | #=============================================================== 46 | def rnn(cell, inputs, sequence_length=None, initial_state=None, ff_keep_prob=1., recur_keep_prob=1., dtype=tf.float32, scope=None): 47 | """""" 48 | 49 | inputs = tf.transpose(inputs, [1, 0, 2]) # (B,T,D) => (T,B,D) 50 | 51 | parallel_iterations = 32 52 | if sequence_length is not None: 53 | sequence_length = tf.to_int32(sequence_length) 54 | 55 | with tf.variable_scope(scope or 'RNN') as varscope: 56 | #if varscope.caching_device is None: 57 | # varscope.set_caching_device(lambda op: op.device) 58 | input_shape = tf.shape(inputs) 59 | time_steps, batch_size, _ = tf.unstack(input_shape, 3) 60 | const_time_steps, const_batch_size, const_depth = inputs.get_shape().as_list() 61 | 62 | if initial_state is not None: 63 | state = initial_state 64 | else: 65 | if not dtype: 66 | raise ValueError('If no initial_state is provided, dtype must be.') 67 | state = cell.zero_state(batch_size, dtype) 68 | 69 | zero_output = tf.zeros(tf.stack([batch_size, cell.output_size]), inputs.dtype) 70 | if sequence_length is not None: 71 | min_sequence_length = tf.reduce_min(sequence_length) 72 | max_sequence_length = tf.reduce_max(sequence_length) 73 | 74 | time = tf.constant(0, dtype=tf.int32, name='time') 75 | 76 | output_ta = tf.TensorArray(dtype=inputs.dtype, 77 | size=time_steps, 78 | tensor_array_name='dynamic_rnn_output') 79 | 80 | input_ta = tf.TensorArray(dtype=inputs.dtype, 81 | size=time_steps, 82 | tensor_array_name='dynamic_rnn_input') 83 | 84 | if ff_keep_prob < 1: 85 | noise_shape = tf.stack([1, batch_size, const_depth]) 86 | inputs = tf.nn.dropout(inputs, ff_keep_prob, noise_shape=noise_shape) 87 | 88 | if recur_keep_prob < 1: 89 | ones = tf.ones(tf.stack([batch_size, cell.output_size])) 90 | state_dropout = tf.nn.dropout(ones, recur_keep_prob) 91 | state_dropout = tf.concat([ones] * (cell.state_size // cell.output_size - 1) + [state_dropout], 1) 92 | else: 93 | state_dropout = 1 94 | 95 | input_ta = input_ta.unstack(inputs) 96 | 97 | #----------------------------------------------------------- 98 | def _time_step(time, state, output_ta_t): 99 
| """""" 100 | 101 | input_t = input_ta.read(time) 102 | 103 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 104 | def _empty_update(): 105 | return zero_output, state 106 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 107 | def _call_cell(): 108 | return cell(input_t, state * state_dropout) 109 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 110 | def _maybe_copy_some_through(): 111 | new_output, new_state = _call_cell() 112 | 113 | return tf.cond( 114 | time < min_sequence_length, 115 | lambda: (new_output, new_state), 116 | lambda: (tf.where(time >= sequence_length, zero_output, new_output), 117 | tf.where(time >= sequence_length, state, new_state))) 118 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - 119 | 120 | if sequence_length is not None: 121 | output, new_state = tf.cond( 122 | time >= max_sequence_length, 123 | _empty_update, 124 | _maybe_copy_some_through) 125 | else: 126 | (output, new_state) = _call_cell() 127 | 128 | output_ta_t = output_ta_t.write(time, output) 129 | 130 | return (time + 1, new_state, output_ta_t) 131 | #----------------------------------------------------------- 132 | 133 | _, final_state, output_final_ta = tf.while_loop( 134 | cond=lambda time, _1, _2: time < time_steps, 135 | body=_time_step, 136 | loop_vars=(time, state, output_ta), 137 | parallel_iterations=parallel_iterations) 138 | 139 | final_outputs = output_final_ta.stack() 140 | 141 | outputs = tf.transpose(final_outputs, [1, 0, 2]) # (T,B,D) => (B,T,D) 142 | return outputs, final_state 143 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/compression_ratio.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import argparse 25 | import codecs 26 | from backports import lzma 27 | 28 | import numpy as np 29 | from numpy.linalg import inv 30 | import matplotlib.pyplot as plt 31 | from collections import Counter 32 | 33 | #*************************************************************** 34 | if __name__ == '__main__': 35 | """""" 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('-k', '--k_trials', type=int, default=100) 39 | parser.add_argument('-n', '--n_words', type=int, default=5000) 40 | parser.add_argument('files', nargs='+') 41 | 42 | args = parser.parse_args() 43 | type_counter = Counter() 44 | for filename in args.files: 45 | with codecs.open(filename, encoding='utf-8', errors='ignore') as f: 46 | for line in f: 47 | line = line.strip() 48 | if line: 49 | if not re.match('#|[0-9]+[-.][0-9]+', line): 50 | type_counter[line.split('\t')[1]] += 1 51 | 52 | types = type_counter.keys() 53 | total = sum(type_counter.values()) 54 | probs = [type_counter[type_] / total for type_ in types] 55 | 56 | trials = [] 57 | n_words = min(args.n_words, len(types)) or len(types) 58 | for _ in xrange(args.k_trials): 59 | chosen_types = np.random.choice(types, size=n_words, replace=False, p=probs) 60 | with codecs.open('uncompressed.txt', 'w', encoding='utf-8', errors='ignore') as f: 61 | f.write('\n'.join(chosen_types)) 62 | with lzma.open('compressed.txt.xz', 'wb') as f: 63 | f.write('\n'.join(chosen_types).encode('utf-8', 'ignore')) 64 | trials.append(os.path.getsize('compressed.txt.xz')/os.path.getsize('uncompressed.txt')) 65 | os.remove('uncompressed.txt') 66 | os.remove('compressed.txt.xz') 67 | print(np.mean(trials)) 68 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/count_nonprojective.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
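count_nonprojective.py below reads CoNLL-U trees and reports, per file, the percentage of dependencies that are non-projective: an arc between a dependent and its head is flagged when some token strictly inside their span is itself headed outside that span. The same test, restated compactly (dep2head maps token index to head index, as in the DepTree class):

def is_nonprojective(dep, dep2head):
    # True if the arc between dep and its head is crossed by some other arc.
    head = dep2head[dep]
    lo, hi = min(dep, head), max(dep, head)
    return any(dep2head[mid] < lo or dep2head[mid] > hi
               for mid in range(lo + 1, hi))

dep2head = {1: 0, 2: 4, 3: 1, 4: 1}      # the arc 4->2 crosses the arc 1->3
print([d for d in dep2head if is_nonprojective(d, dep2head)])   # [2, 3]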
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import argparse 25 | 26 | import numpy as np 27 | from collections import defaultdict 28 | 29 | #*************************************************************** 30 | class DepTree: 31 | """""" 32 | 33 | #============================================================= 34 | def __init__(self, buff): 35 | """""" 36 | 37 | self._head2deps = defaultdict(list) 38 | self._dep2head = dict() 39 | self._str = [] 40 | for line in buff: 41 | dep_idx = int(line[0]) 42 | head_idx = int(line[6]) 43 | self.head2deps[head_idx].append(dep_idx) 44 | self.dep2head[dep_idx] = head_idx 45 | self._str.append(line[1]) 46 | return 47 | 48 | #============================================================= 49 | def count_nonprojective(self): 50 | """""" 51 | 52 | nonproj = [] 53 | for dep in self: 54 | head = self.dep2head[dep] 55 | span_min = min(dep, head) 56 | span_max = max(dep, head) 57 | for mid_dep in xrange(span_min+1, span_max): 58 | mid_head = self.dep2head[mid_dep] 59 | if mid_head < span_min or mid_head > span_max: 60 | crossing = True 61 | break 62 | else: 63 | crossing = False 64 | nonproj.append(int(crossing)) 65 | return nonproj 66 | 67 | #============================================================= 68 | @property 69 | def head2deps(self): 70 | return self._head2deps 71 | @property 72 | def dep2head(self): 73 | return self._dep2head 74 | 75 | #============================================================= 76 | def __iter__(self): 77 | return (dep for dep in self.dep2head) 78 | def __len__(self): 79 | return len(self.dep2head) 80 | def __str__(self): 81 | return ' '.join(self._str)+'\n' 82 | 83 | #*************************************************************** 84 | if __name__ == '__main__': 85 | """""" 86 | 87 | parser = argparse.ArgumentParser() 88 | parser.add_argument('files', nargs='+') 89 | 90 | args = parser.parse_args() 91 | for filename in args.files: 92 | lang = re.search('([-\w]*)-ud', filename).group(1) 93 | nonproj = [] 94 | with open(filename) as f: 95 | buff = [] 96 | for line in f: 97 | line = line.strip() 98 | if line: 99 | if not re.match('#|[0-9]+[-.][0-9]+', line): 100 | buff.append(line.split('\t')) 101 | else: 102 | tree = DepTree(buff) 103 | nonproj.extend(tree.count_nonprojective()) 104 | buff = [] 105 | print(lang, np.mean(nonproj)*100) 106 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/heaps_law.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
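heaps_law.py below estimates the Heaps'-law exponent of a treebank: it shuffles the tokens, tracks the number of distinct types V(n) seen after n tokens, and fits log V(n) ~ K + b log n, printing the exponent b. Because that objective is quadratic in (K, b), the single Newton step in the script already lands on the exact least-squares solution, so an ordinary log-log polyfit gives the same estimate; a short sanity-check version:

import numpy as np

def heaps_exponent(tokens):
    # V(n): number of distinct types after n shuffled tokens; fit log V = K + b log n.
    tokens = list(tokens)
    np.random.shuffle(tokens)
    seen, v_of_n = set(), []
    for tok in tokens:
        seen.add(tok)
        v_of_n.append(len(seen))
    n = np.arange(1, len(tokens) + 1)
    b, K = np.polyfit(np.log(n), np.log(v_of_n), 1)    # slope = Heaps exponent b
    return b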
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import argparse 25 | 26 | import numpy as np 27 | from numpy.linalg import inv 28 | import matplotlib.pyplot as plt 29 | from collections import defaultdict 30 | 31 | #*************************************************************** 32 | if __name__ == '__main__': 33 | """""" 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('files', nargs='+') 37 | 38 | args = parser.parse_args() 39 | words = [] 40 | types = set() 41 | n_types = [] 42 | for filename in args.files: 43 | with open(filename) as f: 44 | for line in f: 45 | line = line.strip() 46 | if line: 47 | if not re.match('#|[0-9]+[-.][0-9]+', line): 48 | words.append(line.split('\t')[1]) 49 | np.random.shuffle(words) 50 | for word in words: 51 | types.add(word) 52 | n_types.append(len(types)) 53 | 54 | K = 1 55 | b = .75 56 | y = n_types 57 | logy = np.log(y) 58 | x = np.arange(len(n_types))+1 59 | logx = np.log(x) 60 | d2ell = np.array([[1, np.mean(logx)],[np.mean(logx), np.mean(logx**2)]]) 61 | d2ellinv = inv(d2ell) 62 | ell = np.mean((logy - b*logx-K)**2 / 2) 63 | dell = np.array([np.mean(K+b*logx-logy), np.mean((K+b*logx-logy)*logx)]) 64 | updates = d2ellinv.dot(dell) 65 | K -= updates[0] 66 | b -= updates[1] 67 | print(b) 68 | #K_ = 5 69 | #b_ = .74 70 | #for i in xrange(20): 71 | # ell = np.mean((y - K_*x**b_)**2 / 2) 72 | # K_ -= 2*np.mean((K_*x**b_-y)*x**b_) / np.mean(x**(2*b_)) 73 | # b_ -= 2*np.mean((K_*x**b_-y)*K_*x**b_*logx) / np.mean((2*K_*x**b_ - y)*K_*x**b_*logx**2) 74 | # print(ell, K_, b_) 75 | #plt.figure() 76 | #plt.grid() 77 | #plt.plot(x, y) 78 | #plt.plot(x, np.exp(b*logx+K)) 79 | #plt.show() 80 | #plt.figure() 81 | #plt.grid() 82 | #plt.plot(x, logy - b*logx-K) 83 | #plt.show() 84 | #plt.figure() 85 | #plt.grid() 86 | #plt.plot(x, y - K_*x**b_) 87 | #plt.show() 88 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/scripts/reinsert_compounds.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import sys 7 | import codecs 8 | 9 | input_file = sys.argv[2] 10 | output_file = sys.argv[1] 11 | 12 | lines = [] 13 | 14 | with codecs.open(output_file, encoding='utf-8') as f: 15 | for line in f: 16 | lines.append(line) 17 | 18 | with codecs.open(input_file, encoding='utf-8') as f: 19 | with codecs.open(output_file, 'w', encoding='utf-8') as fout: 20 | i = 0 21 | for line in f: 22 | line = line.strip() 23 | 24 | if len(line) == 0: 25 | fout.write(lines[i]) 26 | i += 1 27 | continue 28 | 29 | if line[0] == '#': 30 | continue 31 | 32 | line = line.split('\t') 33 | if '.' in line[0]: 34 | continue 35 | 36 | if '-' in line[0]: 37 | fout.write('%s\n' % ('\t'.join(line))) 38 | continue 39 | 40 | fout.write(lines[i]) 41 | i += 1 42 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/trash/retrained_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 
8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import codecs 24 | from collections import Counter 25 | 26 | import numpy as np 27 | import scipy.linalg as la 28 | import tensorflow as tf 29 | 30 | from parser.vocabs.base_vocab import BaseVocab 31 | 32 | #*************************************************************** 33 | class RetrainedVocab(BaseVocab): 34 | """""" 35 | 36 | #============================================================= 37 | def __init__(self, pretrained_vocab, *args, **kwargs): 38 | """""" 39 | 40 | super(RetrainedVocab, self).__init__(*args, **kwargs) 41 | 42 | self._pretrained_vocab = pretrained_vocab 43 | return 44 | 45 | #============================================================= 46 | def __call__(self): 47 | """""" 48 | 49 | embed_size = self.embed_size 50 | row_idxs = tf.placeholder(tf.int32, shape=(None,), name='row_idxs') 51 | col_idxs = tf.placeholder(tf.int32, shape=(None,), name='col_idxs') 52 | S, U, _ = tf.svd(self.pretrained_vocab.embeddings) 53 | self.embeddings = U[:,:embed_size] * S[:embed_size] 54 | 55 | old_rows = tf.gather(self.pretrained_vocab.embeddings, row_idxs) 56 | old_cols = tf.gather(self.pretrained_vocab.embeddings, col_idxs) 57 | new_rows = tf.gather(self.embeddings, row_idxs) 58 | new_cols = tf.gather(self.embeddings, col_idxs) 59 | old_matmul = tf.matmul(old_rows, old_cols, transpose_b=True) 60 | new_matmul = tf.matmul(new_rows, new_cols, transpose_b=True) 61 | 62 | if self.embed_loss == 'cross_entropy': 63 | old_matmul = tf.expand_dims(tf.nn.softmax(old_matmul), axis=1) 64 | new_matmul = tf.expand_dims(tf.nn.softmax(new_matmul), axis=2) 65 | loss = -tf.reduce_sum(tf.matmul(old_matmul, tf.log(new_matmul))) / tf.to_float(tf.shape(row_idxs)[0]) 66 | elif self.embed_loss == 'l2_loss': 67 | loss = tf.reduce_sum((old_matmul - new_matmul)**2 / 2) / tf.to_float(tf.shape(row_idxs)[0]) 68 | else: 69 | raise ValueError('embed_loss must be in "(cross_entropy, l2_loss)"') 70 | 71 | return {'row_idxs': row_idxs, 72 | 'col_idxs': col_idxs, 73 | 'loss': loss} 74 | 75 | #============================================================= 76 | def dump(self): 77 | """""" 78 | 79 | matrix = self.embeddings.eval() 80 | with codecs.open(self.name+'.txt', 'w') as f: 81 | for idx in xrange(self.START_IDX, len(self)): 82 | f.write('%s %s\n' % (self[idx], ' '.join(matrix[idx]))) 83 | return 84 | 85 | #============================================================= 86 | @property 87 | def pretrained_vocab(self): 88 | return self._pretrained_vocab 89 | 90 | #============================================================= 91 | def __setattr__(self, name, value): 92 | if name == '_pretrained_vocab': 93 | self._str2idx = value._str2idx 94 | self._idx2str = value._idx2str 95 | self._counts = value._counts 96 | super(RetrainedVocab, self).__setattr__(name, value) 97 | 98 | #*************************************************************** 99 | if __name__ == '__main__': 100 | """""" 101 | 102 | from parser import Configurable 103 | 
from parser.vocabs import PretrainedVocab 104 | configurable = Configurable(retrained_vocab={'embed_loss':'cross_entropy', 'retrained_embed_size':50}) 105 | pretrained_vocab = PretrainedVocab.from_configurable(configurable) 106 | retrained_vocab = RetrainedVocab.from_vocab(pretrained_vocab) 107 | retrain_loss = retrained_vocab(pretrained_vocab) 108 | print('RetrainedVocab passes') 109 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/trash/weighted_mean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | 25 | from parser.neural.models import NN 26 | 27 | #*************************************************************** 28 | class WeightedMean(NN): 29 | """""" 30 | 31 | #============================================================= 32 | def __call__(self, vocab, output_size, moving_params=None): 33 | """""" 34 | 35 | inputs = tf.placeholder(tf.int32, shape=(None,None), name='inputs-%s' % self.name) 36 | 37 | self.tokens_to_keep = tf.to_float(tf.greater(inputs, vocab.PAD)) 38 | self.sequence_lengths = tf.reduce_sum(self.tokens_to_keep, axis=1, keep_dims=True) 39 | self.n_tokens = tf.reduce_sum(self.sequence_lengths) 40 | self.batch_size = tf.shape(inputs)[0] 41 | self.bucket_size = tf.shape(inputs)[1] 42 | self.moving_params = moving_params 43 | 44 | embeddings = vocab.embedding_lookup(inputs, moving_params=self.moving_params) 45 | weighted_embeddings = self.linear_attention(embeddings) 46 | mlp = self.MLP(weighted_embeddings, self.mlp_size) 47 | lin = self.linear(mlp, output_size) 48 | 49 | return {'output': lin, 'inputs': inputs} 50 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/__init__.py: -------------------------------------------------------------------------------- 1 | from index_vocab import IndexVocab, DepVocab, HeadVocab 2 | from pretrained_vocab import PretrainedVocab 3 | from token_vocab import TokenVocab, WordVocab, LemmaVocab, TagVocab, XTagVocab, RelVocab 4 | from subtoken_vocab import SubtokenVocab, CharVocab 5 | from ngram_vocab import NgramVocab 6 | from multivocab import Multivocab 7 | from ngram_multivocab import NgramMultivocab 8 | 9 | __all__ = [ 10 | 'DepVocab', 11 | 'HeadVocab', 12 | 'PretrainedVocab', 13 | 'WordVocab', 14 | 'LemmaVocab', 15 | 'TagVocab', 16 | 'XTagVocab', 17 | 'RelVocab', 18 | 'CharVocab', 19 | 'NgramVocab', 20 | 'Multivocab', 21 | 'NgramMultivocab' 22 | ] 23 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/base_vocab.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | from collections import Counter 25 | 26 | import numpy as np 27 | import tensorflow as tf 28 | 29 | import parser.neural.linalg as linalg 30 | from parser import Configurable 31 | 32 | #*************************************************************** 33 | class BaseVocab(Configurable): 34 | """""" 35 | 36 | #============================================================= 37 | def __init__(self, *args, **kwargs): 38 | """""" 39 | 40 | super(BaseVocab, self).__init__(*args, **kwargs) 41 | 42 | self._cased = super(BaseVocab, self).cased 43 | self._special_tokens = super(BaseVocab, self).special_tokens 44 | self._special_tokens_set = set(self._special_tokens) 45 | self._set_special_tokens() 46 | # NOTE: __setattr__ turns these into dicts 47 | self._str2idx = zip(self.special_tokens, range(len(self.special_tokens))) 48 | self._idx2str = zip(range(len(self.special_tokens)), self.special_tokens) 49 | self._tok2idx = self._str2idx 50 | self._counts = None 51 | self._embeddings = None 52 | # NOTE this placeholder stores the token data indices 53 | # I.e. 
the token's index in the word/tag/glove embedding matrix 54 | # CharVocab will by default be "char" 55 | self.placeholder = None 56 | 57 | #============================================================= 58 | def _set_special_tokens(self): 59 | pattern = re.compile('\W+', re.UNICODE) 60 | for i, token in enumerate(self.special_tokens): 61 | token = token.lstrip('<') 62 | token = token.rstrip('>') 63 | token = token.upper() 64 | token = pattern.sub('', token) 65 | assert token not in self.__dict__ 66 | self.__dict__[token] = i 67 | return 68 | 69 | #============================================================= 70 | @classmethod 71 | def from_vocab(cls, vocab, *args, **kwargs): 72 | """""" 73 | 74 | args += (vocab,) 75 | return cls.from_configurable(vocab, *args, **kwargs) 76 | 77 | #============================================================= 78 | def generate_placeholder(self): 79 | """""" 80 | 81 | if self.placeholder is None: 82 | self.placeholder = tf.placeholder(tf.int32, shape=[None, None], name=self.name) 83 | return self.placeholder 84 | 85 | #============================================================= 86 | def __call__(self, placeholder=None, moving_params=None): 87 | """""" 88 | 89 | placeholder = self.generate_placeholder() if placeholder is None else placeholder 90 | embeddings = self.embeddings if moving_params is None else moving_params.average(self.embeddings) 91 | return tf.nn.embedding_lookup(embeddings, placeholder) 92 | 93 | #============================================================= 94 | def setup(self): 95 | """""" 96 | 97 | self.placeholder = None 98 | return 99 | 100 | #============================================================= 101 | def set_feed_dict(self, data, feed_dict): 102 | """""" 103 | 104 | feed_dict[self.placeholder] = data 105 | return 106 | 107 | #============================================================= 108 | def load(self): 109 | raise NotImplementedError() 110 | def dump(self): 111 | raise NotImplementedError() 112 | def count(self): 113 | raise NotImplementedError() 114 | 115 | #============================================================= 116 | def strings(self): 117 | return self._str2idx.keys() 118 | def indices(self): 119 | return self._str2idx.values() 120 | def iteritems(self): 121 | return self._str2idx.iteritems() 122 | def most_common(self, n=None): 123 | return self._counts.most_common(n) 124 | def index(self, token): 125 | if not self.cased and token not in self._special_tokens_set: 126 | token = token.lower() 127 | return self._tok2idx.get(token, self.UNK) 128 | 129 | #============================================================= 130 | @property 131 | def depth(self): 132 | return None 133 | @property 134 | def special_tokens(self): 135 | return self._special_tokens 136 | @property 137 | def cased(self): 138 | return self._cased 139 | @property 140 | def counts(self): 141 | return self._counts 142 | @property 143 | def embeddings(self): 144 | return self._embeddings 145 | #@embeddings.setter 146 | #def embeddings(self, matrix): 147 | # if matrix.shape[1] != self.embed_size: 148 | # raise ValueError("Matrix shape[1] of %d doesn't match expected shape of %d" % (matrix.shape[1], self.embed_size)) 149 | # with tf.device('/cpu:0'): 150 | # with tf.variable_scope(self.name.title()): 151 | # self._embeddings = tf.Variable(matrix, name='Embeddings', dtype=tf.float32, trainable=True) 152 | # return 153 | 154 | #============================================================= 155 | def __getitem__(self, key): 156 | if isinstance(key, 
basestring): 157 | if not self.cased and key not in self._special_tokens_set: 158 | key = key.lower() 159 | return self._str2idx.get(key, self.UNK) 160 | elif isinstance(key, (int, long, np.int32, np.int64)): 161 | return self._idx2str.get(key, self.special_tokens[self.UNK]) 162 | elif hasattr(key, '__iter__'): 163 | return [self[k] for k in key] 164 | else: 165 | raise ValueError('key to BaseVocab.__getitem__ must be (iterable of) string or integer') 166 | return 167 | 168 | def __setitem__(self, key, value): 169 | if isinstance(key, basestring): 170 | if not self.cased and key not in self._special_tokens_set: 171 | key = key.lower() 172 | self._str2idx[key] = value 173 | self._idx2str[value] = key 174 | elif isinstance(key, (int, long)): 175 | if not self.cased and value not in self._special_tokens_set: 176 | value = value.lower() 177 | self._idx2str[key] = value 178 | self._str2idx[value] = key 179 | elif hasattr(key, '__iter__') and hasattr(value, '__iter__'): 180 | for k, v in zip(key, value): 181 | self[k] = v 182 | else: 183 | raise ValueError('keys and values to BaseVocab.__setitem__ must be (iterable of) string or integer') 184 | 185 | def __contains__(self, key): 186 | if isinstance(key, basestring): 187 | if not self.cased and key not in self._special_tokens_set: 188 | key = key.lower() 189 | return key in self._str2idx 190 | elif isinstance(key, (int, long)): 191 | return key in self._idx2str 192 | else: 193 | raise ValueError('key to BaseVocab.__contains__ must be string or integer') 194 | return 195 | 196 | def __len__(self): 197 | return len(self._str2idx) 198 | 199 | def __iter__(self): 200 | return (key for key in sorted(self._str2idx, key=self._str2idx.get)) 201 | 202 | def __setattr__(self, name, value): 203 | if name in ('_str2idx', '_idx2str', '_str2idxs'): 204 | value = dict(value) 205 | elif name == '_counts': 206 | value = Counter(value) 207 | super(BaseVocab, self).__setattr__(name, value) 208 | return 209 | 210 | #*************************************************************** 211 | if __name__ == '__main__': 212 | """""" 213 | 214 | base_vocab = BaseVocab() 215 | print('BaseVocab passes') 216 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/index_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import sys 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser import Configurable 31 | 32 | __all__ = ['DepVocab', 'HeadVocab'] 33 | 34 | #*************************************************************** 35 | class IndexVocab(Configurable): 36 | """""" 37 | 38 | ROOT = 0 39 | 40 | #============================================================= 41 | def __init__(self, *args, **kwargs): 42 | """""" 43 | 44 | super(IndexVocab, self).__init__(*args, **kwargs) 45 | self.placeholder = None 46 | 47 | #============================================================= 48 | def generate_placeholder(self): 49 | """""" 50 | 51 | if self.placeholder is None: 52 | self.placeholder = tf.placeholder(tf.int32, shape=[None, None], name=self.name) 53 | return self.placeholder 54 | 55 | #============================================================= 56 | def set_feed_dict(self, data, feed_dict): 57 | """""" 58 | 59 | feed_dict[self.placeholder] = data 60 | return 61 | 62 | #============================================================= 63 | def setup(self): 64 | self.placeholder = None 65 | return 66 | 67 | #============================================================= 68 | def index(self, token): 69 | return 0 if token == '_' else int(token) 70 | 71 | #============================================================= 72 | @property 73 | def depth(self): 74 | return None 75 | @property 76 | def conll_idx(self): 77 | return self._conll_idx 78 | 79 | #============================================================= 80 | def __getitem__(self, key): 81 | if isinstance(key, basestring): 82 | return int(key) 83 | elif isinstance(key, (int, long, np.int32, np.int64)): 84 | return str(key) 85 | elif hasattr(key, '__iter__'): 86 | return [self[k] for k in key] 87 | else: 88 | raise ValueError('key to BaseVocab.__getitem__ must be (iterable of) string or integer') 89 | return 90 | 91 | #*************************************************************** 92 | class DepVocab(IndexVocab): 93 | _conll_idx = 0 94 | class HeadVocab(IndexVocab): 95 | _conll_idx = 6 96 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/multivocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import codecs 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser import Configurable 31 | from parser.neural import linalg 32 | from parser.vocabs import TokenVocab, SubtokenVocab 33 | 34 | __all__ = ['Multivocab'] 35 | 36 | #*************************************************************** 37 | class Multivocab(Configurable): 38 | """""" 39 | 40 | #============================================================= 41 | def __init__(self, vocabs, *args, **kwargs): 42 | """""" 43 | 44 | super(Multivocab, self).__init__(*args, **kwargs) 45 | 46 | self._vocabs = vocabs 47 | self._set_special_tokens() 48 | # NOTE Don't forget to run index_tokens() after adding test/validation files! 49 | self.placeholder = None 50 | return 51 | 52 | #============================================================= 53 | def __call__(self, placeholder=None, moving_params=None): 54 | """""" 55 | # TODO check to see if a word is all unk, and if so, replace it with a random vector 56 | 57 | embeddings = [vocab(moving_params=moving_params) for vocab in self] 58 | return tf.add_n(embeddings) 59 | 60 | #============================================================= 61 | def setup(self): 62 | """""" 63 | 64 | self.placeholder = None 65 | for vocab in self: 66 | vocab.setup() 67 | return 68 | 69 | #============================================================= 70 | def generate_placeholder(self): 71 | """""" 72 | 73 | if self.placeholder is None: 74 | self.placeholder = tf.stack([vocab.generate_placeholder() for vocab in self], axis=2) 75 | return self.placeholder 76 | 77 | #============================================================= 78 | def _set_special_tokens(self): 79 | pattern = re.compile('\W+', re.UNICODE) 80 | self._special_tokens = zip(*[vocab.special_tokens for vocab in self]) 81 | for i, token in enumerate(self.special_tokens): 82 | n = len(token) 83 | assert len(set(token)) == 1 84 | token = token[0] 85 | token = token.lstrip('<') 86 | token = token.rstrip('>') 87 | token = token.upper() 88 | token = pattern.sub('', token) 89 | assert token not in self.__dict__ 90 | self.__dict__[token] = tuple(i for _ in xrange(n)) 91 | return 92 | 93 | #============================================================= 94 | def add_files(self, conll_files): 95 | """""" 96 | 97 | conll_files = list(conll_files) 98 | token_vocabs = [] 99 | for vocab in self: 100 | if hasattr(vocab, 'token_vocab'): 101 | if vocab.token_vocab not in token_vocabs: 102 | vocab.token_vocab.count(conll_files) 103 | token_vocabs.append(vocab.token_vocab) 104 | return 105 | 106 | #============================================================= 107 | def index_tokens(self): 108 | """""" 109 | 110 | for vocab in self: 111 | if hasattr(vocab, 'index_tokens'): 112 | vocab.index_tokens() 113 | return 114 | 115 | #============================================================= 116 | def set_feed_dict(self, data, feed_dict): 117 | """""" 118 | 119 | for i, vocab in enumerate(self): 120 | vocab.set_feed_dict(data[:,:,i], feed_dict) 121 | return 122 | 123 | #============================================================= 124 | def index(self, token): 125 | return tuple(vocab.index(token) for vocab in self) 126 | 127 | #============================================================= 128 | @property 129 | def depth(self): 130 | return len(self) 131 | 
@property 132 | def special_tokens(self): 133 | return self._special_tokens 134 | @property 135 | def conll_idx(self): 136 | return self._conll_idx 137 | 138 | #============================================================= 139 | def __iter__(self): 140 | return (vocab for vocab in self._vocabs) 141 | def __getitem__(self, key): 142 | return self._vocabs[key] 143 | def __len__(self): 144 | return len(self._vocabs) 145 | def __setattr__(self, key, value): 146 | if key == '_vocabs': 147 | conll_idxs = set([vocab.conll_idx for vocab in value if hasattr(vocab, 'conll_idx')]) 148 | assert len(conll_idxs) == 1 149 | self._conll_idx = list(conll_idxs)[0] 150 | super(Multivocab, self).__setattr__(key, value) 151 | 152 | #*************************************************************** 153 | if __name__ == '__main__': 154 | """""" 155 | 156 | from parser.vocabs import PretrainedVocab, WordVocab, CharVocab, Multivocab 157 | 158 | configurable = Configurable() 159 | token_vocab = WordVocab.from_configurable(configurable) 160 | pretrained_vocab = PretrainedVocab.from_vocab(token_vocab) 161 | subtoken_vocab = CharVocab.from_vocab(token_vocab) 162 | multivocab = Multivocab.from_configurable(configurable, [pretrained_vocab, token_vocab, subtoken_vocab]) 163 | multivocab.add_files(configurable.valid_files) 164 | multivocab.index_tokens() 165 | print("Indices for '': %s" % str(multivocab.index(''))) 166 | print("Indices for 'the': %s" % str(multivocab.index('the'))) 167 | print("Indices for 'The': %s" % str(multivocab.index('The'))) 168 | print('Multivocab passes') 169 | 170 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/ngram_multivocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import sys 24 | import codecs 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser import Configurable, Multibucket 31 | from parser.vocabs.base_vocab import BaseVocab 32 | from parser.vocabs import SubtokenVocab, NgramVocab, Multivocab 33 | from parser.misc.bucketer import Bucketer 34 | 35 | __all__ = ['NgramMultivocab'] 36 | 37 | #*************************************************************** 38 | class NgramMultivocab(Multivocab, SubtokenVocab): 39 | """""" 40 | 41 | #============================================================= 42 | def __init__(self, token_vocab, *args, **kwargs): 43 | """""" 44 | 45 | super(BaseVocab, self).__init__(*args, **kwargs) 46 | self._cased = super(BaseVocab, self).cased 47 | 48 | SubtokenVocab.__setattr__(self, '_token_vocab', token_vocab) 49 | self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name) 50 | self._vocabs = [NgramVocab.from_vocab(self.token_vocab, i+1, cased=self.cased) for i in xrange(self.max_n)] 51 | self._special_tokens = super(BaseVocab, self).special_tokens 52 | self._special_tokens_set = set(self._special_tokens) 53 | SubtokenVocab._set_special_tokens(self) 54 | self._tok2idx = {} 55 | 56 | for vocab in self: 57 | assert vocab.token_vocab is self.token_vocab 58 | return 59 | 60 | #============================================================= 61 | def add_files(self, conll_files): 62 | """""" 63 | 64 | self.token_vocab.count(conll_files) 65 | return 66 | 67 | #============================================================= 68 | def index_tokens(self): 69 | """""" 70 | 71 | n_buckets = self.n_buckets 72 | tok2idxs = {token: [vocab.subtoken_indices(token) for vocab in self] for token in self.token_vocab.counts} 73 | with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s'%self.name) as bucketer: 74 | splits = bucketer.compute_splits(len(indices[0]) for indices in tok2idxs.values()) 75 | bucketer.plot() 76 | with self.multibucket.open(splits, depth=len(self)): 77 | for index, special_token in enumerate(self.special_tokens): 78 | self.tok2idx[special_token] = self.multibucket.add([[index]*len(self)]) 79 | for token, _ in self.sorted_counts(self.token_vocab.counts): 80 | indices = tok2idxs[token] 81 | sequence = [[indices[i][j] for i in xrange(len(indices)) if j < len(indices[i])] for j in xrange(len(indices[0]))] 82 | self.tok2idx[token] = self.multibucket.add(sequence) 83 | return 84 | 85 | #============================================================= 86 | def __call__(self, placeholder, keep_prob=None, moving_params=None): 87 | return SubtokenVocab.__call__(self, placeholder, keep_prob=keep_prob, moving_params=moving_params) 88 | 89 | def index(self, token): 90 | return SubtokenVocab.index(self, token) 91 | 92 | def generate_placeholder(self): 93 | return SubtokenVocab.generate_placeholder(self) 94 | 95 | #============================================================= 96 | def embedding_lookup(self, placeholders, embed_keep_prob=None, moving_params=None): 97 | """""" 98 | 99 | if moving_params is None: 100 | shape = tf.shape(placeholders) 101 | shape = tf.stack([shape[0], 1, shape[2]]) 102 | placeholders = la.random_where(embed_keep_prob, placeholders, self.UNK, shape=shape) 103 | embeddings = [vocab.embedding_lookup(placeholders[:,:,i], embed_keep_prob=1, 
moving_params=moving_params) for i, vocab in enumerate(self)] 104 | return tf.stack(embeddings, axis=2) 105 | 106 | #============================================================= 107 | def __iter__(self): 108 | return (vocab for vocab in self._vocabs) 109 | def __getitem__(self, key): 110 | return self._vocabs[key] 111 | def __len__(self): 112 | return len(self._vocabs) 113 | 114 | #*************************************************************** 115 | if __name__ == '__main__': 116 | """""" 117 | 118 | from parser import Configurable 119 | from parser.vocabs import WordVocab, NgramMultivocab 120 | 121 | configurable = Configurable() 122 | token_vocab = WordVocab.from_configurable(configurable) 123 | ngram_multivocab = NgramMultivocab.from_vocab(token_vocab) 124 | ngram_multivocab.add_files(configurable.valid_files) 125 | ngram_multivocab.index_tokens() 126 | print("Indices for '': %s" % str(ngram_multivocab.index(''))) 127 | print("Indices for 'the': %s" % str(ngram_multivocab.index('the'))) 128 | print("Indices for 'The': %s" % str(ngram_multivocab.index('The'))) 129 | print('NgramMultivocab passes') 130 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/ngram_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import codecs 24 | from collections import Counter 25 | 26 | import numpy as np 27 | import tensorflow as tf 28 | 29 | from parser.vocabs import TokenVocab, SubtokenVocab, CharVocab 30 | from parser import Multibucket 31 | 32 | __all__ = ['NgramVocab'] 33 | 34 | #*************************************************************** 35 | class NgramVocab(SubtokenVocab): 36 | """""" 37 | 38 | #============================================================= 39 | def __init__(self, n, token_vocab, *args, **kwargs): 40 | """""" 41 | 42 | recount = kwargs.pop('recount', False) 43 | initialize_zero = kwargs.pop('initialize_zero', False) 44 | super(TokenVocab, self).__init__(*args, **kwargs) 45 | 46 | self._n = n 47 | self._token_vocab = token_vocab 48 | self._token_counts = Counter() 49 | self._subtoken_vocab = CharVocab.from_vocab(self.token_vocab) 50 | self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name) 51 | 52 | if recount: 53 | self.count() 54 | else: 55 | if os.path.isfile(self.filename): 56 | self.load() 57 | else: 58 | self.count() 59 | self.dump() 60 | self.index_vocab() 61 | 62 | embed_dims = [len(self), self.embed_size] 63 | if initialize_zero: 64 | self.embeddings = np.zeros(embed_dims) 65 | else: 66 | self.embeddings = np.random.randn(*embed_dims) 67 | return 68 | 69 | #============================================================= 70 | def count(self): 71 | """""" 72 | 73 | special_tokens = set(self.token_vocab.special_tokens) 74 | for token in self.token_vocab: 75 | if token not in special_tokens: 76 | idxs = self.subtoken_vocab.subtoken_indices(token) 77 | idxs = [self.subtoken_vocab.START] + idxs + [self.subtoken_vocab.STOP] 78 | if len(idxs) > self.n: 79 | for i in xrange(len(idxs) - self.n): 80 | subtoken = ''.join(self.subtoken_vocab[idxs[i:i+self.n]]) 81 | self.counts[subtoken] += 1 82 | self.token_counts[subtoken] += self.token_vocab.counts[token] 83 | return 84 | 85 | #============================================================= 86 | def subtoken_indices(self, token): 87 | """""" 88 | 89 | idxs = self.subtoken_vocab.subtoken_indices(token) 90 | idxs = [self.subtoken_vocab.START] + idxs + [self.subtoken_vocab.STOP] 91 | if len(idxs) <= self.n: 92 | return [self.PAD] 93 | else: 94 | subtokens = [] 95 | for i in xrange(len(idxs) - self.n): 96 | subtokens.append(''.join(self.subtoken_vocab[idxs[i:i+self.n]])) 97 | return self[subtokens] 98 | 99 | #============================================================= 100 | @property 101 | def n(self): 102 | return self._n 103 | @property 104 | def subtoken_vocab(self): 105 | return self._subtoken_vocab 106 | @property 107 | def name(self): 108 | return '%d-%s' % (self.n, super(NgramVocab, self).name) 109 | 110 | #============================================================= 111 | def __setattr__(self, name, value): 112 | if name == '_subtoken_vocab': 113 | self._conll_idx = value.conll_idx 114 | if self.cased is None: 115 | self._cased = value.cased 116 | elif self.cased != value.cased: 117 | cls = value.__class__ 118 | value = cls.from_configurable(value, value.token_vocab, 119 | cased=self.cased, 120 | recount=True) 121 | super(NgramVocab, self).__setattr__(name, value) 122 | return 123 | 124 | #*************************************************************** 125 | if __name__ == '__main__': 126 | """""" 127 | 128 | from parser import 
Configurable 129 | from parser.vocabs import WordVocab, CharVocab, NgramVocab 130 | 131 | configurable = Configurable() 132 | token_vocab = WordVocab.from_configurable(configurable, 1) 133 | if os.path.isfile('saves/defaults/2-ngrams.txt'): 134 | os.remove('saves/defaults/2-ngrams.txt') 135 | ngram_vocab = NgramVocab.from_vocab(token_vocab, 2) 136 | ngram_vocab = NgramVocab.from_vocab(token_vocab, 2) 137 | ngram_vocab.token_vocab.count(conll_files = configurable.valid_files) 138 | ngram_vocab.index_tokens() 139 | ngram_vocab.fit_to_zipf() 140 | print('NgramVocab passes') -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/pretrained_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import codecs 24 | import gzip 25 | import warnings 26 | try: 27 | from backports import lzma 28 | except: 29 | warnings.warn('Install backports.lzma for xz support') 30 | from collections import Counter 31 | 32 | import numpy as np 33 | import tensorflow as tf 34 | 35 | import parser.neural.linalg as linalg 36 | from parser.vocabs.base_vocab import BaseVocab 37 | 38 | #*************************************************************** 39 | class PretrainedVocab(BaseVocab): 40 | """""" 41 | 42 | #============================================================= 43 | def __init__(self, token_vocab, *args, **kwargs): 44 | """""" 45 | 46 | super(PretrainedVocab, self).__init__(*args, **kwargs) 47 | 48 | self._token_vocab = token_vocab 49 | 50 | self.load() 51 | self.count() 52 | return 53 | 54 | #============================================================= 55 | def __call__(self, placeholder=None, moving_params=None): 56 | """""" 57 | 58 | embeddings = super(PretrainedVocab, self).__call__(placeholder, moving_params=moving_params) 59 | # (n x b x d') -> (n x b x d) 60 | with tf.variable_scope(self.name.title()): 61 | matrix = linalg.linear(embeddings, self.token_embed_size, moving_params=moving_params) 62 | if moving_params is None: 63 | with tf.variable_scope('Linear', reuse=True): 64 | weights = tf.get_variable('Weights') 65 | tf.losses.add_loss(tf.nn.l2_loss(tf.matmul(tf.transpose(weights), weights) - tf.eye(self.token_embed_size))) 66 | return matrix 67 | #return embeddings # changed in saves2/test8 68 | 69 | #============================================================= 70 | def setup(self): 71 | """""" 72 | 73 | self.placeholder = None 74 | with tf.device('/cpu:0'): 75 | with tf.variable_scope(self.name.title()): 76 | self._embeddings = tf.Variable(self._embeddings_array, name='Embeddings', dtype=tf.float32, trainable=False) 77 | return 78 | 79 | 
#============================================================= 80 | def load(self): 81 | """""" 82 | 83 | embeddings = [] 84 | cur_idx = len(self.special_tokens) 85 | max_rank = self.max_rank 86 | if self.filename.endswith('.xz'): 87 | open_func = lzma.open 88 | else: 89 | open_func = codecs.open 90 | with open_func(self.filename, 'rb') as f: 91 | reader = codecs.getreader('utf-8')(f, errors='ignore') 92 | if self.skip_header == True: 93 | reader.readline() 94 | for line_num, line in enumerate(reader): 95 | if (not max_rank) or line_num < max_rank: 96 | line = line.rstrip().split(' ') 97 | if len(line) > 1: 98 | embeddings.append(np.array(line[1:], dtype=np.float32)) 99 | self[line[0]] = cur_idx 100 | cur_idx += 1 101 | else: 102 | break 103 | try: 104 | embeddings = np.stack(embeddings) 105 | embeddings = np.pad(embeddings, ( (len(self.special_tokens),0), (0,0) ), 'constant') 106 | self._embeddings_array = np.stack(embeddings) 107 | self._embed_size = self._embeddings_array.shape[1] 108 | except: 109 | shapes = set([embedding.shape for embedding in embeddings]) 110 | raise ValueError("Couldn't stack embeddings with shapes in %s" % shapes) 111 | return 112 | 113 | #============================================================= 114 | def count(self): 115 | """""" 116 | 117 | if self.token_vocab is not None: 118 | zipf = self.token_vocab.fit_to_zipf(plot=False) 119 | zipf_freqs = zipf.predict(np.arange(len(self))+1) 120 | else: 121 | zipf_freqs = -np.log(np.arange(len(self))+1) 122 | zipf_counts = zipf_freqs / np.min(zipf_freqs) 123 | for count, token in zip(zipf_counts, self.strings()): 124 | self.counts[token] = int(count) 125 | return 126 | 127 | #============================================================= 128 | @property 129 | def token_vocab(self): 130 | return self._token_vocab 131 | @property 132 | def token_embed_size(self): 133 | return (self.token_vocab or self).embed_size 134 | @property 135 | def embeddings(self): 136 | return super(PretrainedVocab, self).embeddings 137 | #@embeddings.setter 138 | #def embeddings(self, matrix): 139 | # self._embed_size = matrix.shape[1] 140 | # with tf.device('/cpu:0'): 141 | # with tf.variable_scope(self.name.title()): 142 | # self._embeddings = tf.Variable(matrix, name='Embeddings', trainable=False) 143 | # return 144 | 145 | #*************************************************************** 146 | if __name__ == '__main__': 147 | """""" 148 | 149 | pretrained_vocab = PretrainedVocab(None) 150 | print('PretrainedVocab passes') 151 | -------------------------------------------------------------------------------- /StanfordBiaffineParser-v2/parser/vocabs/token_vocab.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: UTF-8 -*- 3 | 4 | # Copyright 2016 Timothy Dozat 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | import re 24 | import codecs 25 | from collections import Counter 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from parser.vocabs.base_vocab import BaseVocab 31 | from parser.misc.zipf import Zipf 32 | 33 | __all__ = ['WordVocab', 'LemmaVocab', 'TagVocab', 'XTagVocab', 'RelVocab'] 34 | 35 | #*************************************************************** 36 | class TokenVocab(BaseVocab): 37 | """""" 38 | 39 | #============================================================= 40 | def __init__(self, *args, **kwargs): 41 | """""" 42 | 43 | recount = kwargs.pop('recount', False) 44 | initialize_zero = kwargs.pop('initialize_zero', True) 45 | super(TokenVocab, self).__init__(*args, **kwargs) 46 | 47 | if recount: 48 | self.count() 49 | else: 50 | if os.path.isfile(self.filename): 51 | self.load() 52 | else: 53 | self.count() 54 | self.dump() 55 | self.index_vocab() 56 | 57 | embed_dims = [len(self), self.embed_size] 58 | if initialize_zero: 59 | self._embeddings_array = np.zeros(embed_dims) 60 | else: 61 | self._embeddings_array = np.random.randn(*embed_dims) 62 | return 63 | 64 | #============================================================= 65 | def setup(self): 66 | """""" 67 | 68 | self.placeholder = None 69 | del self._embeddings 70 | with tf.device('/cpu:0'): 71 | with tf.variable_scope(self.name.title()): 72 | self._embeddings = tf.Variable(self._embeddings_array, name='Embeddings', dtype=tf.float32, trainable=True) 73 | return 74 | 75 | 76 | #============================================================= 77 | def count(self, conll_files=None): 78 | """""" 79 | 80 | if conll_files is None: 81 | conll_files = self.train_files 82 | 83 | for conll_file in conll_files: 84 | with codecs.open(conll_file, encoding='utf-8', errors='ignore') as f: 85 | for line_num, line in enumerate(f): 86 | try: 87 | line = line.strip() 88 | if line and not line.startswith('#'): 89 | line = line.split('\t') 90 | assert len(line) == 10 91 | token = line[self.conll_idx] 92 | if not self.cased: 93 | token = token.lower() 94 | self.counts[token] += 1 95 | except: 96 | raise ValueError('File %s is misformatted at line %d' % (conll_file, line_num+1)) 97 | return 98 | 99 | #============================================================= 100 | def load(self): 101 | """""" 102 | 103 | with codecs.open(self.filename, encoding='utf-8') as f: 104 | for line_num, line in enumerate(f): 105 | try: 106 | line = line.rstrip() 107 | if line: 108 | line = line.split('\t') 109 | token, count = line 110 | self.counts[token] = int(count) 111 | except: 112 | raise ValueError('File %s is misformatted at line %d' % (train_file, line_num+1)) 113 | return 114 | 115 | #============================================================= 116 | def dump(self): 117 | """""" 118 | 119 | with codecs.open(self.filename, 'w', encoding='utf-8') as f: 120 | for word, count in self.sorted_counts(self.counts): 121 | f.write('%s\t%d\n' % (word, count)) 122 | return 123 | 124 | #============================================================= 125 | def index_vocab(self): 126 | """""" 127 | 128 | for token, count in self.sorted_counts(self.counts): 129 | if ((count >= self.min_occur_count) and 130 | token not in self and 131 | (not self.max_rank or len(self) < self.max_rank)): 132 | self[token] = len(self) 133 | return 134 | 135 | #============================================================= 
136 | def fit_to_zipf(self, plot=True): 137 | """""" 138 | 139 | zipf = Zipf.from_configurable(self, self.counts, name='zipf-%s'%self.name) 140 | if plot: 141 | zipf.plot() 142 | return zipf 143 | 144 | #============================================================= 145 | @staticmethod 146 | def sorted_counts(counts): 147 | return sorted(counts.most_common(), key=lambda x: (-x[1], x[0])) 148 | 149 | #============================================================= 150 | @property 151 | def conll_idx(self): 152 | return self._conll_idx 153 | 154 | #*************************************************************** 155 | class WordVocab(TokenVocab): 156 | _conll_idx = 1 157 | class LemmaVocab(WordVocab): 158 | _conll_idx = 2 159 | class TagVocab(TokenVocab): 160 | _conll_idx = 3 161 | class XTagVocab(TagVocab): 162 | _conll_idx = 4 163 | class RelVocab(TokenVocab): 164 | _conll_idx = 7 165 | 166 | #*************************************************************** 167 | if __name__ == '__main__': 168 | """""" 169 | 170 | from parser import Configurable 171 | from parser.vocabs import PretrainedVocab, TokenVocab, WordVocab 172 | 173 | configurable = Configurable() 174 | if os.path.isfile('saves/defaults/words.txt'): 175 | os.remove('saves/defaults/words.txt') 176 | token_vocab = WordVocab.from_configurable(configurable, 1) 177 | token_vocab = WordVocab.from_configurable(configurable, 1) 178 | token_vocab.fit_to_zipf() 179 | #pretrained_vocab = PretrainedVocab.from_vocab(token_vocab) 180 | #assert min(pretrained_vocab.counts.values()) == 1 181 | print('TokenVocab passed') 182 | -------------------------------------------------------------------------------- /convert_NLP4J_to_CoNLL.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import sys 4 | 5 | #Convert NLP4J's 9-column output into CoNLL's 10-column format 6 | def convert(inputFile): 7 | writer = open(inputFile + ".conll", "w") 8 | for line in open(inputFile, "r").readlines(): 9 | eles = line.strip().split() 10 | if len(eles) == 0: 11 | writer.write("\n") 12 | else: 13 | eles[4] = "_" 14 | eles.insert(4, eles[3]) 15 | eles[8] = "_" 16 | eles[9] = "_" 17 | writer.write("\t".join(eles) + "\n") 18 | 19 | writer.close() 20 | 21 | if __name__ == "__main__": 22 | convert(sys.argv[1]) -------------------------------------------------------------------------------- /data/sentence_segmented.txt: -------------------------------------------------------------------------------- 1 | Induction of tyrosine phosphorylation and T-cell activation by vanadate peroxide, an inhibitor of protein tyrosine phosphatases. 2 | Rapid tyrosine phosphorylation of key cellular proteins is a crucial event in the transduction of activation signals to T-lymphocytes. 3 | The regulatory role of protein tyrosine phosphatases (PTPases) in this process was explored by studying the effects of a powerful PTPase inhibitor, vanadate peroxide (pervanadate), on the activation cascade of Jurkat human leukaemic T-cells. -------------------------------------------------------------------------------- /data/tokenized_sentence_segmented.txt: -------------------------------------------------------------------------------- 1 | Induction of tyrosine phosphorylation and T-cell activation by vanadate peroxide , an inhibitor of protein tyrosine phosphatases . 2 | Rapid tyrosine phosphorylation of key cellular proteins is a crucial event in the transduction of activation signals to T-lymphocytes . 
3 | The regulatory role of protein tyrosine phosphatases ( PTPases ) in this process was explored by studying the effects of a powerful PTPase inhibitor , vanadate peroxide ( pervanadate ) , on the activation cascade of Jurkat human leukaemic T-cells . -------------------------------------------------------------------------------- /data/tokenized_sentence_segmented.txt.column: -------------------------------------------------------------------------------- 1 | 1 Induction _ _ _ _ _ _ _ _ 2 | 2 of _ _ _ _ _ _ _ _ 3 | 3 tyrosine _ _ _ _ _ _ _ _ 4 | 4 phosphorylation _ _ _ _ _ _ _ _ 5 | 5 and _ _ _ _ _ _ _ _ 6 | 6 T-cell _ _ _ _ _ _ _ _ 7 | 7 activation _ _ _ _ _ _ _ _ 8 | 8 by _ _ _ _ _ _ _ _ 9 | 9 vanadate _ _ _ _ _ _ _ _ 10 | 10 peroxide _ _ _ _ _ _ _ _ 11 | 11 , _ _ _ _ _ _ _ _ 12 | 12 an _ _ _ _ _ _ _ _ 13 | 13 inhibitor _ _ _ _ _ _ _ _ 14 | 14 of _ _ _ _ _ _ _ _ 15 | 15 protein _ _ _ _ _ _ _ _ 16 | 16 tyrosine _ _ _ _ _ _ _ _ 17 | 17 phosphatases _ _ _ _ _ _ _ _ 18 | 18 . _ _ _ _ _ _ _ _ 19 | 20 | 1 Rapid _ _ _ _ _ _ _ _ 21 | 2 tyrosine _ _ _ _ _ _ _ _ 22 | 3 phosphorylation _ _ _ _ _ _ _ _ 23 | 4 of _ _ _ _ _ _ _ _ 24 | 5 key _ _ _ _ _ _ _ _ 25 | 6 cellular _ _ _ _ _ _ _ _ 26 | 7 proteins _ _ _ _ _ _ _ _ 27 | 8 is _ _ _ _ _ _ _ _ 28 | 9 a _ _ _ _ _ _ _ _ 29 | 10 crucial _ _ _ _ _ _ _ _ 30 | 11 event _ _ _ _ _ _ _ _ 31 | 12 in _ _ _ _ _ _ _ _ 32 | 13 the _ _ _ _ _ _ _ _ 33 | 14 transduction _ _ _ _ _ _ _ _ 34 | 15 of _ _ _ _ _ _ _ _ 35 | 16 activation _ _ _ _ _ _ _ _ 36 | 17 signals _ _ _ _ _ _ _ _ 37 | 18 to _ _ _ _ _ _ _ _ 38 | 19 T-lymphocytes _ _ _ _ _ _ _ _ 39 | 20 . _ _ _ _ _ _ _ _ 40 | 41 | 1 The _ _ _ _ _ _ _ _ 42 | 2 regulatory _ _ _ _ _ _ _ _ 43 | 3 role _ _ _ _ _ _ _ _ 44 | 4 of _ _ _ _ _ _ _ _ 45 | 5 protein _ _ _ _ _ _ _ _ 46 | 6 tyrosine _ _ _ _ _ _ _ _ 47 | 7 phosphatases _ _ _ _ _ _ _ _ 48 | 8 ( _ _ _ _ _ _ _ _ 49 | 9 PTPases _ _ _ _ _ _ _ _ 50 | 10 ) _ _ _ _ _ _ _ _ 51 | 11 in _ _ _ _ _ _ _ _ 52 | 12 this _ _ _ _ _ _ _ _ 53 | 13 process _ _ _ _ _ _ _ _ 54 | 14 was _ _ _ _ _ _ _ _ 55 | 15 explored _ _ _ _ _ _ _ _ 56 | 16 by _ _ _ _ _ _ _ _ 57 | 17 studying _ _ _ _ _ _ _ _ 58 | 18 the _ _ _ _ _ _ _ _ 59 | 19 effects _ _ _ _ _ _ _ _ 60 | 20 of _ _ _ _ _ _ _ _ 61 | 21 a _ _ _ _ _ _ _ _ 62 | 22 powerful _ _ _ _ _ _ _ _ 63 | 23 PTPase _ _ _ _ _ _ _ _ 64 | 24 inhibitor _ _ _ _ _ _ _ _ 65 | 25 , _ _ _ _ _ _ _ _ 66 | 26 vanadate _ _ _ _ _ _ _ _ 67 | 27 peroxide _ _ _ _ _ _ _ _ 68 | 28 ( _ _ _ _ _ _ _ _ 69 | 29 pervanadate _ _ _ _ _ _ _ _ 70 | 30 ) _ _ _ _ _ _ _ _ 71 | 31 , _ _ _ _ _ _ _ _ 72 | 32 on _ _ _ _ _ _ _ _ 73 | 33 the _ _ _ _ _ _ _ _ 74 | 34 activation _ _ _ _ _ _ _ _ 75 | 35 cascade _ _ _ _ _ _ _ _ 76 | 36 of _ _ _ _ _ _ _ _ 77 | 37 Jurkat _ _ _ _ _ _ _ _ 78 | 38 human _ _ _ _ _ _ _ _ 79 | 39 leukaemic _ _ _ _ _ _ _ _ 80 | 40 T-cells _ _ _ _ _ _ _ _ 81 | 41 . 
_ _ _ _ _ _ _ _ 82 | 83 | -------------------------------------------------------------------------------- /get_ColumnFormat.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | import sys 4 | 5 | #Convert word-segmented corpus into 10-column format for dependency parsing 6 | def convert(inputFilePath): 7 | writer = open(inputFilePath + ".column", "w") 8 | lines = open(inputFilePath, "r").readlines() 9 | for line in lines: 10 | tok = line.strip().split() 11 | if not tok or line.strip() == '': 12 | writer.write("\n") 13 | else: 14 | count = 0 15 | for word in tok: 16 | count += 1 17 | writer.write(str(count) + "\t" + word + "\t" + '\t'.join(['_'] * 8) + "\n") 18 | writer.write("\n") 19 | writer.close() 20 | 21 | if __name__ == "__main__": 22 | convert(sys.argv[1]) 23 | pass -------------------------------------------------------------------------------- /jPTDP-v1/README.md: -------------------------------------------------------------------------------- 1 | jPTDP: Neural network models for joint POS tagging and dependency parsing 2 | 3 | See [https://github.com/datquocnguyen/jPTDP](https://github.com/datquocnguyen/jPTDP) for more details. --------------------------------------------------------------------------------
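
The two root-level helpers dumped above, `get_ColumnFormat.py` and `convert_NLP4J_to_CoNLL.py`, are normally invoked from the command line with a single file argument. As a minimal sketch — assuming the scripts are importable from the repository root, and using `output.nlp4j` purely as a hypothetical stand-in for an NLP4J output file — they could also be chained from Python like this:

```python
# Hypothetical usage sketch (not part of the repository). Assumes it is run from
# the repo root so both helper modules are importable; "output.nlp4j" is a
# placeholder name for whatever 9-column file NLP4J actually writes.
from get_ColumnFormat import convert as to_column_format
from convert_NLP4J_to_CoNLL import convert as nlp4j_to_conll

# Tokenized, sentence-segmented text -> empty 10-column parser input
# (writes data/tokenized_sentence_segmented.txt.column next to the input).
to_column_format('data/tokenized_sentence_segmented.txt')

# NLP4J's 9-column output -> CoNLL 10-column format
# (writes output.nlp4j.conll next to the hypothetical input file).
nlp4j_to_conll('output.nlp4j')
```

Running the scripts directly (e.g. `python get_ColumnFormat.py <file>`) has the same effect, since each script's `__main__` block simply calls its `convert()` function on `sys.argv[1]`.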