├── NLP4J
│   ├── bin
│   │   ├── nlpdecode
│   │   ├── nlpdecode.bat
│   │   ├── version
│   │   └── version.bat
│   ├── config-CRAFT.xml
│   ├── config-GENIA.xml
│   ├── etc
│   │   └── log4j.properties
│   ├── lexica
│   │   ├── en-ambiguity-classes-simplified-lowercase.xz
│   │   └── en-brown-clusters-simplified-lowercase.xz
│   ├── models
│   │   ├── CRAFT.DEP.model.xz
│   │   ├── CRAFT.POS.model.xz
│   │   ├── GENIA.DEP.model.xz
│   │   └── GENIA.POS.model.xz
│   └── repo
│       ├── args4j
│       │   └── args4j
│       │       └── 2.32
│       │           └── args4j-2.32.jar
│       ├── edu
│       │   └── emory
│       │       └── mathcs
│       │           └── nlp
│       │               ├── nlp4j-api
│       │               │   └── 1.1.4-SNAPSHOT
│       │               │       └── nlp4j-api-1.1.4-SNAPSHOT.jar
│       │               └── nlp4j-cli
│       │                   └── 1.1.4-SNAPSHOT
│       │                       └── nlp4j-cli-1.1.4-SNAPSHOT.jar
│       ├── it
│       │   └── unimi
│       │       └── dsi
│       │           └── fastutil
│       │               └── 7.0.12
│       │                   └── fastutil-7.0.12.jar
│       ├── log4j
│       │   └── log4j
│       │       └── 1.2.17
│       │           └── log4j-1.2.17.jar
│       └── org
│           ├── apache
│           │   └── commons
│           │       ├── commons-csv
│           │       │   └── 1.2
│           │       │       └── commons-csv-1.2.jar
│           │       └── commons-math3
│           │           └── 3.5
│           │               └── commons-math3-3.5.jar
│           ├── magicwerk
│           │   └── brownies-collections
│           │       └── 0.9.13
│           │           └── brownies-collections-0.9.13.jar
│           ├── slf4j
│           │   ├── slf4j-api
│           │   │   └── 1.7.21
│           │   │       └── slf4j-api-1.7.21.jar
│           │   └── slf4j-log4j12
│           │       └── 1.7.21
│           │           └── slf4j-log4j12-1.7.21.jar
│           └── tukaani
│               └── xz
│                   └── 1.5
│                       └── xz-1.5.jar
├── README.md
├── StanfordBiaffineParser-v2
│   ├── config
│   │   ├── CRAFT.cfg
│   │   ├── GENIA.cfg
│   │   ├── defaults.cfg
│   │   └── template.cfg
│   ├── main.py
│   └── parser
│       ├── __init__.py
│       ├── bucket.py
│       ├── configurable.py
│       ├── dataset.py
│       ├── misc
│       │   ├── __init__.py
│       │   ├── bucketer.py
│       │   ├── colors.py
│       │   ├── get_encoding.py
│       │   ├── mst.py
│       │   └── zipf.py
│       ├── multibucket.py
│       ├── network.py
│       ├── neural
│       │   ├── __init__.py
│       │   ├── functions.py
│       │   ├── linalg.py
│       │   ├── models
│       │   │   ├── __init__.py
│       │   │   ├── embeds
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base_embed.py
│       │   │   │   ├── cnn_embed.py
│       │   │   │   ├── mlp_embed.py
│       │   │   │   └── rnn_embed.py
│       │   │   ├── nlp
│       │   │   │   ├── __init__.py
│       │   │   │   ├── parsers
│       │   │   │   │   ├── __init__.py
│       │   │   │   │   ├── base_parser.py
│       │   │   │   │   ├── bin_parser.py
│       │   │   │   │   ├── fish_parser.py
│       │   │   │   │   ├── gama_parser.py
│       │   │   │   │   ├── parser.py
│       │   │   │   │   └── xbar_parser.py
│       │   │   │   └── taggers
│       │   │   │       ├── __init__.py
│       │   │   │       ├── base_tagger.py
│       │   │   │       ├── base_xtagger.py
│       │   │   │       ├── tagger.py
│       │   │   │       └── xtagger.py
│       │   │   └── nn.py
│       │   ├── optimizers
│       │   │   ├── __init__.py
│       │   │   ├── base_optimizer.py
│       │   │   ├── radam_optimizer.py
│       │   │   └── sgd_optimizer.py
│       │   ├── recur_cells
│       │   │   ├── .directory
│       │   │   ├── __init__.py
│       │   │   ├── base_cell.py
│       │   │   ├── cif_lstm_cell.py
│       │   │   ├── gru_cell.py
│       │   │   ├── lstm_cell.py
│       │   │   └── rnn_cell.py
│       │   └── rnn.py
│       ├── scripts
│       │   ├── compression_ratio.py
│       │   ├── count_nonprojective.py
│       │   ├── heaps_law.py
│       │   └── reinsert_compounds.py
│       ├── trash
│       │   ├── retrained_vocab.py
│       │   └── weighted_mean.py
│       └── vocabs
│           ├── __init__.py
│           ├── base_vocab.py
│           ├── index_vocab.py
│           ├── multivocab.py
│           ├── ngram_multivocab.py
│           ├── ngram_vocab.py
│           ├── pretrained_vocab.py
│           ├── subtoken_vocab.py
│           └── token_vocab.py
├── convert_NLP4J_to_CoNLL.py
├── data
│   ├── raw.txt
│   ├── sentence_segmented.txt
│   ├── tokenized_sentence_segmented.txt
│   └── tokenized_sentence_segmented.txt.column
├── get_ColumnFormat.py
└── jPTDP-v1
    └── README.md
/NLP4J/bin/nlpdecode:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # ----------------------------------------------------------------------------
3 | # Copyright 2001-2006 The Apache Software Foundation.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # ----------------------------------------------------------------------------
17 | #
18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights
19 | # reserved.
20 |
21 |
22 | # resolve links - $0 may be a softlink
23 | PRG="$0"
24 |
25 | while [ -h "$PRG" ]; do
26 | ls=`ls -ld "$PRG"`
27 | link=`expr "$ls" : '.*-> \(.*\)$'`
28 | if expr "$link" : '/.*' > /dev/null; then
29 | PRG="$link"
30 | else
31 | PRG=`dirname "$PRG"`/"$link"
32 | fi
33 | done
34 |
35 | PRGDIR=`dirname "$PRG"`
36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd`
37 |
38 | # Reset the REPO variable. If you need to influence this use the environment setup file.
39 | REPO=
40 |
41 |
42 | # OS specific support. $var _must_ be set to either true or false.
43 | cygwin=false;
44 | darwin=false;
45 | case "`uname`" in
46 | CYGWIN*) cygwin=true ;;
47 | Darwin*) darwin=true
48 | if [ -z "$JAVA_VERSION" ] ; then
49 | JAVA_VERSION="CurrentJDK"
50 | else
51 | echo "Using Java version: $JAVA_VERSION"
52 | fi
53 | if [ -z "$JAVA_HOME" ]; then
54 | if [ -x "/usr/libexec/java_home" ]; then
55 | JAVA_HOME=`/usr/libexec/java_home`
56 | else
57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home
58 | fi
59 | fi
60 | ;;
61 | esac
62 |
63 | if [ -z "$JAVA_HOME" ] ; then
64 | if [ -r /etc/gentoo-release ] ; then
65 | JAVA_HOME=`java-config --jre-home`
66 | fi
67 | fi
68 |
69 | # For Cygwin, ensure paths are in UNIX format before anything is touched
70 | if $cygwin ; then
71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
73 | fi
74 |
75 | # If a specific java binary isn't specified search for the standard 'java' binary
76 | if [ -z "$JAVACMD" ] ; then
77 | if [ -n "$JAVA_HOME" ] ; then
78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
79 | # IBM's JDK on AIX uses strange locations for the executables
80 | JAVACMD="$JAVA_HOME/jre/sh/java"
81 | else
82 | JAVACMD="$JAVA_HOME/bin/java"
83 | fi
84 | else
85 | JAVACMD=`which java`
86 | fi
87 | fi
88 |
89 | if [ ! -x "$JAVACMD" ] ; then
90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2
91 | echo " We cannot execute $JAVACMD" 1>&2
92 | exit 1
93 | fi
94 |
95 | if [ -z "$REPO" ]
96 | then
97 | REPO="$BASEDIR"/repo
98 | fi
99 |
100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar:"$REPO"/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar:"$REPO"/org/tukaani/xz/1.5/xz-1.5.jar:"$REPO"/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar:"$REPO"/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar:"$REPO"/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar:"$REPO"/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar:"$REPO"/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar:"$REPO"/log4j/log4j/1.2.17/log4j-1.2.17.jar:"$REPO"/args4j/args4j/2.32/args4j-2.32.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar
101 |
102 | ENDORSED_DIR=
103 | if [ -n "$ENDORSED_DIR" ] ; then
104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH
105 | fi
106 |
107 | if [ -n "$CLASSPATH_PREFIX" ] ; then
108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH
109 | fi
110 |
111 | # For Cygwin, switch paths to Windows format before running java
112 | if $cygwin; then
113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"`
116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"`
117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"`
118 | fi
119 |
120 | exec "$JAVACMD" $JAVA_OPTS -Xmx8g -XX:+UseConcMarkSweepGC \
121 | -classpath "$CLASSPATH" \
122 | -Dapp.name="nlpdecode" \
123 | -Dapp.pid="$$" \
124 | -Dapp.repo="$REPO" \
125 | -Dapp.home="$BASEDIR" \
126 | -Dbasedir="$BASEDIR" \
127 | edu.emory.mathcs.nlp.bin.NLPDecode \
128 | "$@"
129 |
--------------------------------------------------------------------------------
/NLP4J/bin/nlpdecode.bat:
--------------------------------------------------------------------------------
1 | @REM ----------------------------------------------------------------------------
2 | @REM Copyright 2001-2006 The Apache Software Foundation.
3 | @REM
4 | @REM Licensed under the Apache License, Version 2.0 (the "License");
5 | @REM you may not use this file except in compliance with the License.
6 | @REM You may obtain a copy of the License at
7 | @REM
8 | @REM http://www.apache.org/licenses/LICENSE-2.0
9 | @REM
10 | @REM Unless required by applicable law or agreed to in writing, software
11 | @REM distributed under the License is distributed on an "AS IS" BASIS,
12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @REM See the License for the specific language governing permissions and
14 | @REM limitations under the License.
15 | @REM ----------------------------------------------------------------------------
16 | @REM
17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights
18 | @REM reserved.
19 |
20 | @echo off
21 |
22 | set ERROR_CODE=0
23 |
24 | :init
25 | @REM Decide how to startup depending on the version of windows
26 |
27 | @REM -- Win98ME
28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg
29 |
30 | @REM set local scope for the variables with windows NT shell
31 | if "%OS%"=="Windows_NT" @setlocal
32 |
33 | @REM -- 4NT shell
34 | if "%eval[2+2]" == "4" goto 4NTArgs
35 |
36 | @REM -- Regular WinNT shell
37 | set CMD_LINE_ARGS=%*
38 | goto WinNTGetScriptDir
39 |
40 | @REM The 4NT Shell from jp software
41 | :4NTArgs
42 | set CMD_LINE_ARGS=%$
43 | goto WinNTGetScriptDir
44 |
45 | :Win9xArg
46 | @REM Slurp the command line arguments. This loop allows for an unlimited number
47 | @REM of arguments (up to the command line limit, anyway).
48 | set CMD_LINE_ARGS=
49 | :Win9xApp
50 | if %1a==a goto Win9xGetScriptDir
51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1
52 | shift
53 | goto Win9xApp
54 |
55 | :Win9xGetScriptDir
56 | set SAVEDIR=%CD%
57 | %0\
58 | cd %0\..\..
59 | set BASEDIR=%CD%
60 | cd %SAVEDIR%
61 | set SAVE_DIR=
62 | goto repoSetup
63 |
64 | :WinNTGetScriptDir
65 | set BASEDIR=%~dp0\..
66 |
67 | :repoSetup
68 | set REPO=
69 |
70 |
71 | if "%JAVACMD%"=="" set JAVACMD=java
72 |
73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo
74 |
75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-english\1.1.2\nlp4j-english-1.1.2.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-api\1.1.4-SNAPSHOT\nlp4j-api-1.1.4-SNAPSHOT.jar;"%REPO%"\org\slf4j\slf4j-api\1.7.21\slf4j-api-1.7.21.jar;"%REPO%"\org\tukaani\xz\1.5\xz-1.5.jar;"%REPO%"\it\unimi\dsi\fastutil\7.0.12\fastutil-7.0.12.jar;"%REPO%"\org\magicwerk\brownies-collections\0.9.13\brownies-collections-0.9.13.jar;"%REPO%"\org\apache\commons\commons-math3\3.5\commons-math3-3.5.jar;"%REPO%"\org\apache\commons\commons-csv\1.2\commons-csv-1.2.jar;"%REPO%"\org\slf4j\slf4j-log4j12\1.7.21\slf4j-log4j12-1.7.21.jar;"%REPO%"\log4j\log4j\1.2.17\log4j-1.2.17.jar;"%REPO%"\args4j\args4j\2.32\args4j-2.32.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-cli\1.1.4-SNAPSHOT\nlp4j-cli-1.1.4-SNAPSHOT.jar
76 |
77 | set ENDORSED_DIR=
78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH%
79 |
80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH%
81 |
82 | @REM Reaching here means variables are defined and arguments have been captured
83 | :endInit
84 |
85 | %JAVACMD% %JAVA_OPTS% -Xmx8g -XX:+UseConcMarkSweepGC -classpath %CLASSPATH% -Dapp.name="nlpdecode" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.emory.mathcs.nlp.bin.NLPDecode %CMD_LINE_ARGS%
86 | if %ERRORLEVEL% NEQ 0 goto error
87 | goto end
88 |
89 | :error
90 | if "%OS%"=="Windows_NT" @endlocal
91 | set ERROR_CODE=%ERRORLEVEL%
92 |
93 | :end
94 | @REM set local scope for the variables with windows NT shell
95 | if "%OS%"=="Windows_NT" goto endNT
96 |
97 | @REM For old DOS remove the set variables from ENV - we assume they were not set
98 | @REM before we started - at least we don't leave any baggage around
99 | set CMD_LINE_ARGS=
100 | goto postExec
101 |
102 | :endNT
103 | @REM If error code is set to 1 then the endlocal was done already in :error.
104 | if %ERROR_CODE% EQU 0 @endlocal
105 |
106 |
107 | :postExec
108 |
109 | if "%FORCE_EXIT_ON_ERROR%" == "on" (
110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE%
111 | )
112 |
113 | exit /B %ERROR_CODE%
114 |
--------------------------------------------------------------------------------
/NLP4J/bin/version:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # ----------------------------------------------------------------------------
3 | # Copyright 2001-2006 The Apache Software Foundation.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # ----------------------------------------------------------------------------
17 | #
18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights
19 | # reserved.
20 |
21 |
22 | # resolve links - $0 may be a softlink
23 | PRG="$0"
24 |
25 | while [ -h "$PRG" ]; do
26 | ls=`ls -ld "$PRG"`
27 | link=`expr "$ls" : '.*-> \(.*\)$'`
28 | if expr "$link" : '/.*' > /dev/null; then
29 | PRG="$link"
30 | else
31 | PRG=`dirname "$PRG"`/"$link"
32 | fi
33 | done
34 |
35 | PRGDIR=`dirname "$PRG"`
36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd`
37 |
38 | # Reset the REPO variable. If you need to influence this use the environment setup file.
39 | REPO=
40 |
41 |
42 | # OS specific support. $var _must_ be set to either true or false.
43 | cygwin=false;
44 | darwin=false;
45 | case "`uname`" in
46 | CYGWIN*) cygwin=true ;;
47 | Darwin*) darwin=true
48 | if [ -z "$JAVA_VERSION" ] ; then
49 | JAVA_VERSION="CurrentJDK"
50 | else
51 | echo "Using Java version: $JAVA_VERSION"
52 | fi
53 | if [ -z "$JAVA_HOME" ]; then
54 | if [ -x "/usr/libexec/java_home" ]; then
55 | JAVA_HOME=`/usr/libexec/java_home`
56 | else
57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home
58 | fi
59 | fi
60 | ;;
61 | esac
62 |
63 | if [ -z "$JAVA_HOME" ] ; then
64 | if [ -r /etc/gentoo-release ] ; then
65 | JAVA_HOME=`java-config --jre-home`
66 | fi
67 | fi
68 |
69 | # For Cygwin, ensure paths are in UNIX format before anything is touched
70 | if $cygwin ; then
71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"`
73 | fi
74 |
75 | # If a specific java binary isn't specified search for the standard 'java' binary
76 | if [ -z "$JAVACMD" ] ; then
77 | if [ -n "$JAVA_HOME" ] ; then
78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
79 | # IBM's JDK on AIX uses strange locations for the executables
80 | JAVACMD="$JAVA_HOME/jre/sh/java"
81 | else
82 | JAVACMD="$JAVA_HOME/bin/java"
83 | fi
84 | else
85 | JAVACMD=`which java`
86 | fi
87 | fi
88 |
89 | if [ ! -x "$JAVACMD" ] ; then
90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2
91 | echo " We cannot execute $JAVACMD" 1>&2
92 | exit 1
93 | fi
94 |
95 | if [ -z "$REPO" ]
96 | then
97 | REPO="$BASEDIR"/repo
98 | fi
99 |
100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/edu/emory/mathcs/nlp/nlp4j-english/1.1.2/nlp4j-english-1.1.2.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar:"$REPO"/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar:"$REPO"/org/tukaani/xz/1.5/xz-1.5.jar:"$REPO"/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar:"$REPO"/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar:"$REPO"/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar:"$REPO"/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar:"$REPO"/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar:"$REPO"/log4j/log4j/1.2.17/log4j-1.2.17.jar:"$REPO"/args4j/args4j/2.32/args4j-2.32.jar:"$REPO"/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar
101 |
102 | ENDORSED_DIR=
103 | if [ -n "$ENDORSED_DIR" ] ; then
104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH
105 | fi
106 |
107 | if [ -n "$CLASSPATH_PREFIX" ] ; then
108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH
109 | fi
110 |
111 | # For Cygwin, switch paths to Windows format before running java
112 | if $cygwin; then
113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"`
114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"`
115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"`
116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"`
117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"`
118 | fi
119 |
120 | exec "$JAVACMD" $JAVA_OPTS -Xmx10g -XX:+UseConcMarkSweepGC \
121 | -classpath "$CLASSPATH" \
122 | -Dapp.name="version" \
123 | -Dapp.pid="$$" \
124 | -Dapp.repo="$REPO" \
125 | -Dapp.home="$BASEDIR" \
126 | -Dbasedir="$BASEDIR" \
127 | edu.emory.mathcs.nlp.bin.Version \
128 | "$@"
129 |
--------------------------------------------------------------------------------
/NLP4J/bin/version.bat:
--------------------------------------------------------------------------------
1 | @REM ----------------------------------------------------------------------------
2 | @REM Copyright 2001-2006 The Apache Software Foundation.
3 | @REM
4 | @REM Licensed under the Apache License, Version 2.0 (the "License");
5 | @REM you may not use this file except in compliance with the License.
6 | @REM You may obtain a copy of the License at
7 | @REM
8 | @REM http://www.apache.org/licenses/LICENSE-2.0
9 | @REM
10 | @REM Unless required by applicable law or agreed to in writing, software
11 | @REM distributed under the License is distributed on an "AS IS" BASIS,
12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @REM See the License for the specific language governing permissions and
14 | @REM limitations under the License.
15 | @REM ----------------------------------------------------------------------------
16 | @REM
17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights
18 | @REM reserved.
19 |
20 | @echo off
21 |
22 | set ERROR_CODE=0
23 |
24 | :init
25 | @REM Decide how to startup depending on the version of windows
26 |
27 | @REM -- Win98ME
28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg
29 |
30 | @REM set local scope for the variables with windows NT shell
31 | if "%OS%"=="Windows_NT" @setlocal
32 |
33 | @REM -- 4NT shell
34 | if "%eval[2+2]" == "4" goto 4NTArgs
35 |
36 | @REM -- Regular WinNT shell
37 | set CMD_LINE_ARGS=%*
38 | goto WinNTGetScriptDir
39 |
40 | @REM The 4NT Shell from jp software
41 | :4NTArgs
42 | set CMD_LINE_ARGS=%$
43 | goto WinNTGetScriptDir
44 |
45 | :Win9xArg
46 | @REM Slurp the command line arguments. This loop allows for an unlimited number
47 | @REM of arguments (up to the command line limit, anyway).
48 | set CMD_LINE_ARGS=
49 | :Win9xApp
50 | if %1a==a goto Win9xGetScriptDir
51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1
52 | shift
53 | goto Win9xApp
54 |
55 | :Win9xGetScriptDir
56 | set SAVEDIR=%CD%
57 | %0\
58 | cd %0\..\..
59 | set BASEDIR=%CD%
60 | cd %SAVEDIR%
61 | set SAVE_DIR=
62 | goto repoSetup
63 |
64 | :WinNTGetScriptDir
65 | set BASEDIR=%~dp0\..
66 |
67 | :repoSetup
68 | set REPO=
69 |
70 |
71 | if "%JAVACMD%"=="" set JAVACMD=java
72 |
73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo
74 |
75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-english\1.1.2\nlp4j-english-1.1.2.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-api\1.1.4-SNAPSHOT\nlp4j-api-1.1.4-SNAPSHOT.jar;"%REPO%"\org\slf4j\slf4j-api\1.7.21\slf4j-api-1.7.21.jar;"%REPO%"\org\tukaani\xz\1.5\xz-1.5.jar;"%REPO%"\it\unimi\dsi\fastutil\7.0.12\fastutil-7.0.12.jar;"%REPO%"\org\magicwerk\brownies-collections\0.9.13\brownies-collections-0.9.13.jar;"%REPO%"\org\apache\commons\commons-math3\3.5\commons-math3-3.5.jar;"%REPO%"\org\apache\commons\commons-csv\1.2\commons-csv-1.2.jar;"%REPO%"\org\slf4j\slf4j-log4j12\1.7.21\slf4j-log4j12-1.7.21.jar;"%REPO%"\log4j\log4j\1.2.17\log4j-1.2.17.jar;"%REPO%"\args4j\args4j\2.32\args4j-2.32.jar;"%REPO%"\edu\emory\mathcs\nlp\nlp4j-cli\1.1.4-SNAPSHOT\nlp4j-cli-1.1.4-SNAPSHOT.jar
76 |
77 | set ENDORSED_DIR=
78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH%
79 |
80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH%
81 |
82 | @REM Reaching here means variables are defined and arguments have been captured
83 | :endInit
84 |
85 | %JAVACMD% %JAVA_OPTS% -Xmx10g -XX:+UseConcMarkSweepGC -classpath %CLASSPATH% -Dapp.name="version" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" edu.emory.mathcs.nlp.bin.Version %CMD_LINE_ARGS%
86 | if %ERRORLEVEL% NEQ 0 goto error
87 | goto end
88 |
89 | :error
90 | if "%OS%"=="Windows_NT" @endlocal
91 | set ERROR_CODE=%ERRORLEVEL%
92 |
93 | :end
94 | @REM set local scope for the variables with windows NT shell
95 | if "%OS%"=="Windows_NT" goto endNT
96 |
97 | @REM For old DOS remove the set variables from ENV - we assume they were not set
98 | @REM before we started - at least we don't leave any baggage around
99 | set CMD_LINE_ARGS=
100 | goto postExec
101 |
102 | :endNT
103 | @REM If error code is set to 1 then the endlocal was done already in :error.
104 | if %ERROR_CODE% EQU 0 @endlocal
105 |
106 |
107 | :postExec
108 |
109 | if "%FORCE_EXIT_ON_ERROR%" == "on" (
110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE%
111 | )
112 |
113 | exit /B %ERROR_CODE%
114 |
--------------------------------------------------------------------------------
/NLP4J/config-CRAFT.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
7 |
8 |
9 |
10 | lexica/en-ambiguity-classes-simplified-lowercase.xz
11 | lexica/en-brown-clusters-simplified-lowercase.xz
12 |
13 |
14 |
15 | models/CRAFT.POS.model.xz
16 | models/CRAFT.DEP.model.xz
17 |
18 |
19 |
--------------------------------------------------------------------------------
/NLP4J/config-GENIA.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
7 |
8 |
9 |
10 | lexica/en-ambiguity-classes-simplified-lowercase.xz
11 | lexica/en-brown-clusters-simplified-lowercase.xz
12 |
13 |
14 |
15 | models/GENIA.POS.model.xz
16 | models/GENIA.DEP.model.xz
17 |
18 |
19 |
--------------------------------------------------------------------------------
/NLP4J/etc/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set root logger level to DEBUG and its only appender to A1.
2 | log4j.rootLogger=INFO, A1
3 |
4 | # A1 is set to be a ConsoleAppender.
5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
6 |
7 | # A1 uses PatternLayout.
8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
9 | log4j.appender.A1.layout.conversionPattern=%m%n
10 |
--------------------------------------------------------------------------------
/NLP4J/lexica/en-ambiguity-classes-simplified-lowercase.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/lexica/en-ambiguity-classes-simplified-lowercase.xz
--------------------------------------------------------------------------------
/NLP4J/lexica/en-brown-clusters-simplified-lowercase.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/lexica/en-brown-clusters-simplified-lowercase.xz
--------------------------------------------------------------------------------
/NLP4J/models/CRAFT.DEP.model.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/CRAFT.DEP.model.xz
--------------------------------------------------------------------------------
/NLP4J/models/CRAFT.POS.model.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/CRAFT.POS.model.xz
--------------------------------------------------------------------------------
/NLP4J/models/GENIA.DEP.model.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/GENIA.DEP.model.xz
--------------------------------------------------------------------------------
/NLP4J/models/GENIA.POS.model.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/models/GENIA.POS.model.xz
--------------------------------------------------------------------------------
/NLP4J/repo/args4j/args4j/2.32/args4j-2.32.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/args4j/args4j/2.32/args4j-2.32.jar
--------------------------------------------------------------------------------
/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-api/1.1.4-SNAPSHOT/nlp4j-api-1.1.4-SNAPSHOT.jar
--------------------------------------------------------------------------------
/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/edu/emory/mathcs/nlp/nlp4j-cli/1.1.4-SNAPSHOT/nlp4j-cli-1.1.4-SNAPSHOT.jar
--------------------------------------------------------------------------------
/NLP4J/repo/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/it/unimi/dsi/fastutil/7.0.12/fastutil-7.0.12.jar
--------------------------------------------------------------------------------
/NLP4J/repo/log4j/log4j/1.2.17/log4j-1.2.17.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/log4j/log4j/1.2.17/log4j-1.2.17.jar
--------------------------------------------------------------------------------
/NLP4J/repo/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/apache/commons/commons-csv/1.2/commons-csv-1.2.jar
--------------------------------------------------------------------------------
/NLP4J/repo/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/apache/commons/commons-math3/3.5/commons-math3-3.5.jar
--------------------------------------------------------------------------------
/NLP4J/repo/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/magicwerk/brownies-collections/0.9.13/brownies-collections-0.9.13.jar
--------------------------------------------------------------------------------
/NLP4J/repo/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/slf4j/slf4j-api/1.7.21/slf4j-api-1.7.21.jar
--------------------------------------------------------------------------------
/NLP4J/repo/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/slf4j/slf4j-log4j12/1.7.21/slf4j-log4j12-1.7.21.jar
--------------------------------------------------------------------------------
/NLP4J/repo/org/tukaani/xz/1.5/xz-1.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datquocnguyen/BioPosDep/8b902edfa99f7b41b6da9da3d2575281706f2a61/NLP4J/repo/org/tukaani/xz/1.5/xz-1.5.jar
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Biomedical POS tagging and dependency parsing models
3 |
4 | Biomedical POS tagging and dependency parsing models are trained on [GENIA](http://www.geniaproject.org/) and [CRAFT](http://bionlp-corpora.sourceforge.net/CRAFT/). See [our paper](https://arxiv.org/abs/1808.03731) for more details:
5 |
6 | @Article{NguyenK2019,
7 | author="Nguyen, Dat Quoc and Verspoor, Karin",
8 | title="From POS tagging to dependency parsing for biomedical event extraction",
9 | journal="BMC Bioinformatics",
10 | year="2019",
11 | month="Feb",
12 | day="12",
13 | volume="20",
14 | number="1",
15 | pages="72",
16 | doi="10.1186/s12859-019-2604-0",
17 | url="https://doi.org/10.1186/s12859-019-2604-0"
18 | }
19 |
20 | Our models are **free** for non-commercial use and distributed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International ([CC BY-NC-SA](https://creativecommons.org/licenses/by-nc-sa/4.0/)) License.
21 |
22 |
23 |
24 | # Usage
25 |
26 | #### The first step is to perform POS tagging and dependency parsing using [NLP4J](https://emorynlp.github.io/nlp4j/) models. NLP4J also performs _TOKENIZATION_ and _SENTENCE SEGMENTATION_ if the input files are raw text corpora. The NLP4J output is then used as input for the other dependency parsing models.
27 |
28 | ### Perform biomedical POS tagging and dependency parsing using retrained NLP4J models
29 |
30 | #### Installation
31 |
32 | Download NLP4J models from [https://github.com/datquocnguyen/BioPosDep/archive/master.zip](https://github.com/datquocnguyen/BioPosDep/archive/master.zip) (70MB) or clone these models using `git`:
33 |
34 | $ git clone https://github.com/datquocnguyen/BioPosDep.git
35 |
36 | To run the models, `Java` must already be available from the command line or terminal.
37 |
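For example, a quick way to check the setup from the `NLP4J` folder (the bundled `bin/version` wrapper simply prints the NLP4J version; `chmod +x bin/*` may be needed if the scripts lost their execute permission when downloaded as a zip):

    BioPosDep/NLP4J$ java -version
    BioPosDep/NLP4J$ bin/version
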
38 | #### Command line
39 |
40 | # Using models trained on GENIA
41 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i <input-path> -format <string> [-ie <string> -oe <string>]
42 |
43 | # Using models trained on CRAFT
44 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i <input-path> -format <string> [-ie <string> -oe <string>]
45 |
46 | -i <input-path> : input path (required)
47 | -format <string> : format of the input data (raw|line|tsv; default: raw)
48 | -ie <string> : input file extension (default: *)
49 | -oe <string> : output file extension (default: nlp)
50 |
51 | - `-i` specifies the input path, pointing to either a file or a directory. When the path points to a file, only that file is processed; when it points to a directory, all files with the file extension `-ie` under that directory are processed.
52 | - `-format` specifies the format of the input file: `raw`, `line`, or `tsv` (a short illustration follows this list)
53 | - `raw` accepts text in any format
54 | - `line` expects one sentence per line
55 | - `tsv` expects columns delimited by `\t` and sentences separated by `\n`
56 | - `-ie` specifies the input file extension. The default value `*` matches files with any extension. This option is used only when the input path `-i` points to a directory.
57 | - `-oe` specifies the output file extension appended to each input filename; the corresponding output file, containing the NLP output, will be generated.
58 |
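As a short illustration of the `line` and `tsv` options, here is an invented sentence in both layouts; the token index in the first column and the exact two-column layout are assumptions made for illustration only, mirroring the note in the Examples below that word forms are expected in the second column:

    # line: one (untokenized) sentence per line
    MGMT is involved in DNA repair.

    # tsv: tab-separated columns, word forms in the second column
    1	MGMT
    2	is
    3	involved
    4	in
    5	DNA
    6	repair
    7	.
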
59 | #### Examples
60 |
61 | # For a raw corpus input
62 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/raw.txt -format raw -oe genia
63 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/raw.txt -format raw -oe craft
64 |
65 | # For a sentence-segmented corpus input (without tokenization!)
66 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/sentence_segmented.txt -format line -oe genia
67 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/sentence_segmented.txt -format line -oe craft
68 |
69 | # For a "pre-processed" tokenized and sentence-segmented corpus
70 | # Convert into a column-based format (a sketch of this kind of conversion is shown after the examples)
71 | BioPosDep/NLP4J$ python ../get_ColumnFormat.py ../data/tokenized_sentence_segmented.txt
72 | # Apply models using "tsv". Here word forms are expected in the second column (i.e. column index 1).
73 | # Adjust the word-form column setting in config-GENIA.xml and config-CRAFT.xml if you already have a column-formatted corpus with a different index for the word form column.
74 | BioPosDep/NLP4J$ bin/nlpdecode -c config-GENIA.xml -i ../data/tokenized_sentence_segmented.txt.column -format tsv -oe genia
75 | BioPosDep/NLP4J$ bin/nlpdecode -c config-CRAFT.xml -i ../data/tokenized_sentence_segmented.txt.column -format tsv -oe craft
76 |
77 |
78 | From the examples above, output files with extensions `.genia` and `.craft` are generated in the folder `data`, containing POS and dependency annotations.
79 |
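A minimal sketch of this kind of plain-text-to-column conversion (not the actual `get_ColumnFormat.py`; the token index in the first column, the word form in the second, and the blank line between sentences are assumptions made for illustration):

    # hypothetical_to_column.py: turn "one tokenized sentence per line" text into a two-column TSV
    import sys

    with open(sys.argv[1]) as fin, open(sys.argv[1] + '.column', 'w') as fout:
        for sentence in fin:
            tokens = sentence.split()
            if not tokens:            # skip empty lines in the input
                continue
            for i, form in enumerate(tokens, 1):
                fout.write('{}\t{}\n'.format(i, form))   # token index, word form
            fout.write('\n')          # blank line as sentence boundary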
80 |
81 | #### NOTE
82 | These NLP4J output files are in a 9-column format. Before applying the other dependency parsing models, they must be converted to a 10-column CoNLL format:
83 |
84 | # Command line
85 | BioPosDep$ python convert_NLP4J_to_CoNLL.py <file-path>
86 |
87 | # Examples
88 | BioPosDep$ python convert_NLP4J_to_CoNLL.py data/raw.txt.genia
89 | BioPosDep$ python convert_NLP4J_to_CoNLL.py data/raw.txt.craft
90 |
91 | ##### Two 10-column output files `raw.txt.genia.conll` and `raw.txt.craft.conll` are generated in the folder `data`; these will be used as inputs for the other parsing models.
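
For reference, the 10 columns of the standard CoNLL(-X) dependency format are:

    ID  FORM  LEMMA  CPOSTAG  POSTAG  FEATS  HEAD  DEPREL  PHEAD  PDEPREL

with `_` conventionally used for columns that carry no value. Exactly which of the 9 NLP4J columns map to which of these fields is determined by `convert_NLP4J_to_CoNLL.py` and is not restated here.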
92 |
93 | ### Using retrained Stanford [Biaffine](https://github.com/tdozat/Parser-v2) parsing models
94 |
95 | #### Installation
96 |
97 | # Install prerequisite packages
98 | BioPosDep/StanfordBiaffineParser-v2$ virtualenv .TF1_0
99 | BioPosDep/StanfordBiaffineParser-v2$ source .TF1_0/bin/activate
100 | BioPosDep/StanfordBiaffineParser-v2$ pip install tensorflow==1.0
101 | BioPosDep/StanfordBiaffineParser-v2$ pip install numpy==1.11.0
102 | BioPosDep/StanfordBiaffineParser-v2$ pip install scipy==1.0.0
103 | BioPosDep/StanfordBiaffineParser-v2$ pip install matplotlib==2.1.2
104 | BioPosDep/StanfordBiaffineParser-v2$ pip install backports.lzma
105 |
106 | - Download file `Pre-trained-Biaffine-v2.zip` from [HERE](https://drive.google.com/file/d/18IYSJEV0uwbg468lFXejS0Wyw2_8Pjfa/view?usp=sharing).
107 | - Unzip the file, then copy/move folder `models` and file `PubMed-shuffle-win2-500Kwords.txt` into folder `BioPosDep/StanfordBiaffineParser-v2`. A possible command sequence is sketched below.
108 |
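A possible command sequence for this step (a sketch only: the internal layout of `Pre-trained-Biaffine-v2.zip` has not been verified here, so adjust the paths to wherever `models/` and `PubMed-shuffle-win2-500Kwords.txt` actually end up after unzipping):

    BioPosDep/StanfordBiaffineParser-v2$ unzip ~/Downloads/Pre-trained-Biaffine-v2.zip -d .
    BioPosDep/StanfordBiaffineParser-v2$ ls models PubMed-shuffle-win2-500Kwords.txt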
109 |
110 |
111 | #### Command line
112 |
113 | # Using model trained on GENIA
114 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/GENIA parse
115 |
116 | # Using model trained on CRAFT
117 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/CRAFT parse
118 |
119 | # Output parsed files are by default saved in the model directory with the same name as the input file.
120 | # NOTE: You can also specify the output directory with the --output_dir flag and/or the output file name with the --output_file flag (an example is sketched after the Examples below).
121 |
122 | #### Examples
123 |
124 | # Activate TensorFlow 1.0 before running models:
125 | BioPosDep/StanfordBiaffineParser-v2$ source .TF1_0/bin/activate
126 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/GENIA parse ../data/raw.txt.genia.conll
127 | BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/CRAFT parse ../data/raw.txt.craft.conll
128 |
129 | Two output parsed files `raw.txt.genia.conll` and `raw.txt.craft.conll` are generated in folders `models/GENIA` and `models/CRAFT`, respectively.
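
If you prefer to write the parsed output somewhere other than the model directory, the `--output_dir` and `--output_file` flags mentioned above can be added to the same command; for example, to write the parsed file into the `data` folder instead (a sketch):

    BioPosDep/StanfordBiaffineParser-v2$ python main.py --save_dir models/GENIA parse ../data/raw.txt.genia.conll --output_dir ../data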
130 |
131 | ### Using retrained jPTDP models
132 |
133 | See [https://github.com/datquocnguyen/jPTDP](https://github.com/datquocnguyen/jPTDP) for details.
134 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/config/CRAFT.cfg:
--------------------------------------------------------------------------------
1 | #***************************************************************
2 | # Where things are located
3 | [Configurable]
4 | train_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/CRAFT.train.conll.20wayJK.txt
5 | parse_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/CRAFT.dev.conll.20wayJK.txt
6 |
7 | [Pretrained Vocab]
8 | filename = /home/ubuntu/WORKSPACE/PubMed-shuffle-win2-500Kwords.txt
9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix
10 | skip_header = True
11 |
12 | #***************************************************************
13 | # Embedding hyperparameters
14 | [Char Vocab]
15 | # {RNNEmbed, CNNEmbed, MLPEmbed}
16 | embed_model = RNNEmbed
17 |
18 | # The aggregated word vocab, pretrained vocab, and char vocab
19 | [Multivocab]
20 | # probability of dropping a word embedding
21 | embed_keep_prob = .67
22 |
23 | [Tag Vocab]
24 | # probability of dropping a tag embedding
25 | embed_keep_prob = .67
26 |
27 | [RNN Embed]
28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
29 | recur_cell = LSTMCell
30 | # number of LSTM layers
31 | n_layers = 3
32 | # number of recurrent units
33 | recur_size = 400
34 | # probability of dropping a connection between timesteps at a single layer
35 | recur_keep_prob = .67
36 | # probability of dropping a connection between layers at a single timestep
37 | ff_keep_prob = .67
38 |
39 | #***************************************************************
40 | # NLP model hyperparameters
41 | [Tagger]
42 | #if you only want it to produce the first column of tags, set this to just 'tags'
43 | output_vocabs = tags:xtags
44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
45 | recur_cell = LSTMCell
46 | # number of LSTM layers
47 | n_layers = 2
48 | # number of recurrent units in each direction of the BiLSTM
49 | recur_size = 400
50 | # number of units in the tag classifier
51 | mlp_size = 600
52 | # probability of dropping a node in the MLP or the classifier
53 | mlp_keep_prob = .67
54 | # probability of dropping a connection between timesteps at a single layer
55 | recur_keep_prob = .5
56 | # probability of dropping a connection between layers at a single timestep
57 | ff_keep_prob = .67
58 |
59 | [Parser]
60 | # if you only want it to use the first column of tags, set this to 'words:tags'
61 | input_vocabs = words:tags:xtags
62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
63 | recur_cell = LSTMCell
64 | # number of layers
65 | n_layers = 3
66 | # number of recurrent units
67 | recur_size = 400
68 | # number of units in the edge classifier
69 | arc_mlp_size = 600
70 | # number of units in the label classifier (you probably want this to be small!)
71 | rel_mlp_size = 100
72 | # probability of dropping a node in the MLP or the classifier
73 | mlp_keep_prob = .67
74 | # probability of dropping a connection between timesteps at a single layer
75 | recur_keep_prob = .67
76 | # probability of dropping a connection between layers at a single timestep
77 | ff_keep_prob = .67
78 |
79 | #***************************************************************
80 | # Training hyperparameters
81 | [Network]
82 | # {Parser, Tagger}
83 | nlp_model = Parser
84 | quit_after_n_iters_without_improvement = 5000
85 | max_train_iters = 20001
86 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/config/GENIA.cfg:
--------------------------------------------------------------------------------
1 | #***************************************************************
2 | # Where things are located
3 | [Configurable]
4 | train_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/GENIA.train.conll.20wayJK.txt
5 | parse_files = /home/ubuntu/WORKSPACE/StanfordBiaffineParser-v2/predictedPOS/GENIA.dev.conll.20wayJK.txt
6 |
7 | [Pretrained Vocab]
8 | filename = /home/ubuntu/WORKSPACE/PubMed-shuffle-win2-500Kwords.txt
9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix
10 | skip_header = True
11 |
12 | #***************************************************************
13 | # Embedding hyperparameters
14 | [Char Vocab]
15 | # {RNNEmbed, CNNEmbed, MLPEmbed}
16 | embed_model = RNNEmbed
17 |
18 | # The aggregated word vocab, pretrained vocab, and char vocab
19 | [Multivocab]
20 | # probability of dropping a word embedding
21 | embed_keep_prob = .67
22 |
23 | [Tag Vocab]
24 | # probability of dropping a tag embedding
25 | embed_keep_prob = .67
26 |
27 | [RNN Embed]
28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
29 | recur_cell = LSTMCell
30 | # number of LSTM layers
31 | n_layers = 3
32 | # number of recurrent units
33 | recur_size = 400
34 | # probability of dropping a connection between timesteps at a single layer
35 | recur_keep_prob = .67
36 | # probability of dropping a connection between layers at a single timestep
37 | ff_keep_prob = .67
38 |
39 | #***************************************************************
40 | # NLP model hyperparameters
41 | [Tagger]
42 | #if you only want it to produce the first column of tags, set this to just 'tags'
43 | output_vocabs = tags:xtags
44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
45 | recur_cell = LSTMCell
46 | # number of LSTM layers
47 | n_layers = 2
48 | # number of recurrent units in each direction of the BiLSTM
49 | recur_size = 400
50 | # number of units in the tag classifier
51 | mlp_size = 600
52 | # probability of dropping a node in the MLP or the classifier
53 | mlp_keep_prob = .67
54 | # probability of dropping a connection between timesteps at a single layer
55 | recur_keep_prob = .5
56 | # probability of dropping a connection between layers at a single timestep
57 | ff_keep_prob = .67
58 |
59 | [Parser]
60 | # if you only want it to use the first column of tags, set this to 'words:tags'
61 | input_vocabs = words:tags:xtags
62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
63 | recur_cell = LSTMCell
64 | # number of layers
65 | n_layers = 3
66 | # number of recurrent units
67 | recur_size = 400
68 | # number of units in the edge classifier
69 | arc_mlp_size = 600
70 | # number of units in the label classifier (you probably want this to be small!)
71 | rel_mlp_size = 100
72 | # probability of dropping a node in the MLP or the classifier
73 | mlp_keep_prob = .67
74 | # probability of dropping a connection between timesteps at a single layer
75 | recur_keep_prob = .67
76 | # probability of dropping a connection between layers at a single timestep
77 | ff_keep_prob = .67
78 |
79 | #***************************************************************
80 | # Training hyperparameters
81 | [Network]
82 | # {Parser, Tagger}
83 | nlp_model = Parser
84 | quit_after_n_iters_without_improvement = 5000
85 | max_train_iters = 20001
86 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/config/defaults.cfg:
--------------------------------------------------------------------------------
1 | #***************************************************************
2 | # High level stuff
3 | [DEFAULT]
4 | save_dir = saves/defaults
5 | data_dir = data
6 | lc = en
7 | treebank = English
8 | lang = English
9 |
10 | [Configurable]
11 | train_files = %(data_dir)s/CoNLL17/UD_%(treebank)s/%(lc)s-ud-train.conllu
12 | parse_files = %(data_dir)s/CoNLL17/UD_%(treebank)s/%(lc)s-ud-dev.conllu
13 | verbose = True
14 | name = None
15 |
16 | #***************************************************************
17 | # Vocab data structures
18 | [Base Vocab]
19 | # TODO take special_tokens out of here and put them in the classes
20 | cased = None
21 | embed_size = 100
22 |
23 | [Pretrained Vocab]
24 | special_tokens=:::
25 | skip_header = True
26 | name = pretrained
27 | filename = %(data_dir)s/embeddings/%(lang)s/%(lc)s.vectors.xz
28 | cased = False
29 | max_rank = 0
30 |
31 | [Token Vocab]
32 | name = tokens
33 | embed_keep_prob = .67
34 | min_occur_count = 2
35 | max_rank = 100000
36 |
37 | [Index Vocab]
38 | special_tokens=:
39 |
40 | [Dep Vocab]
41 | name = deps
42 |
43 | [Head Vocab]
44 | name = heads
45 |
46 | [Word Vocab]
47 | special_tokens=:::
48 | name = words
49 | filename = %(save_dir)s/%(name)s.txt
50 | cased = False
51 |
52 | [Lemma Vocab]
53 | name = lemmas
54 | filename = %(save_dir)s/%(name)s.txt
55 |
56 | [Tag Vocab]
57 | special_tokens=PAD:ROOT:DROP:UNK
58 | name = tags
59 | filename = %(save_dir)s/%(name)s.txt
60 | cased = True
61 |
62 | [X Tag Vocab]
63 | name = xtags
64 | filename = %(save_dir)s/%(name)s.txt
65 |
66 | [Rel Vocab]
67 | special_tokens=pad:root:drop:unk
68 | name = rels
69 | filename = %(save_dir)s/%(name)s.txt
70 | cased = True
71 |
72 | [Subtoken Vocab]
73 | max_rank = 0
74 | # TODO Setting this to more than 1 triggers a bug
75 | n_buckets = 2
76 | embed_model = CNNEmbed
77 | embed_keep_prob = 1
78 |
79 | [Char Vocab]
80 | special_tokens = ::::::
81 | name = chars
82 | filename = %(save_dir)s/%(name)s.txt
83 | embed_model = RNNEmbed
84 |
85 | [Ngram Vocab]
86 | special_tokens = ::::
87 | name = ngrams
88 | filename = %(save_dir)s/%(name)s.txt
89 | embed_model = MLPEmbed
90 |
91 | [Ngram Multivocab]
92 | special_tokens = ::::
93 | name = multi-ngram
94 | max_n = 5
95 | embed_model = MLPEmbed
96 |
97 | [Bytepair Vocab]
98 | name = bytepairs
99 | filename = %(save_dir)s/%(name)s.txt
100 | n_bytepairs = 500
101 | embed_model = MLPEmbed
102 |
103 | [Multivocab]
104 | embed_keep_prob = .67
105 |
106 | #***************************************************************
107 | # Neural models
108 | [NN]
109 | recur_cell = LSTMCell
110 | n_layers = 3
111 | mlp_func = leaky_relu
112 | conv_func = leaky_relu
113 | # TODO make sure you add this to Base Cell
114 | recur_size = 200
115 | window_size = 5
116 | conv_size = 200
117 | mlp_size = 200
118 | rnn_func = birnn
119 | conv_keep_prob = .67
120 | mlp_keep_prob = .67
121 | recur_keep_prob = .67
122 | ff_keep_prob = .67
123 |
124 | [Base Cell]
125 | forget_bias = 0
126 | recur_func = tanh
127 | recur_size = 300
128 |
129 | [RNN Cell]
130 | recur_func = leaky_relu
131 | recur_size = 400
132 |
133 | [Base Embed]
134 |
135 | [MLP Embed]
136 |
137 | [RNN Embed]
138 | rnn_func = rnn
139 |
140 | [CNN Embed]
141 |
142 | [Base Tagger]
143 | input_vocabs = words
144 | output_vocabs = tags
145 |
146 | [Base X Tagger]
147 | input_vocabs = words
148 | output_vocabs = tags:xtags
149 |
150 | [Tagger]
151 | name = tagger
152 | n_layers = 2
153 | recur_keep_prob = .5
154 |
155 | [X Tagger]
156 | name = xtagger
157 | n_layers = 2
158 | recur_keep_prob = .5
159 |
160 | [Base Parser]
161 | # TODO take off xtags later
162 | input_vocabs = words:tags:xtags
163 | output_vocabs = rels:heads
164 |
165 | [Parser]
166 | name = parser
167 | arc_mlp_size = 400
168 | rel_mlp_size = 100
169 |
170 | [Xbar Parser]
171 | name = xbar_parser
172 | p_mlp_size = 400
173 | arc_mlp_size = 400
174 | rel_mlp_size = 100
175 |
176 | [Bin Parser]
177 | name = bin_parser
178 | p_mlp_size = 400
179 | arc_mlp_size = 400
180 | rel_mlp_size = 100
181 |
182 | [Fish Parser]
183 | name = fish_parser
184 | lambda_mlp_size = 400
185 | arc_mlp_size = 400
186 | rel_mlp_size = 100
187 |
188 | [Gama Parser]
189 | name = fish_parser
190 | p_mlp_size = 400
191 | arc_mlp_size = 400
192 | rel_mlp_size = 100
193 |
194 | [Joint Parser]
195 | tag_mlp_size = 500
196 | arc_mlp_size = 500
197 | rel_mlp_size = 100
198 |
199 | #***************************************************************
200 | # Sequence data structures
201 | [Multibucket]
202 | n_buckets = 2
203 | name = multibucket
204 |
205 | [Bucket]
206 | name = None
207 |
208 | [Dataset]
209 | #TODO make sure you can get rid of data_files
210 |
211 | [Trainset]
212 | name = trainset
213 | data_files = train_files
214 | n_buckets = 10
215 | batch_by = tokens
216 | batch_size = 5000
217 |
218 | [Parseset]
219 | name = parseset
220 | data_files = parse_files
221 | n_buckets = 5
222 | batch_by = tokens
223 | batch_size = 50000
224 |
225 |
226 | #***************************************************************
227 | # Training
228 | [Network]
229 | name = network
230 | subtoken_vocab = CharVocab
231 | nlp_model = Parser
232 | min_train_iters = 1000
233 | max_train_iters = 20001
234 | validate_every = 100
235 | save_every = 1
236 | quit_after_n_iters_without_improvement = 5000
237 | per_process_gpu_memory_fraction = -1
238 |
239 | #***************************************************************
240 | # Miscellaneous
241 | [Radam Optimizer]
242 | name = radam
243 | # TODO keep adjusting lr?
244 | learning_rate = 2e-3
245 | decay = .75
246 | decay_steps = 5000
247 | clip = 5
248 | mu = .9
249 | nu = .9
250 | gamma = 0
251 | chi = 0
252 | epsilon = 1e-12
253 |
254 | [Zipf]
255 | n_zipfs = 3
256 | name = zipf
257 | filename = %(save_dir)s/%(name)s.txt
258 | batch_size = 500
259 | max_train_iters = 5000
260 | print_every = 500
261 |
262 | [Bucketer]
263 | name = bucketer
264 | filename = %(save_dir)s/%(name)s.txt
265 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/config/template.cfg:
--------------------------------------------------------------------------------
1 | #***************************************************************
2 | # Where things are located
3 | [Configurable]
4 | train_files = colon/separated/list/of/files:supports/glob/*
5 | parse_files = colon/separated/list/of/files:supports/glob/*
6 |
7 | [Pretrained Vocab]
8 | filename = location/of/pretrained/embeddings
9 | # skips the first line of the file, which sometimes contains metadata about the embedding matrix
10 | skip_header = True
11 |
12 | #***************************************************************
13 | # Embedding hyperparameters
14 | [Char Vocab]
15 | # {RNNEmbed, CNNEmbed, MLPEmbed}
16 | embed_model = RNNEmbed
17 |
18 | # The aggregated word vocab, pretrained vocab, and char vocab
19 | [Multivocab]
20 | # probability of dropping a word embedding
21 | embed_keep_prob = .67
22 |
23 | [Tag Vocab]
24 | # probability of dropping a tag embedding
25 | embed_keep_prob = .67
26 |
27 | [RNN Embed]
28 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
29 | recur_cell = LSTMCell
30 | # number of LSTM layers
31 | n_layers = 3
32 | # number of recurrent units
33 | recur_size = 400
34 | # probability of dropping a connection between timesteps at a single layer
35 | recur_keep_prob = .67
36 | # probability of dropping a connection between layers at a single timestep
37 | ff_keep_prob = .67
38 |
39 | #***************************************************************
40 | # NLP model hyperparameters
41 | [Tagger]
42 | #if you only want it to produce the first column of tags, set this to just 'tags'
43 | output_vocabs = tags:xtags
44 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
45 | recur_cell = LSTMCell
46 | # number of LSTM layers
47 | n_layers = 2
48 | # number of recurrent units in each direction of the BiLSTM
49 | recur_size = 400
50 | # number of units in the tag classifier
51 | mlp_size = 600
52 | # probability of dropping a node in the MLP or the classifier
53 | mlp_keep_prob = .67
54 | # probability of dropping a connection between timesteps at a single layer
55 | recur_keep_prob = .5
56 | # probability of dropping a connection between layers at a single timestep
57 | ff_keep_prob = .67
58 |
59 | [Parser]
60 | # if you only want it to use the first column of tags, set this to 'words:tags'
61 | input_vocabs = words:tags:xtags
62 | # {RNNCell, GRUCell, LSTMCell, CifLSTMCell}
63 | recur_cell = LSTMCell
64 | # number of layers
65 | n_layers = 3
66 | # number of recurrent units
67 | recur_size = 400
68 | # number of units in the edge classifier
69 | arc_mlp_size = 600
70 | # number of units in the label classifier (you probably want this to be small!)
71 | rel_mlp_size = 100
72 | # probability of dropping a node in the MLP or the classifier
73 | mlp_keep_prob = .67
74 | # probability of dropping a connection between timesteps at a single layer
75 | recur_keep_prob = .67
76 | # probability of dropping a connection between layers at a single timestep
77 | ff_keep_prob = .67
78 |
79 | #***************************************************************
80 | # Training hyperparameters
81 | [Network]
82 | # {Parser, Tagger}
83 | nlp_model = Parser
84 | quit_after_n_iters_without_improvement = 5000
85 | max_train_iters = 50000
86 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import re
23 | import os
24 | import sys
25 | import codecs
26 | from argparse import ArgumentParser
27 |
28 | from parser import Configurable
29 | from parser import Network
30 |
31 | # TODO make the pretrained vocab names a list given to TokenVocab
32 | #***************************************************************
33 | # Set up the argparser
34 | argparser = ArgumentParser('Network')
35 | argparser.add_argument('--save_dir', required=True)
36 | subparsers = argparser.add_subparsers()
37 | section_names = set()
38 | # --section_name opt1=value1 opt2=value2 opt3=value3
39 | with codecs.open('config/defaults.cfg') as f:
40 | section_regex = re.compile('\[(.*)\]')
41 | for line in f:
42 | match = section_regex.match(line)
43 | if match:
44 | section_names.add(match.group(1).lower().replace(' ', '_'))
45 |
46 | #===============================================================
47 | # Train
48 | #---------------------------------------------------------------
49 | def train(save_dir, **kwargs):
50 | """"""
51 |
52 | kwargs['config_file'] = kwargs.pop('config_file', '')
53 | load = kwargs.pop('load')
54 | try:
55 | if not load and os.path.isdir(save_dir):
56 | raw_input('Save directory already exists. Press <Enter> to continue or <Ctrl-C> to abort.')
57 | if os.path.isfile(os.path.join(save_dir, 'config.cfg')):
58 | os.remove(os.path.join(save_dir, 'config.cfg'))
59 | except KeyboardInterrupt:
60 | print()
61 | sys.exit(0)
62 | network = Network(**kwargs)
63 | network.train(load=load)
64 | return
65 | #---------------------------------------------------------------
66 |
67 | train_parser = subparsers.add_parser('train')
68 | train_parser.set_defaults(action=train)
69 | train_parser.add_argument('--load', action='store_true')
70 | train_parser.add_argument('--config_file')
71 | for section_name in section_names:
72 | train_parser.add_argument('--'+section_name, nargs='+')
73 |
74 | #===============================================================
75 | # Parse
76 | #---------------------------------------------------------------
77 | def parse(save_dir, **kwargs):
78 | """"""
79 |
80 | kwargs['config_file'] = os.path.join(save_dir, 'config.cfg')
81 | files = kwargs.pop('files')
82 | output_file = kwargs.pop('output_file', None)
83 | output_dir = kwargs.pop('output_dir', None)
84 | if len(files) > 1 and output_file is not None:
85 | raise ValueError('Cannot provide a value for --output_file when parsing multiple files')
86 | kwargs['is_evaluation'] = True
87 | network = Network(**kwargs)
88 | network.parse(files, output_file=output_file, output_dir=output_dir)
89 | return
90 | #---------------------------------------------------------------
91 |
92 | parse_parser = subparsers.add_parser('parse')
93 | parse_parser.set_defaults(action=parse)
94 | parse_parser.add_argument('files', nargs='+')
95 | for section_name in section_names:
96 | parse_parser.add_argument('--'+section_name, nargs='+')
97 | parse_parser.add_argument('--output_file')
98 | parse_parser.add_argument('--output_dir')
99 |
100 | #***************************************************************
101 | # Parse the arguments
102 | kwargs = vars(argparser.parse_args())
103 | action = kwargs.pop('action')
104 | save_dir = kwargs.pop('save_dir')
105 | kwargs = {key: value for key, value in kwargs.iteritems() if value is not None}
106 | for section, values in kwargs.iteritems():
107 | if section in section_names:
108 | values = [value.split('=', 1) for value in values]
109 | kwargs[section] = {opt: value for opt, value in values}
110 | if 'default' not in kwargs:
111 | kwargs['default'] = {}
112 | kwargs['default']['save_dir'] = save_dir
113 | action(save_dir, **kwargs)
114 |
--------------------------------------------------------------------------------
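Note on the CLI defined in main.py: every section header found in config/defaults.cfg becomes a --flag that accepts opt=value pairs, and the block at the bottom of main.py splits those pairs into a nested dict before calling train() or parse(). On the command line this corresponds to something like `python main.py --save_dir saves/GENIA train --config_file config/GENIA.cfg --network max_train_iters=50000` (the save_dir path and override values here are illustrative, not taken from the shipped configs). The standalone sketch below reproduces just that post-processing step; the 'network' section name and option strings are made-up examples.

    # Minimal sketch of main.py's "--section opt=value" post-processing (illustrative values).
    section_names = {'network', 'default'}
    kwargs = {
        'network': ['mlp_keep_prob=.67', 'max_train_iters=50000'],
        'config_file': 'config/GENIA.cfg',
    }
    for section, values in list(kwargs.items()):
        if section in section_names:
            pairs = [value.split('=', 1) for value in values]
            kwargs[section] = {opt: value for opt, value in pairs}
    print(kwargs)
    # {'network': {'mlp_keep_prob': '.67', 'max_train_iters': '50000'},
    #  'config_file': 'config/GENIA.cfg'}
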
/StanfordBiaffineParser-v2/parser/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- encoding: utf-8 -*-
3 |
4 | from configurable import Configurable
5 | from bucket import Bucket
6 | from multibucket import Multibucket
7 | from network import Network
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/bucket.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.configurable import Configurable
26 |
27 | #***************************************************************
28 | class Bucket(Configurable):
29 | """"""
30 |
31 | #=============================================================
32 | def __init__(self, *args, **kwargs):
33 | """"""
34 |
35 | embed_model = kwargs.pop('embed_model', None)
36 | super(Bucket, self).__init__(*args, **kwargs)
37 |
38 | self._indices = []
39 | self._maxlen = 0
40 | self._depth = 1
41 | self._tokens = []
42 | if embed_model is not None:
43 | self._embed_model = embed_model.from_configurable(self, name=self.name)
44 | else:
45 | self._embed_model = None
46 | return
47 |
48 | #=============================================================
49 | def __call__(self, vocab, keep_prob=None, moving_params=None):
50 | """"""
51 |
52 | return self.embed_model(vocab, keep_prob=keep_prob, moving_params=moving_params)
53 |
54 | #=============================================================
55 | def open(self, maxlen, depth=None):
56 | """"""
57 |
58 | if depth is None:
59 | self._indices = [[0]]
60 | else:
61 | self._indices = [[[0]*depth]]
62 | self._tokens = [['']]
63 | self._maxlen = maxlen
64 | self._depth = depth
65 | return self
66 |
67 | #=============================================================
68 | def add(self, idxs, tokens=None):
69 | """"""
70 |
71 | if isinstance(self.indices, np.ndarray):
72 | raise TypeError("The bucket has already been closed, you can't add to it")
73 | if len(idxs) > len(self) and len(self) != -1:
74 | raise ValueError('Bucket of max len %d received sequence of len %d' % (len(self), len(idxs)))
75 |
76 | self.indices.append(idxs)
77 | if tokens is not None:
78 | self.tokens.append(tokens)
79 | return len(self.indices) - 1
80 |
81 | #=============================================================
82 | def get_tokens(self, batch):
83 | """"""
84 |
85 | return [self.tokens[sent_idx] for sent_idx in batch]
86 |
87 | #=============================================================
88 | def close(self):
89 | """"""
90 |
91 | if self.depth is None:
92 | indices = np.zeros((len(self.indices), len(self)), dtype=np.int32)
93 | for i, sequence in enumerate(self.indices):
94 | indices[i,0:len(sequence)] = sequence
95 | else:
96 | indices = np.zeros((len(self.indices), len(self), self.depth), dtype=np.int32)
97 | for i, sequence in enumerate(self.indices):
98 | for j, index in enumerate(sequence):
99 | indices[i,j,0:len(index)] = index
100 | self._indices = indices
101 |
102 | #=============================================================
103 | @classmethod
104 | def from_dataset(cls, dataset, bkt_idx, *args, **kwargs):
105 | """"""
106 |
107 | kwargs = dict(kwargs)
108 | kwargs['name'] = '{name}-{bkt_idx}'.format(name=dataset.name, bkt_idx=bkt_idx)
109 | bucket = cls.from_configurable(dataset, *args, **kwargs)
110 | indices = []
111 | for multibucket in dataset:
112 | indices.append(multibucket[bkt_idx].indices)
113 | for i in xrange(len(indices)):
114 | if len(indices[i].shape) == 2:
115 | indices[i] = indices[i][:,:,None]
116 | bucket._indices = np.concatenate(indices, axis=2)
117 | bucket._maxlen = bucket.indices.shape[1]
118 | bucket._depth = bucket.indices.shape[2]
119 | return bucket
120 |
121 | #=============================================================
122 | def reset_placeholders(self):
123 | self.embed_model.reset_placeholders()
124 | return
125 | #=============================================================
126 | @property
127 | def tokens(self):
128 | return self._tokens
129 | @property
130 | def indices(self):
131 | return self._indices
132 | @property
133 | def embed_model(self):
134 | return self._embed_model
135 | @property
136 | def depth(self):
137 | return self._depth
138 | @property
139 | def placeholder(self):
140 | return self.embed_model.placeholder
141 |
142 | #=============================================================
143 | def __len__(self):
144 | return self._maxlen
145 | def __enter__(self):
146 | return self
147 | def __exit__(self, exception_type, exception_value, trace):
148 | if exception_type is not None:
149 | raise exception_type(exception_value)
150 | self.close()
151 | return
152 |
--------------------------------------------------------------------------------
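Note on Bucket.close(): it pads every stored index sequence out to the bucket's maximum length before stacking them into a single int32 array. A minimal numpy-only sketch of that padding step (using made-up sequences rather than the parser's real data, for the depth=None case):

    import numpy as np

    # Hypothetical index sequences that were add()-ed to a bucket with maxlen=5.
    sequences = [[4, 7, 2], [9, 1, 1, 3, 5], [6]]
    maxlen = 5

    # Same padding scheme as Bucket.close(): shorter sequences are
    # right-padded with zeros (the PAD index).
    indices = np.zeros((len(sequences), maxlen), dtype=np.int32)
    for i, sequence in enumerate(sequences):
        indices[i, 0:len(sequence)] = sequence
    print(indices)
    # [[4 7 2 0 0]
    #  [9 1 1 3 5]
    #  [6 0 0 0 0]]
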
/StanfordBiaffineParser-v2/parser/misc/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/misc/colors.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | colors = {
23 | None: '\033[0m',
24 | 'bold': '\033[1m',
25 | 'italic': '\033[3m',
26 | 'uline': '\033[4m',
27 | 'blink': '\033[5m',
28 | 'hlight': '\033[7m',
29 |
30 | 'black': '\033[30m',
31 | 'red': '\033[31m',
32 | 'green': '\033[32m',
33 | 'yellow': '\033[33m',
34 | 'blue': '\033[34m',
35 | 'magenta': '\033[35m',
36 | 'cyan': '\033[36m',
37 | 'white': '\033[37m',
38 |
39 | 'black_hlight': '\033[40m',
40 | 'red_hlight': '\033[41m',
41 | 'green_hlight': '\033[42m',
42 | 'yellow_hlight': '\033[43m',
43 | 'blue_hlight': '\033[44m',
44 | 'magenta_hlight': '\033[45m',
45 | 'cyan_hlight': '\033[46m',
46 | 'white_hlight': '\033[47m',
47 |
48 | 'bright_black': '\033[90m',
49 | 'bright_red': '\033[91m',
50 | 'bright_green': '\033[92m',
51 | 'bright_yellow': '\033[93m',
52 | 'bright_blue': '\033[94m',
53 | 'bright_magenta': '\033[95m',
54 | 'bright_cyan': '\033[96m',
55 | 'bright_white': '\033[97m',
56 |
57 | 'bright_black_hlight': '\033[100m',
58 | 'bright_red_hlight': '\033[101m',
59 | 'bright_green_hlight': '\033[102m',
60 | 'bright_orange_hlight': '\033[103m',
61 | 'bright_blue_hlight': '\033[104m',
62 | 'bright_magenta_hlight': '\033[105m',
63 | 'bright_cyan_hlight': '\033[106m',
64 | 'bright_white_hlight': '\033[107m',
65 | }
66 |
67 | def ctext(text, *color_list):
68 | return ''.join(colors[color] for color in color_list) + text + colors[None]
69 | def color_pattern(text1, text2, *color_list):
70 | multicolor = ''.join(colors[color] for color in color_list)
71 | return multicolor + colors['bold'] + text1 + colors[None] + ' ' + multicolor + colors['uline'] + text2 + colors[None]
72 |
--------------------------------------------------------------------------------
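Note on colors.py: ctext simply concatenates the escape codes for each requested style, the text, and the reset code. For example, ctext('LAS', 'bold', 'bright_cyan') expands to the string printed below (a direct expansion of the table above, shown here without importing the module):

    # '\033[1m' (bold) + '\033[96m' (bright_cyan) + 'LAS' + '\033[0m' (reset)
    print('\033[1m\033[96mLAS\033[0m')
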
/StanfordBiaffineParser-v2/parser/misc/get_encoding.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import codecs
23 |
24 | #***************************************************************
25 | encodings = ['utf-8', 'ascii']
26 |
27 | def get_encoding(filename):
28 | """"""
29 |
30 | success = False
31 | for encoding in encodings:
32 | with codecs.open(filename, encoding=encoding) as f:
33 | try:
34 | for i, line in enumerate(f):
35 | pass
36 | success = True
37 | break
38 | except ValueError as e:
39 | print('Encoding {0} failed for file {1} at line {2}: {3}\n{4}'.format(encoding, filename, i, line, e))
40 | continue
41 |
42 | if success:
43 | return encoding
44 | else:
45 | raise ValueError('No valid encoding found for file {0}'.format(filename))
46 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/multibucket.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser import Configurable
26 | from parser import Bucket
27 | from parser.misc.colors import ctext
28 |
29 | #***************************************************************
30 | class Multibucket(Configurable):
31 | """"""
32 |
33 | #=============================================================
34 | def __init__(self, *args, **kwargs):
35 | """"""
36 |
37 | self._embed_model = kwargs.pop('embed_model', None)
38 | super(Multibucket, self).__init__(*args, **kwargs)
39 |
40 | self._indices = []
41 | self._buckets = []
42 | self._len2idx = {}
43 | self.placeholder = None
44 | return
45 |
46 | #=============================================================
47 | def __call__(self, vocab, keep_prob=None, moving_params=None):
48 | """"""
49 |
50 | # This placeholder is used to ensure the bucket data is in the right order
51 | reuse = None if moving_params is None else True
52 | self.generate_placeholder()
53 | embeddings = []
54 | for i, bucket in enumerate(self):
55 | if i > 0:
56 | reuse = True
57 | with tf.variable_scope(self.name+'-multibucket', reuse=reuse):
58 | embeddings.append(bucket(vocab, keep_prob=keep_prob, moving_params=moving_params))
59 | return tf.nn.embedding_lookup(tf.concat(embeddings, axis=0), self.placeholder)
60 |
61 | #=============================================================
62 | def reset_placeholders(self):
63 | self.placeholder = None
64 | for bucket in self:
65 | bucket.reset_placeholders()
66 | return
67 |
68 | #=============================================================
69 | def generate_placeholder(self):
70 | """"""
71 |
72 | if self.placeholder is None:
73 | self.placeholder = tf.placeholder(tf.int32, shape=(None,), name=self.name+'-multibucket')
74 | return self.placeholder
75 |
76 | #=============================================================
77 | def open(self, maxlens, depth=None):
78 | """"""
79 |
80 | self._indices = [(0,0)]
81 | self._buckets = []
82 | self._len2idx = {}
83 | prevlen = -1
84 | for idx, maxlen in enumerate(maxlens):
85 | self._buckets.append(Bucket.from_configurable(self, embed_model=self.embed_model, name='%s-%d' % (self.name, idx)).open(maxlen, depth=depth))
86 | self._len2idx.update(zip(range(prevlen+1, maxlen+1), [idx]*(maxlen-prevlen)))
87 | prevlen = maxlen
88 | return self
89 |
90 | #=============================================================
91 | def add(self, idxs, tokens=None):
92 | """"""
93 |
94 | if isinstance(self.indices, np.ndarray):
95 | raise TypeError("The buckets have already been closed, you can't add to them")
96 |
97 | idx = self._len2idx.get(len(idxs), len(self)-1)
98 | bkt_idx = self[idx].add(idxs, tokens=tokens)
99 | self.indices.append( (idx, bkt_idx) )
100 | return len(self.indices) - 1
101 |
102 | #=============================================================
103 | def close(self):
104 | """"""
105 |
106 | for bucket in self:
107 | bucket.close()
108 |
109 | self._indices = np.array(self.indices, dtype=[('bkt_idx', 'i4'), ('idx', 'i4')])
110 | return
111 |
112 | #=============================================================
113 | def inv_idxs(self):
114 | """"""
115 |
116 | return np.argsort(np.concatenate([np.where(self.indices['bkt_idx'][1:] == bkt_idx)[0] for bkt_idx in xrange(len(self))]))
117 |
118 | #=============================================================
119 | def get_tokens(self, bkt_idx, batch):
120 | """"""
121 |
122 | return self[bkt_idx].get_tokens(batch)
123 |
124 | #=============================================================
125 | @classmethod
126 | def from_dataset(cls, dataset, *args, **kwargs):
127 | """"""
128 |
129 | multibucket = cls.from_configurable(dataset, *args, **kwargs)
130 | indices = []
131 | for multibucket_ in dataset:
132 | indices.append(multibucket_.indices)
133 | for i in xrange(1, len(indices)):
134 | assert np.equal(indices[0].astype(int), indices[i].astype(int)).all()
135 | multibucket._indices = np.array(multibucket_.indices)
136 | buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in xrange(len(multibucket_))]
137 | multibucket._buckets = buckets
138 | if dataset.verbose:
139 | for bucket in multibucket:
140 | print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue')))
141 | return multibucket
142 |
143 | #=============================================================
144 | @property
145 | def indices(self):
146 | return self._indices
147 | @property
148 | def embed_model(self):
149 | return self._embed_model
150 |
151 | #=============================================================
152 | def __str__(self):
153 | return str(self._buckets)
154 | def __iter__(self):
155 | return (bucket for bucket in self._buckets)
156 | def __getitem__(self, key):
157 | return self._buckets[key]
158 | def __len__(self):
159 | return len(self._buckets)
160 | def __enter__(self):
161 | return self
162 | def __exit__(self, exception_type, exception_value, trace):
163 | if exception_type is not None:
164 | raise exception_type(exception_value)
165 | self.close()
166 | return
167 |
--------------------------------------------------------------------------------
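Note on Multibucket.open(): it builds _len2idx so that each sequence length maps to the smallest bucket whose maxlen can hold it, and add() falls back to the last bucket for anything longer. A standalone sketch of that mapping, with made-up bucket sizes:

    # Hypothetical bucket maxlens; the real values come from the config/bucketer.
    maxlens = [5, 10, 20]

    len2idx = {}
    prevlen = -1
    for idx, maxlen in enumerate(maxlens):
        # lengths prevlen+1 .. maxlen all map to bucket idx
        len2idx.update(zip(range(prevlen + 1, maxlen + 1), [idx] * (maxlen - prevlen)))
        prevlen = maxlen

    for length in (3, 5, 6, 20, 25):
        # lengths beyond the largest maxlen fall back to the last bucket,
        # mirroring len2idx.get(len(idxs), len(self)-1) in Multibucket.add()
        print(length, len2idx.get(length, len(maxlens) - 1))
    # 3 0
    # 5 0
    # 6 1
    # 20 2
    # 25 2
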
/StanfordBiaffineParser-v2/parser/neural/__init__.py:
--------------------------------------------------------------------------------
1 | import models
2 | import optimizers
3 | import recur_cells
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/functions.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from __future__ import absolute_import
5 | from __future__ import division
6 | from __future__ import print_function
7 |
8 | import numpy as np
9 | import tensorflow as tf
10 |
11 | #***************************************************************
12 | sig_const = np.arctanh(1/3)
13 | tanh_const = np.arctanh(np.sqrt(1/3))
14 |
15 | def gate(x):
16 | return tf.nn.sigmoid(2*x)
17 |
18 | def tanh(x):
19 | return tf.nn.tanh(x)
20 |
21 | def gated_tanh(x):
22 | dim = len(x.get_shape().as_list())-1
23 | cell_act, gate_act = tf.split(x, 2, dim)
24 | return gate(gate_act) * tanh(cell_act)
25 |
26 | def identity(x):
27 | return tf.identity(x)
28 |
29 | def gated_identity(x):
30 | dim = len(x.get_shape().as_list())-1
31 | cell_act, gate_act = tf.split(x, 2, dim)
32 | return gate(gate_act) * identity(cell_act)
33 |
34 | def softplus(x):
35 | return tf.nn.softplus(2*x)/2
36 |
37 | def elu(x):
38 | return tf.nn.elu(x)
39 |
40 | def relu(x):
41 | return tf.nn.relu(x)
42 |
43 | def leaky_relu(x):
44 | return tf.maximum(.1*x, x)
--------------------------------------------------------------------------------
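Note on functions.py: gated_tanh and gated_identity split the last axis in half and use one half to gate the other. A numpy sketch of the same computation (shapes chosen arbitrarily for illustration):

    import numpy as np

    def np_gated_tanh(x):
        # split the last axis into a cell part and a gate part, as tf.split does above
        cell_act, gate_act = np.split(x, 2, axis=-1)
        gate = 1.0 / (1.0 + np.exp(-2.0 * gate_act))   # sigmoid(2x), like gate()
        return gate * np.tanh(cell_act)

    x = np.random.randn(3, 4, 8)           # (n x b x 2h)
    print(np_gated_tanh(x).shape)           # (3, 4, 4) -> (n x b x h)
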
/StanfordBiaffineParser-v2/parser/neural/models/__init__.py:
--------------------------------------------------------------------------------
1 | from nn import NN
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/embeds/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from mlp_embed import MLPEmbed
19 | from rnn_embed import RNNEmbed
20 | from cnn_embed import CNNEmbed
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/embeds/base_embed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.vocabs import TokenVocab, Multivocab
26 | from parser.neural.models import NN
27 |
28 | #***************************************************************
29 | class BaseEmbed(NN):
30 | """"""
31 |
32 | #=============================================================
33 | def __init__(self, *args, **kwargs):
34 | """"""
35 |
36 | super(BaseEmbed, self).__init__(*args, **kwargs)
37 | # This placeholder represents the data in the bucket that called BaseEmbed.__init__
38 | self.placeholder = None
39 | return
40 |
41 | #=============================================================
42 | def reset_placeholders(self):
43 | self.placeholder = None
44 | return
45 |
46 | #=============================================================
47 | def __call__(self, vocab, keep_prob=None, moving_params=None):
48 | """"""
49 |
50 | self.moving_params = moving_params
51 | if isinstance(vocab, Multivocab):
52 | multivocab = vocab
53 | self.generate_placeholder([None,None,None])
54 | embeddings = [TokenVocab.__call__(vocab, self.placeholder[:,:,i]) for i, vocab in enumerate(multivocab)]
55 | embeddings = tf.stack(embeddings, axis=2)
56 | # (n x b x g x d) -> (n x b x d)
57 | with tf.variable_scope('Pre-Attn'):
58 | embeddings = self.linear_attention(embeddings)
59 | self._tokens_to_keep = tf.to_float(tf.greater(self.placeholder[:,:,0], vocab.PAD))
60 | else:
61 | self.generate_placeholder([None,None])
62 | # (n x b x d)
63 | embeddings = TokenVocab.__call__(vocab, self.placeholder)
64 | self._tokens_to_keep = tf.to_float(tf.greater(self.placeholder, vocab.PAD))
65 | self._batch_size = tf.shape(self.placeholder)[0]
66 | self._bucket_size = tf.shape(self.placeholder)[1]
67 | self._sequence_lengths = tf.to_int32(tf.reduce_sum(self.tokens_to_keep, axis=1))
68 | self._n_tokens = tf.reduce_sum(self.sequence_lengths)
69 | return embeddings
70 |
71 | #=============================================================
72 | def generate_placeholder(self, shape):
73 | if self.placeholder is None:
74 | self.placeholder = tf.placeholder(tf.int32, shape=shape, name='%s-bkt' % self.name)
75 | return self.placeholder
76 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/embeds/cnn_embed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.embeds.base_embed import BaseEmbed
26 |
27 | #***************************************************************
28 | class CNNEmbed(BaseEmbed):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocab, **kwargs):
33 | """"""
34 |
35 | # (n x b x d)
36 | embeddings = super(CNNEmbed, self).__call__(vocab, **kwargs)
37 | # (n x b x d) -> (n x b x h)
38 | with tf.variable_scope('CNN'):
39 | conv = self.CNN(embeddings, self.window_size, self.conv_size)
40 | # (n x b x h) -> (n x h)
41 | hidden = tf.reduce_max(conv, axis=1)
42 | # (n x h) -> (n x o)
43 | linear = self.linear(hidden, vocab.token_embed_size)
44 | return linear
45 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/embeds/mlp_embed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.embeds.base_embed import BaseEmbed
26 |
27 | #***************************************************************
28 | class MLPEmbed(BaseEmbed):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocab, **kwargs):
33 | """"""
34 |
35 | # (n x b x d)
36 | embeddings = super(MLPEmbed, self).__call__(vocab, **kwargs)
37 | # (n x b x d) -> (n x d)
38 | with tf.variable_scope('Attn'):
39 | attn = self.linear_attention(embeddings)
40 | # (n x d) -> (n x h)
41 | with tf.variable_scope('MLP'):
42 | hidden = self.MLP(attn, self.mlp_size)
43 | # (n x h) -> (n x o)
44 | linear = self.linear(hidden, vocab.token_embed_size)
45 | return linear
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/embeds/rnn_embed.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | import parser.neural.rnn as rnn
26 | from parser.neural.models.embeds.base_embed import BaseEmbed
27 |
28 | #***************************************************************
29 | class RNNEmbed(BaseEmbed):
30 | """"""
31 |
32 | #=============================================================
33 | def __call__(self, vocab, **kwargs):
34 | """"""
35 |
36 | # (n x b x d)
37 | embeddings = super(RNNEmbed, self).__call__(vocab, **kwargs)
38 | # (n x b x d) -> (n x b x h)
39 | with tf.variable_scope('RNN'):
40 | recur, state = self.RNN(embeddings, self.recur_size)
41 | if self.rnn_func == rnn.birnn:
42 | state_fw, state_bw = tf.unstack(state)
43 | state_fw = tf.split(state_fw, 2, axis=1)[0]
44 | state_bw = tf.split(state_bw, 2, axis=1)[0]
45 | state = tf.concat([state_fw, state_bw], 1)
46 | elif self.rnn_func == rnn.rnn:
47 | state = tf.split(state, 2, axis=1)[0]
48 | # (n x b x h) -> (n x h)
49 | with tf.variable_scope('MLP'):
50 | hidden = self.linear_attention(recur)
51 | # (n x h) -> (n x o)
52 | linear = self.linear(tf.concat([hidden, state], axis=1), vocab.token_embed_size)
53 | return linear
54 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | from parsers import *
2 | from taggers import *
3 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | from parser import Parser
2 | from fish_parser import FishParser
3 | from gama_parser import GamaParser
4 | from xbar_parser import XbarParser
5 | from bin_parser import BinParser
6 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/base_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import re
23 | import codecs
24 | import numpy as np
25 | import tensorflow as tf
26 | import matplotlib.pyplot as plt
27 |
28 | from parser.misc.colors import ctext, color_pattern
29 | from parser.misc.mst import nonprojective, argmax
30 | from parser.neural.models.nn import NN
31 |
32 | #***************************************************************
33 | class BaseParser(NN):
34 | """"""
35 |
36 | PAD = 0
37 | ROOT = 1
38 |
39 | #=============================================================
40 | def __call__(self, vocabs, moving_params=None):
41 | """"""
42 |
43 | self.moving_params = moving_params
44 | if isinstance(vocabs, dict):
45 | self.vocabs = vocabs
46 | else:
47 | self.vocabs = {vocab.name: vocab for vocab in vocabs}
48 |
49 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs]
50 | #embed = tf.concat([vocab(moving_params=self.moving_params) for vocab in input_vocabs], 2)
51 | embed = self.embed_concat(input_vocabs)
52 | for vocab in self.vocabs.values():
53 | if vocab not in input_vocabs:
54 | vocab.generate_placeholder()
55 | placeholder = self.vocabs['words'].placeholder
56 | if len(placeholder.get_shape().as_list()) == 3:
57 | placeholder = placeholder[:,:,0]
58 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT))
59 | self._batch_size = tf.shape(placeholder)[0]
60 | self._bucket_size = tf.shape(placeholder)[1]
61 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1)
62 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep))
63 |
64 | top_recur = embed
65 | for i in xrange(self.n_layers):
66 | with tf.variable_scope('RNN%d' % i):
67 | top_recur, _ = self.RNN(top_recur, self.recur_size)
68 | return top_recur
69 |
70 | #=============================================================
71 | def process_accumulators(self, accumulators, time=None):
72 | """"""
73 |
74 | n_tokens, n_seqs, loss, rel_corr, arc_corr, corr, seq_corr = accumulators
75 | acc_dict = {
76 | 'Loss': loss,
77 | 'LS': rel_corr/n_tokens*100,
78 | 'UAS': arc_corr/n_tokens*100,
79 | 'LAS': corr/n_tokens*100,
80 | 'SS': seq_corr/n_seqs*100,
81 | }
82 | if time is not None:
83 | acc_dict.update({
84 | 'Token_rate': n_tokens / time,
85 | 'Seq_rate': n_seqs / time,
86 | })
87 | return acc_dict
88 |
89 | #=============================================================
90 | def update_history(self, history, accumulators):
91 | """"""
92 |
93 | acc_dict = self.process_accumulators(accumulators)
94 | for key, value in acc_dict.iteritems():
95 | history[key].append(value)
96 | return history['LAS'][-1]
97 |
98 | #=============================================================
99 | def print_accuracy(self, accumulators, time, prefix='Train'):
100 | """"""
101 |
102 | acc_dict = self.process_accumulators(accumulators, time=time)
103 | strings = []
104 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red'))
105 | strings.append(color_pattern('LS:', '{LS:5.2f}%', 'bright_cyan'))
106 | strings.append(color_pattern('UAS:', '{UAS:5.2f}%', 'bright_cyan'))
107 | strings.append(color_pattern('LAS:', '{LAS:5.2f}%', 'bright_cyan'))
108 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green'))
109 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta'))
110 | string = ctext('{0} ', 'bold') + ' | '.join(strings)
111 | print(string.format(prefix, **acc_dict))
112 | return
113 |
114 | #=============================================================
115 | def plot(self, history, prefix='Train'):
116 | """"""
117 |
118 | pass
119 |
120 | #=============================================================
121 | def check(self, preds, sents, fileobj):
122 | """"""
123 |
124 | for tokens, arc_preds, rel_preds in zip(sents, preds[0], preds[1]):
125 | for token, arc_pred, rel_pred in zip(zip(*tokens), arc_preds, rel_preds):
126 | arc = self.vocabs['heads'][arc_pred]
127 | rel = self.vocabs['rels'][rel_pred]
128 | fileobj.write('\t'.join(token+(arc, rel))+'\n')
129 | fileobj.write('\n')
130 | return
131 |
132 | #=============================================================
133 | def write_probs(self, sents, output_file, probs, inv_idxs):
134 | """"""
135 |
136 | #parse_algorithm = self.parse_algorithm
137 |
138 | # Turns list of tuples of tensors into list of matrices
139 | arc_probs = [arc_prob for batch in probs for arc_prob in batch[0]]
140 | rel_probs = [rel_prob for batch in probs for rel_prob in batch[1]]
141 | tokens_to_keep = [weight for batch in probs for weight in batch[2]]
142 | tokens = [sent for batch in sents for sent in batch]
143 |
144 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f:
145 | j = 0
146 | for i in inv_idxs:
147 | sent, arc_prob, rel_prob, weights = tokens[i], arc_probs[i], rel_probs[i], tokens_to_keep[i]
148 | sent = zip(*sent)
149 | sequence_length = int(np.sum(weights))+1
150 | arc_prob = arc_prob[:sequence_length][:,:sequence_length]
151 | #arc_preds = np.argmax(arc_prob, axis=1)
152 | arc_preds = nonprojective(arc_prob)
153 | arc_preds_one_hot = np.zeros([rel_prob.shape[0], rel_prob.shape[2]])
154 | arc_preds_one_hot[np.arange(len(arc_preds)), arc_preds] = 1.
155 | rel_preds = np.argmax(np.einsum('nrb,nb->nr', rel_prob, arc_preds_one_hot), axis=1)
156 | for token, arc_pred, rel_pred, weight in zip(sent, arc_preds[1:], rel_preds[1:], weights[1:]):
157 | token = list(token)
158 | token.insert(5, '_')
159 | token.append('_')
160 | token.append('_')
161 | token[6] = self.vocabs['heads'][arc_pred]
162 | token[7] = self.vocabs['rels'][rel_pred]
163 | f.write('\t'.join(token)+'\n')
164 | j += 1
165 | if j < len(inv_idxs):
166 | f.write('\n')
167 | return
168 |
169 | #=============================================================
170 | @property
171 | def train_keys(self):
172 | return ('n_tokens', 'n_seqs', 'loss', 'n_rel_correct', 'n_arc_correct', 'n_correct', 'n_seqs_correct')
173 |
174 | #=============================================================
175 | @property
176 | def valid_keys(self):
177 | return ('arc_preds', 'rel_preds')
178 |
179 | #=============================================================
180 | @property
181 | def parse_keys(self):
182 | return ('arc_probs', 'rel_probs', 'tokens_to_keep')
183 |
--------------------------------------------------------------------------------
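Note on write_probs in base_parser.py: the head chosen for each token by the non-projective MST decoder is turned into a one-hot vector, and the label is then read off the relation scores for that head with an einsum. The numpy sketch below isolates that selection step on a tiny made-up example (2 relation labels, 3 candidate heads):

    import numpy as np

    n, r, b = 3, 2, 3                       # tokens, relation labels, candidate heads
    rel_prob = np.random.rand(n, r, b)      # (n x r x b) relation scores per head
    arc_preds = np.array([0, 2, 1])         # hypothetical head chosen for each token

    arc_preds_one_hot = np.zeros((n, b))
    arc_preds_one_hot[np.arange(n), arc_preds] = 1.

    # same contraction as in write_probs: keep only the scores of the chosen head
    rel_scores = np.einsum('nrb,nb->nr', rel_prob, arc_preds_one_hot)
    rel_preds = np.argmax(rel_scores, axis=1)
    print(rel_preds)                        # e.g. array([1, 0, 1]); values depend on rel_prob
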
/StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/bin_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser
26 |
27 | #***************************************************************
28 | class BinParser(BaseParser):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocabs, moving_params=None):
33 | """"""
34 |
35 | top_recur = super(BinParser, self).__call__(vocabs, moving_params=moving_params)
36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep)
37 |
38 | with tf.variable_scope('MLP'):
39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.p_mlp_size,
40 | n_splits=2)
41 | arc_dep_mlp, rel_dep_mlp, p_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2)
42 | arc_head_mlp, rel_head_mlp, p_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2)
43 |
44 | with tf.variable_scope('p'):
45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
46 | arc_ps = self.bilinear(p_dep_mlp, p_head_mlp, 1)
47 | # (b x 1)
48 | arc_logits = -tf.nn.softplus(arc_ps)
49 |
50 | with tf.variable_scope('Arc'):
51 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
52 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False)
53 | # (n x b x b)
54 | arc_probs = tf.nn.softmax(arc_logits)
55 | # (n x b)
56 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1))
57 | # (n x b)
58 | arc_targets = self.vocabs['heads'].placeholder
59 | # (n x b)
60 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep
61 | # ()
62 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep)
63 |
64 | with tf.variable_scope('Rel'):
65 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b)
66 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels']))
67 | # (n x b x r x b)
68 | rel_probs = tf.nn.softmax(rel_logits, dim=2)
69 | # (n x b x b)
70 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size)
71 | # (n x b x b) -> (n x b x b x 1)
72 | one_hot = tf.expand_dims(one_hot, axis=3)
73 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1)
74 | select_rel_logits = tf.matmul(rel_logits, one_hot)
75 | # (n x b x r x 1) -> (n x b x r)
76 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3)
77 | # (n x b)
78 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1))
79 | # (n x b)
80 | rel_targets = self.vocabs['rels'].placeholder
81 | # (n x b)
82 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep
83 | # ()
84 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep)
85 |
86 | n_arc_correct = tf.reduce_sum(arc_correct)
87 | n_rel_correct = tf.reduce_sum(rel_correct)
88 | correct = arc_correct * rel_correct
89 | n_correct = tf.reduce_sum(correct)
90 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1)))
91 | loss = arc_loss + rel_loss
92 |
93 | outputs = {
94 | 'arc_logits': arc_logits,
95 | 'arc_probs': arc_probs,
96 | 'arc_preds': arc_preds,
97 | 'arc_targets': arc_targets,
98 | 'arc_correct': arc_correct,
99 | 'arc_loss': arc_loss,
100 | 'n_arc_correct': n_arc_correct,
101 |
102 | 'rel_logits': rel_logits,
103 | 'rel_probs': rel_probs,
104 | 'rel_preds': rel_preds,
105 | 'rel_targets': rel_targets,
106 | 'rel_correct': rel_correct,
107 | 'rel_loss': rel_loss,
108 | 'n_rel_correct': n_rel_correct,
109 |
110 | 'n_tokens': self.n_tokens,
111 | 'n_seqs': self.batch_size,
112 | 'tokens_to_keep': self.tokens_to_keep,
113 | 'n_correct': n_correct,
114 | 'n_seqs_correct': n_seqs_correct,
115 | 'loss': loss
116 | }
117 |
118 | return outputs
119 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/fish_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser
26 |
27 | #***************************************************************
28 | class FishParser(BaseParser):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocabs, moving_params=None):
33 | """"""
34 |
35 | top_recur = super(FishParser, self).__call__(vocabs, moving_params=moving_params)
36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep)
37 |
38 | with tf.variable_scope('MLP'):
39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.lambda_mlp_size,
40 | n_splits=2)
41 | arc_dep_mlp, rel_dep_mlp, lambda_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.lambda_mlp_size], axis=2)
42 | arc_head_mlp, rel_head_mlp, lambda_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.lambda_mlp_size], axis=2)
43 |
44 | with tf.variable_scope('Lambda'):
45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
46 | arc_lambdas = self.bilinear(lambda_dep_mlp, lambda_head_mlp, 1) + 5
47 | # (b x 1)
48 | i_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 1), 0)
49 | # (1 x b)
50 | j_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 0), 0)
51 | # (b x 1) - (1 x b) -> (b x b)
52 | k_mat = tf.abs(i_mat - j_mat)
53 | # (b x 1)
54 | n_mat = tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1) - 1 - i_mat
55 | # (b x b) * (n x b x b) - (n x b x b) - (b x b) -> (n x b x b)
56 | arc_logits = tf.to_float(k_mat)*arc_lambdas - tf.exp(arc_lambdas) - tf.lgamma(tf.to_float(k_mat+1))
57 |
58 | with tf.variable_scope('Arc'):
59 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
60 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False)
61 | # (n x b x b)
62 | arc_probs = tf.nn.softmax(arc_logits)
63 | # (n x b)
64 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1))
65 | # (n x b)
66 | arc_targets = self.vocabs['heads'].placeholder
67 | # (n x b)
68 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep
69 | # ()
70 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep)
71 |
72 | with tf.variable_scope('Rel'):
73 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b)
74 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels']))
75 | # (n x b x r x b)
76 | rel_probs = tf.nn.softmax(rel_logits, dim=2)
77 | # (n x b x b)
78 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size)
79 | # (n x b x b) -> (n x b x b x 1)
80 | one_hot = tf.expand_dims(one_hot, axis=3)
81 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1)
82 | select_rel_logits = tf.matmul(rel_logits, one_hot)
83 | # (n x b x r x 1) -> (n x b x r)
84 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3)
85 | # (n x b)
86 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1))
87 | # (n x b)
88 | rel_targets = self.vocabs['rels'].placeholder
89 | # (n x b)
90 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep
91 | # ()
92 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep)
93 |
94 | n_arc_correct = tf.reduce_sum(arc_correct)
95 | n_rel_correct = tf.reduce_sum(rel_correct)
96 | correct = arc_correct * rel_correct
97 | n_correct = tf.reduce_sum(correct)
98 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1)))
99 | loss = arc_loss + rel_loss
100 |
101 | outputs = {
102 | 'arc_logits': arc_logits,
103 | 'arc_lambdas': arc_lambdas,
104 | 'arc_probs': arc_probs,
105 | 'arc_preds': arc_preds,
106 | 'arc_targets': arc_targets,
107 | 'arc_correct': arc_correct,
108 | 'arc_loss': arc_loss,
109 | 'n_arc_correct': n_arc_correct,
110 |
111 | 'rel_logits': rel_logits,
112 | 'rel_probs': rel_probs,
113 | 'rel_preds': rel_preds,
114 | 'rel_targets': rel_targets,
115 | 'rel_correct': rel_correct,
116 | 'rel_loss': rel_loss,
117 | 'n_rel_correct': n_rel_correct,
118 |
119 | 'n_tokens': self.n_tokens,
120 | 'n_seqs': self.batch_size,
121 | 'tokens_to_keep': self.tokens_to_keep,
122 | 'n_correct': n_correct,
123 | 'n_seqs_correct': n_seqs_correct,
124 | 'loss': loss
125 | }
126 |
127 | return outputs
128 |
--------------------------------------------------------------------------------
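Note on the Lambda block in FishParser: for every candidate arc it adds the log-probability of the observed linear distance k = |i - j| under a Poisson distribution whose rate is exp(arc_lambdas), since k*lambda - exp(lambda) - log(k!) is exactly log Poisson(k; rate = exp(lambda)). A quick standard-library check of that identity, with an arbitrary log-rate and distance:

    import math

    lam, k = 1.3, 4                                   # arbitrary log-rate and distance
    manual = k * lam - math.exp(lam) - math.lgamma(k + 1)
    mu = math.exp(lam)
    reference = math.log(mu ** k * math.exp(-mu) / math.factorial(k))
    print(abs(manual - reference) < 1e-9)             # True
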
/StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/gama_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser
26 |
27 | #***************************************************************
28 | class GamaParser(BaseParser):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocabs, moving_params=None):
33 | """"""
34 |
35 | top_recur = super(GamaParser, self).__call__(vocabs, moving_params=moving_params)
36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep)
37 |
38 | with tf.variable_scope('MLP'):
39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + 2*self.p_mlp_size,
40 | n_splits=2)
41 | arc_dep_mlp, rel_dep_mlp, mu_dep_mlp, sigma_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size, self.p_mlp_size], axis=2)
42 | arc_head_mlp, rel_head_mlp, mu_head_mlp, sigma_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size, self.p_mlp_size], axis=2)
43 |
44 | with tf.variable_scope('dist'):
45 | with tf.variable_scope('mu'):
46 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
47 | arc_mus = self.bilinear(mu_dep_mlp, mu_head_mlp, 1)**2
48 | with tf.variable_scope('sigma'):
49 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
50 | arc_sigmas = self.bilinear(sigma_dep_mlp, sigma_head_mlp, 1, initializer=None)**2 + .1
51 | # (b x 1)
52 | i_mat = tf.expand_dims(tf.range(self.bucket_size), 1)
53 | # (1 x b)
54 | j_mat = tf.expand_dims(tf.range(self.bucket_size), 0)
55 | # (b x 1) - (1 x b) -> (b x b)
56 | k_mat = tf.to_float(tf.abs(i_mat - j_mat))
57 |
58 | arc_logits = -.5*tf.log(2*np.pi * arc_sigmas) - .5*(k_mat-arc_mus)**2 / arc_sigmas
59 | #arc_rs += tf.to_float(k_mat)#tf.to_float(tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1))
60 | # (b x 1)
61 | #n_mat = tf.expand_dims(self.sequence_lengths, 1) - 1 - i_mat
62 | # (b x b) * (n x b x b) - (n x b x b) - (b x b) -> (n x b x b)
63 | #arc_logits = (tf.lgamma(arc_rs+1) - tf.lgamma(k_mat) - tf.lgamma(arc_rs-k_mat+2) +
64 | # k_mat * tf.log(arc_ps) + (arc_rs-k_mat+1)*tf.log(1-arc_ps) )
65 | with tf.variable_scope('Arc'):
66 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
67 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False)
68 | # (n x b x b)
69 | arc_probs = tf.nn.softmax(arc_logits)
70 | # (n x b)
71 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1))
72 | # (n x b)
73 | arc_targets = self.vocabs['heads'].placeholder
74 | # (n x b)
75 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep
76 | # ()
77 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep)
78 |
79 | with tf.variable_scope('Rel'):
80 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b)
81 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels']))
82 | # (n x b x r x b)
83 | rel_probs = tf.nn.softmax(rel_logits, dim=2)
84 | # (n x b x b)
85 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size)
86 | # (n x b x b) -> (n x b x b x 1)
87 | one_hot = tf.expand_dims(one_hot, axis=3)
88 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1)
89 | select_rel_logits = tf.matmul(rel_logits, one_hot)
90 | # (n x b x r x 1) -> (n x b x r)
91 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3)
92 | # (n x b)
93 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1))
94 | # (n x b)
95 | rel_targets = self.vocabs['rels'].placeholder
96 | # (n x b)
97 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep
98 | # ()
99 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep)
100 |
101 | n_arc_correct = tf.reduce_sum(arc_correct)
102 | n_rel_correct = tf.reduce_sum(rel_correct)
103 | correct = arc_correct * rel_correct
104 | n_correct = tf.reduce_sum(correct)
105 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1)))
106 | loss = arc_loss + rel_loss
107 |
108 | outputs = {
109 | 'arc_logits': arc_logits,
110 | 'arc_mus': arc_mus,
111 | 'arc_sigmas': arc_sigmas,
112 | 'arc_probs': arc_probs,
113 | 'arc_preds': arc_preds,
114 | 'arc_targets': arc_targets,
115 | 'arc_correct': arc_correct,
116 | 'arc_loss': arc_loss,
117 | 'n_arc_correct': n_arc_correct,
118 |
119 | 'rel_logits': rel_logits,
120 | 'rel_probs': rel_probs,
121 | 'rel_preds': rel_preds,
122 | 'rel_targets': rel_targets,
123 | 'rel_correct': rel_correct,
124 | 'rel_loss': rel_loss,
125 | 'n_rel_correct': n_rel_correct,
126 |
127 | 'n_tokens': self.n_tokens,
128 | 'n_seqs': self.batch_size,
129 | 'tokens_to_keep': self.tokens_to_keep,
130 | 'n_correct': n_correct,
131 | 'n_seqs_correct': n_seqs_correct,
132 | 'loss': loss
133 | }
134 |
135 | return outputs
136 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser
26 |
27 | #***************************************************************
28 | class Parser(BaseParser):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocabs, moving_params=None):
33 | """"""
34 |
35 | top_recur = super(Parser, self).__call__(vocabs, moving_params=moving_params)
36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep)
37 |
38 | with tf.variable_scope('MLP'):
39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size,
40 | n_splits=2)
41 | arc_dep_mlp, rel_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size], axis=2)
42 | arc_head_mlp, rel_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size], axis=2)
43 |
44 | with tf.variable_scope('Arc'):
45 | # (n x b x d) * (d x 1 x d) * (n x b x d).T -> (n x b x b)
46 | arc_logits = self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False)
47 | # (n x b x b)
48 | arc_probs = tf.nn.softmax(arc_logits)
49 | # (n x b)
50 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1))
51 | # (n x b)
52 | arc_targets = self.vocabs['heads'].placeholder
53 | # (n x b)
54 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep
55 | # ()
56 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep)
57 |
58 | with tf.variable_scope('Rel'):
59 | # (n x b x d) * (d x r x d) * (n x b x d).T -> (n x b x r x b)
60 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels']))
61 | # (n x b x r x b)
62 | rel_probs = tf.nn.softmax(rel_logits, dim=2)
63 | # (n x b x b)
64 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size)
65 | # (n x b x b) -> (n x b x b x 1)
66 | one_hot = tf.expand_dims(one_hot, axis=3)
67 | # (n x b x r x b) * (n x b x b x 1) -> (n x b x r x 1)
68 | select_rel_logits = tf.matmul(rel_logits, one_hot)
69 | # (n x b x r x 1) -> (n x b x r)
70 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3)
71 | # (n x b)
72 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1))
73 | # (n x b)
74 | rel_targets = self.vocabs['rels'].placeholder
75 | # (n x b)
76 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep
77 | # ()
78 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep)
79 |
80 | n_arc_correct = tf.reduce_sum(arc_correct)
81 | n_rel_correct = tf.reduce_sum(rel_correct)
82 | correct = arc_correct * rel_correct
83 | n_correct = tf.reduce_sum(correct)
84 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1)))
85 | loss = arc_loss + rel_loss
86 |
87 | outputs = {
88 | 'arc_logits': arc_logits,
89 | 'arc_probs': arc_probs,
90 | 'arc_preds': arc_preds,
91 | 'arc_targets': arc_targets,
92 | 'arc_correct': arc_correct,
93 | 'arc_loss': arc_loss,
94 | 'n_arc_correct': n_arc_correct,
95 |
96 | 'rel_logits': rel_logits,
97 | 'rel_probs': rel_probs,
98 | 'rel_preds': rel_preds,
99 | 'rel_targets': rel_targets,
100 | 'rel_correct': rel_correct,
101 | 'rel_loss': rel_loss,
102 | 'n_rel_correct': n_rel_correct,
103 |
104 | 'n_tokens': self.n_tokens,
105 | 'n_seqs': self.batch_size,
106 | 'tokens_to_keep': self.tokens_to_keep,
107 | 'n_correct': n_correct,
108 | 'n_seqs_correct': n_seqs_correct,
109 | 'loss': loss
110 | }
111 |
112 | return outputs
113 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/parsers/xbar_parser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.nlp.parsers.base_parser import BaseParser
26 |
27 | #***************************************************************
28 | class XbarParser(BaseParser):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocabs, moving_params=None):
33 | """"""
34 |
35 | top_recur = super(XbarParser, self).__call__(vocabs, moving_params=moving_params)
36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep)
37 |
38 | with tf.variable_scope('MLP'):
39 | dep_mlp, head_mlp = self.MLP(top_recur, self.arc_mlp_size + self.rel_mlp_size + self.p_mlp_size,
40 | n_splits=2)
41 | arc_dep_mlp, rel_dep_mlp, p_dep_mlp = tf.split(dep_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2)
42 | arc_head_mlp, rel_head_mlp, p_head_mlp = tf.split(head_mlp, [self.arc_mlp_size, self.rel_mlp_size, self.p_mlp_size], axis=2)
43 |
44 | with tf.variable_scope('p'):
45 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
46 | arc_ps = self.bilinear(p_dep_mlp, p_head_mlp, 1, add_bias2=False)
47 |       # (1 x b x 1)
48 | i_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 1), 0)
49 |       # (1 x 1 x b)
50 | j_mat = tf.expand_dims(tf.expand_dims(tf.range(self.bucket_size), 0), 0)
51 |       # (1 x 1 x b) > (1 x b x 1) -> (n x b x b)
52 | k_mat = tf.tile(j_mat > i_mat, [self.batch_size,1,1])
53 |       # (n x 1 x 1) - (1 x b x 1) -> (n x b x 1)
54 | n_mat = tf.expand_dims(tf.expand_dims(self.sequence_lengths, 1), 1) - 1 - i_mat
55 |       # -softplus(where(k_mat, arc_ps, -arc_ps)): (n x b x b) -> (n x b x b)
56 | arc_logits = -tf.nn.softplus(tf.where(k_mat, arc_ps, -arc_ps))
57 |
58 |
59 | with tf.variable_scope('Arc'):
60 | # (n x b x d) o (d x 1 x d) o (n x b x d).T -> (n x b x b)
61 | arc_logits += self.bilinear(arc_dep_mlp, arc_head_mlp, 1, add_bias2=False)
62 | # (n x b x b)
63 | arc_probs = tf.nn.softmax(arc_logits)
64 | # (n x b)
65 | arc_preds = tf.to_int32(tf.argmax(arc_logits, axis=-1))
66 | # (n x b)
67 | arc_targets = self.vocabs['heads'].placeholder
68 | # (n x b)
69 | arc_correct = tf.to_int32(tf.equal(arc_preds, arc_targets))*int_tokens_to_keep
70 | # ()
71 | arc_loss = tf.losses.sparse_softmax_cross_entropy(arc_targets, arc_logits, self.tokens_to_keep)
72 |
73 | with tf.variable_scope('Rel'):
74 | # (n x b x d) o (d x r x d) o (n x b x d).T -> (n x b x r x b)
75 | rel_logits = self.bilinear(rel_dep_mlp, rel_head_mlp, len(self.vocabs['rels']))
76 | # (n x b x r x b)
77 | rel_probs = tf.nn.softmax(rel_logits, dim=2)
78 | # (n x b x b)
79 | one_hot = tf.one_hot(arc_preds if moving_params is not None else arc_targets, self.bucket_size)
80 | # (n x b x b) -> (n x b x b x 1)
81 | one_hot = tf.expand_dims(one_hot, axis=3)
82 | # (n x b x r x b) o (n x b x b x 1) -> (n x b x r x 1)
83 | select_rel_logits = tf.matmul(rel_logits, one_hot)
84 | # (n x b x r x 1) -> (n x b x r)
85 | select_rel_logits = tf.squeeze(select_rel_logits, axis=3)
86 | # (n x b)
87 | rel_preds = tf.to_int32(tf.argmax(select_rel_logits, axis=-1))
88 | # (n x b)
89 | rel_targets = self.vocabs['rels'].placeholder
90 | # (n x b)
91 | rel_correct = tf.to_int32(tf.equal(rel_preds, rel_targets))*int_tokens_to_keep
92 | # ()
93 | rel_loss = tf.losses.sparse_softmax_cross_entropy(rel_targets, select_rel_logits, self.tokens_to_keep)
94 |
95 | n_arc_correct = tf.reduce_sum(arc_correct)
96 | n_rel_correct = tf.reduce_sum(rel_correct)
97 | correct = arc_correct * rel_correct
98 | n_correct = tf.reduce_sum(correct)
99 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1)))
100 | loss = arc_loss + rel_loss
101 |
102 | outputs = {
103 | 'arc_logits': arc_logits,
104 | 'arc_probs': arc_probs,
105 | 'arc_preds': arc_preds,
106 | 'arc_targets': arc_targets,
107 | 'arc_correct': arc_correct,
108 | 'arc_loss': arc_loss,
109 | 'n_arc_correct': n_arc_correct,
110 |
111 | 'rel_logits': rel_logits,
112 | 'rel_probs': rel_probs,
113 | 'rel_preds': rel_preds,
114 | 'rel_targets': rel_targets,
115 | 'rel_correct': rel_correct,
116 | 'rel_loss': rel_loss,
117 | 'n_rel_correct': n_rel_correct,
118 |
119 | 'n_tokens': self.n_tokens,
120 | 'n_seqs': self.batch_size,
121 | 'tokens_to_keep': self.tokens_to_keep,
122 | 'n_correct': n_correct,
123 | 'n_seqs_correct': n_seqs_correct,
124 | 'loss': loss
125 | }
126 |
127 | return outputs
128 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/__init__.py:
--------------------------------------------------------------------------------
1 | from tagger import Tagger
2 | from xtagger import XTagger
3 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/base_tagger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import re
23 | import codecs
24 | import numpy as np
25 | import tensorflow as tf
26 | import matplotlib.pyplot as plt
27 |
28 | from parser.misc.colors import ctext, color_pattern
29 | from parser.neural.models.nn import NN
30 |
31 | #***************************************************************
32 | class BaseTagger(NN):
33 | """"""
34 |
35 | PAD = 0
36 | ROOT = 1
37 |
38 | #=============================================================
39 | def __call__(self, vocabs, moving_params=None):
40 | """"""
41 |
42 | self.moving_params = moving_params
43 | if isinstance(vocabs, dict):
44 | self.vocabs = vocabs
45 | else:
46 | self.vocabs = {vocab.name: vocab for vocab in vocabs}
47 |
48 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs]
49 | embed = self.embed_concat(input_vocabs)
50 | for vocab in self.vocabs.values():
51 | if vocab not in input_vocabs:
52 | vocab.generate_placeholder()
53 | placeholder = self.vocabs['words'].placeholder
54 | if len(placeholder.get_shape().as_list()) == 3:
55 | placeholder = placeholder[:,:,0]
56 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT))
57 | self._batch_size = tf.shape(placeholder)[0]
58 | self._bucket_size = tf.shape(placeholder)[1]
59 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1)
60 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep))
61 |
62 | top_recur = embed
63 | for i in xrange(self.n_layers):
64 | with tf.variable_scope('RNN%d' % i):
65 | top_recur, _ = self.RNN(top_recur, self.recur_size)
66 | return top_recur
67 |
68 | #=============================================================
69 | def process_accumulators(self, accumulators, time=None):
70 | """"""
71 |
72 | n_tokens, n_seqs, loss, corr, seq_corr = accumulators
73 | acc_dict = {
74 | 'Loss': loss,
75 | 'TS': corr/n_tokens*100,
76 | 'SS': seq_corr/n_seqs*100,
77 | }
78 | if time is not None:
79 | acc_dict.update({
80 | 'Token_rate': n_tokens / time,
81 | 'Seq_rate': n_seqs / time,
82 | })
83 | return acc_dict
84 |
85 | #=============================================================
86 | def update_history(self, history, accumulators):
87 | """"""
88 |
89 | acc_dict = self.process_accumulators(accumulators)
90 | for key, value in acc_dict.iteritems():
91 | history[key].append(value)
92 | return history['TS'][-1]
93 |
94 | #=============================================================
95 | def print_accuracy(self, accumulators, time, prefix='Train'):
96 | """"""
97 |
98 | acc_dict = self.process_accumulators(accumulators, time=time)
99 | strings = []
100 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red'))
101 | strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan'))
102 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green'))
103 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta'))
104 | string = ctext('{0} ', 'bold') + ' | '.join(strings)
105 | print(string.format(prefix, **acc_dict))
106 | return
107 |
108 | #=============================================================
109 | def plot(self, history, prefix='Train'):
110 | """"""
111 |
112 | pass
113 |
114 | #=============================================================
115 | def check(self, preds, sents, fileobj):
116 | """"""
117 |
118 | for tokens, preds in zip(sents, preds[0]):
119 | for token, pred in zip(zip(*tokens), preds):
120 | tag = self.vocabs['tags'][pred]
121 | fileobj.write('\t'.join(token+(tag, ))+'\n')
122 | fileobj.write('\n')
123 | return
124 |
125 | #=============================================================
126 | def write_probs(self, sents, output_file, probs, inv_idxs):
127 | """"""
128 |
129 | # Turns list of tuples of tensors into list of matrices
130 | tag_probs = [tag_prob for batch in probs for tag_prob in batch[0]]
131 | tokens_to_keep = [weight for batch in probs for weight in batch[1]]
132 | tokens = [sent for batch in sents for sent in batch]
133 |
134 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f:
135 | for i in inv_idxs:
136 | sent, tag_prob, weights = tokens[i], tag_probs[i], tokens_to_keep[i]
137 | sent = zip(*sent)
138 | tag_preds = np.argmax(tag_prob, axis=1)
139 | for token, tag_pred, weight in zip(sent, tag_preds[1:], weights[1:]):
140 | token = list(token)
141 | token.insert(5, '_')
142 | token.append('_')
143 | token.append('_')
144 | token[3] = self.vocabs['tags'][tag_pred]
145 | f.write('\t'.join(token)+'\n')
146 | f.write('\n')
147 | return
148 |
149 | #=============================================================
150 | @property
151 | def train_keys(self):
152 | return ('n_tokens', 'n_seqs', 'loss', 'n_correct', 'n_seqs_correct')
153 |
154 | #=============================================================
155 | @property
156 | def valid_keys(self):
157 | return ('preds', )
158 |
159 | #=============================================================
160 | @property
161 | def parse_keys(self):
162 | return ('probs', 'tokens_to_keep')
163 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/base_xtagger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import re
23 | import codecs
24 | import numpy as np
25 | import tensorflow as tf
26 | import matplotlib.pyplot as plt
27 |
28 | from parser.misc.colors import ctext, color_pattern
29 | from parser.neural.models.nn import NN
30 |
31 | #***************************************************************
32 | class BaseXTagger(NN):
33 | """"""
34 |
35 | PAD = 0
36 | ROOT = 1
37 |
38 | #=============================================================
39 | def __call__(self, vocabs, moving_params=None):
40 | """"""
41 |
42 | self.moving_params = moving_params
43 | if isinstance(vocabs, dict):
44 | self.vocabs = vocabs
45 | else:
46 | self.vocabs = {vocab.name: vocab for vocab in vocabs}
47 |
48 | input_vocabs = [self.vocabs[name] for name in self.input_vocabs]
49 | embed = self.embed_concat(input_vocabs)
50 | for vocab in self.vocabs.values():
51 | if vocab not in input_vocabs:
52 | vocab.generate_placeholder()
53 | placeholder = self.vocabs['words'].placeholder
54 | if len(placeholder.get_shape().as_list()) == 3:
55 | placeholder = placeholder[:,:,0]
56 | self._tokens_to_keep = tf.to_float(tf.greater(placeholder, self.ROOT))
57 | self._batch_size = tf.shape(placeholder)[0]
58 | self._bucket_size = tf.shape(placeholder)[1]
59 | self._sequence_lengths = tf.reduce_sum(tf.to_int32(tf.greater(placeholder, self.PAD)), axis=1)
60 | self._n_tokens = tf.to_int32(tf.reduce_sum(self.tokens_to_keep))
61 |
62 | top_recur = embed
63 | for i in xrange(self.n_layers):
64 | with tf.variable_scope('RNN%d' % i):
65 | top_recur, _ = self.RNN(top_recur, self.recur_size)
66 | return top_recur
67 |
68 | #=============================================================
69 | def process_accumulators(self, accumulators, time=None):
70 | """"""
71 |
72 | n_tokens, n_seqs, loss, corr, xcorr, seq_corr = accumulators
73 | acc_dict = {
74 | 'Loss': loss,
75 | 'TS': corr/n_tokens*100,
76 | 'XTS': xcorr/n_tokens*100,
77 | 'SS': seq_corr/n_seqs*100,
78 | }
79 | if time is not None:
80 | acc_dict.update({
81 | 'Token_rate': n_tokens / time,
82 | 'Seq_rate': n_seqs / time,
83 | })
84 | return acc_dict
85 |
86 | #=============================================================
87 | def update_history(self, history, accumulators):
88 | """"""
89 |
90 | acc_dict = self.process_accumulators(accumulators)
91 | for key, value in acc_dict.iteritems():
92 | history[key].append(value)
93 | return history['TS'][-1]
94 |
95 | #=============================================================
96 | def print_accuracy(self, accumulators, time, prefix='Train'):
97 | """"""
98 |
99 | acc_dict = self.process_accumulators(accumulators, time=time)
100 | strings = []
101 | strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red'))
102 | strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan'))
103 | strings.append(color_pattern('XTS:', '{XTS:5.2f}%', 'bright_cyan'))
104 | strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green'))
105 | strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta'))
106 | string = ctext('{0} ', 'bold') + ' | '.join(strings)
107 | print(string.format(prefix, **acc_dict))
108 | return
109 |
110 | #=============================================================
111 | def plot(self, history, prefix='Train'):
112 | """"""
113 |
114 | pass
115 |
116 | #=============================================================
117 | def check(self, preds, sents, fileobj):
118 | """"""
119 |
120 | for tokens, preds, xpreds in zip(sents, preds[0], preds[1]):
121 | for token, pred, xpred in zip(zip(*tokens), preds, xpreds):
122 | tag = self.vocabs['tags'][pred]
123 | xtag = self.vocabs['xtags'][xpred]
124 | fileobj.write('\t'.join(token+(tag, xtag))+'\n')
125 | fileobj.write('\n')
126 | return
127 |
128 | #=============================================================
129 | def write_probs(self, sents, output_file, probs, inv_idxs):
130 | """"""
131 |
132 | # Turns list of tuples of tensors into list of matrices
133 | tag_probs = [tag_prob for batch in probs for tag_prob in batch[0]]
134 | xtag_probs = [xtag_prob for batch in probs for xtag_prob in batch[1]]
135 | tokens_to_keep = [weight for batch in probs for weight in batch[2]]
136 | tokens = [sent for batch in sents for sent in batch]
137 |
138 | with codecs.open(output_file, 'w', encoding='utf-8', errors='ignore') as f:
139 | for i in inv_idxs:
140 | sent, tag_prob, xtag_prob, weights = tokens[i], tag_probs[i], xtag_probs[i], tokens_to_keep[i]
141 | sent = zip(*sent)
142 | tag_preds = np.argmax(tag_prob, axis=1)
143 | xtag_preds = np.argmax(xtag_prob, axis=1)
144 | for token, tag_pred, xtag_pred, weight in zip(sent, tag_preds[1:], xtag_preds[1:], weights[1:]):
145 | token = list(token)
146 | token.insert(5, '_')
147 | token.append('_')
148 | token.append('_')
149 | token[3] = self.vocabs['tags'][tag_pred]
150 | token[4] = self.vocabs['xtags'][xtag_pred]
151 | f.write('\t'.join(token)+'\n')
152 | f.write('\n')
153 | return
154 |
155 | #=============================================================
156 | @property
157 | def train_keys(self):
158 | return ('n_tokens', 'n_seqs', 'loss', 'n_tag_correct', 'n_xtag_correct', 'n_seqs_correct')
159 |
160 | #=============================================================
161 | @property
162 | def valid_keys(self):
163 | return ('tag_preds', 'xtag_preds')
164 |
165 | #=============================================================
166 | @property
167 | def parse_keys(self):
168 | return ('tag_probs', 'xtag_probs', 'tokens_to_keep')
169 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/tagger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.nlp.taggers.base_tagger import BaseTagger
26 |
27 | #***************************************************************
28 | class Tagger(BaseTagger):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocabs, moving_params=None):
33 | """"""
34 |
35 | top_recur = super(Tagger, self).__call__(vocabs, moving_params=moving_params)
36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep)
37 |
38 | with tf.variable_scope('MLP'):
39 | mlp = self.MLP(top_recur, self.mlp_size)
40 |
41 | with tf.variable_scope('Tag'):
42 | logits = self.linear(mlp, len(self.vocabs['tags']))
43 | probs = tf.nn.softmax(logits)
44 | preds = tf.to_int32(tf.argmax(logits, axis=-1))
45 | targets = self.vocabs['tags'].placeholder
46 | correct = tf.to_int32(tf.equal(preds, targets))*int_tokens_to_keep
47 | loss = tf.losses.sparse_softmax_cross_entropy(targets, logits, self.tokens_to_keep)
48 |
49 |
50 | n_correct = tf.reduce_sum(correct)
51 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1)))
52 |
53 | outputs = {
54 | 'logits': logits,
55 | 'probs': probs,
56 | 'preds': preds,
57 | 'targets': targets,
58 | 'correct': correct,
59 |
60 |       'n_tokens': self.n_tokens,
61 |       'n_seqs': self.batch_size,
62 |       'tokens_to_keep': self.tokens_to_keep,
63 |       'n_correct': n_correct,
64 |       'n_seqs_correct': n_seqs_correct,
65 |       'loss': loss
66 |     }
67 |
68 |     return outputs
69 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/models/nlp/taggers/xtagger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models.nlp.taggers.base_xtagger import BaseXTagger
26 |
27 | #***************************************************************
28 | class XTagger(BaseXTagger):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocabs, moving_params=None):
33 | """"""
34 |
35 | top_recur = super(XTagger, self).__call__(vocabs, moving_params=moving_params)
36 | int_tokens_to_keep = tf.to_int32(self.tokens_to_keep)
37 |
38 | with tf.variable_scope('MLP'):
39 | tag_mlp, xtag_mlp = self.MLP(top_recur, self.mlp_size, n_splits=2)
40 |
41 | with tf.variable_scope('Tag'):
42 | tag_logits = self.linear(tag_mlp, len(self.vocabs['tags']))
43 | tag_probs = tf.nn.softmax(tag_logits)
44 | tag_preds = tf.to_int32(tf.argmax(tag_logits, axis=-1))
45 | tag_targets = self.vocabs['tags'].placeholder
46 | tag_correct = tf.to_int32(tf.equal(tag_preds, tag_targets))*int_tokens_to_keep
47 | tag_loss = tf.losses.sparse_softmax_cross_entropy(tag_targets, tag_logits, self.tokens_to_keep)
48 |
49 | with tf.variable_scope('XTag'):
50 | xtag_logits = self.linear(xtag_mlp, len(self.vocabs['xtags']))
51 | xtag_probs = tf.nn.softmax(xtag_logits)
52 | xtag_preds = tf.to_int32(tf.argmax(xtag_logits, axis=-1))
53 | xtag_targets = self.vocabs['xtags'].placeholder
54 | xtag_correct = tf.to_int32(tf.equal(xtag_preds, xtag_targets))*int_tokens_to_keep
55 | xtag_loss = tf.losses.sparse_softmax_cross_entropy(xtag_targets, xtag_logits, self.tokens_to_keep)
56 |
57 | correct = tag_correct * xtag_correct
58 | n_correct = tf.reduce_sum(correct)
59 | n_tag_correct = tf.reduce_sum(tag_correct)
60 | n_xtag_correct = tf.reduce_sum(xtag_correct)
61 | n_seqs_correct = tf.reduce_sum(tf.to_int32(tf.equal(tf.reduce_sum(correct, axis=1), self.sequence_lengths-1)))
62 | loss = tag_loss + xtag_loss
63 |
64 | outputs = {
65 | 'tag_logits': tag_logits,
66 | 'tag_probs': tag_probs,
67 | 'tag_preds': tag_preds,
68 | 'tag_targets': tag_targets,
69 | 'tag_correct': tag_correct,
70 | 'tag_loss': tag_loss,
71 | 'n_tag_correct': n_tag_correct,
72 |
73 | 'xtag_logits': xtag_logits,
74 | 'xtag_probs': xtag_probs,
75 | 'xtag_preds': xtag_preds,
76 | 'xtag_targets': xtag_targets,
77 | 'xtag_correct': xtag_correct,
78 | 'xtag_loss': xtag_loss,
79 | 'n_xtag_correct': n_xtag_correct,
80 |
81 | 'n_tokens': self.n_tokens,
82 | 'n_seqs': self.batch_size,
83 | 'tokens_to_keep': self.tokens_to_keep,
84 | 'n_correct': n_correct,
85 | 'n_seqs_correct': n_seqs_correct,
86 | 'loss': loss
87 | }
88 |
89 | return outputs
90 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from sgd_optimizer import SGDOptimizer
19 | from radam_optimizer import RadamOptimizer
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/optimizers/radam_optimizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from parser.neural.optimizers.base_optimizer import BaseOptimizer
25 |
26 | #***************************************************************
27 | class RadamOptimizer(BaseOptimizer):
28 | """"""
29 |
30 | #=============================================================
31 | def _init_acc(self, var_list, grads):
32 | """"""
33 |
34 | super(RadamOptimizer, self)._init_acc(var_list, grads)
35 | for x_tm1, g_t in zip(var_list, grads):
36 | if self.mu > 0:
37 | self.get_accumulator(x_tm1, 'm')
38 | shape = self.get_variable_shape(x_tm1)
39 | if isinstance(g_t, tf.Tensor):
40 | self.get_accumulator(x_tm1, 'm/tm1', [])
41 | else:
42 | self.get_accumulator(x_tm1, 'm/tm1', [shape[0]]+[1]*(len(shape)-1))
43 | if self.nu > 0:
44 | self.get_accumulator(x_tm1, 'v')
45 | shape = self.get_variable_shape(x_tm1)
46 | if isinstance(g_t, tf.Tensor):
47 | self.get_accumulator(x_tm1, 'v/tm1', [])
48 | else:
49 | self.get_accumulator(x_tm1, 'v/tm1', [shape[0]]+[1]*(len(shape)-1))
50 | return
51 |
52 | #=============================================================
53 | def _apply_dense(self, cache):
54 | """"""
55 |
56 | x_tm1, g_t = cache['x_tm1'], cache['g_t']
57 | updates = cache['updates']
58 |
59 | if self.mu > 0:
60 | m_t, t_m = self._dense_moving_average(x_tm1, g_t, 'm', beta=self.mu)
61 | m_bar_t = (1-self.gamma) * m_t + self.gamma * g_t
62 | updates.extend([m_t, t_m])
63 | else:
64 | m_bar_t = g_t
65 |
66 | if self.nu > 0:
67 | v_t, t_v = self._dense_moving_average(x_tm1, g_t**2, 'v', beta=self.nu)
68 | v_bar_t = tf.sqrt(v_t + self.epsilon)
69 | updates.extend([v_t, t_v])
70 | else:
71 | v_bar_t = 1
72 |
73 | s_t = self.learning_rate * m_bar_t / v_bar_t
74 | cache['s_t'] = tf.where(tf.is_finite(s_t), s_t, tf.zeros_like(s_t))
75 | return cache
76 |
77 | #=============================================================
78 | def _apply_sparse(self, cache):
79 | """"""
80 |
81 | x_tm1, g_t, idxs = cache['x_tm1'], cache['g_t'], cache['idxs']
82 | idxs, idxs_ = tf.unique(idxs)
83 | g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs))
84 | updates = cache['updates']
85 |
86 | if self.mu > 0:
87 | m_t, t_m = self._sparse_moving_average(x_tm1, idxs, g_t_, 'm', beta=self.mu)
88 | m_t_ = tf.gather(m_t, idxs)
89 | m_bar_t_ = (1-self.gamma) * m_t_ + self.gamma * g_t_
90 | updates.extend([m_t, t_m])
91 | else:
92 | m_bar_t_ = g_t_
93 |
94 | if self.nu > 0:
95 | v_t, t_v = self._sparse_moving_average(x_tm1, idxs, g_t_**2, 'v', beta=self.nu)
96 | v_t_ = tf.gather(v_t, idxs)
97 | v_bar_t_ = tf.sqrt(v_t_ + self.epsilon)
98 | updates.extend([v_t, t_v])
99 | else:
100 | v_bar_t_ = 1
101 |
102 | s_t_ = self.learning_rate * m_bar_t_ / v_bar_t_
103 | cache['s_t'] = tf.where(tf.is_finite(s_t_), s_t_, tf.zeros_like(s_t_))
104 | cache['g_t'] = g_t_
105 | cache['idxs'] = idxs
106 | return cache
107 |
108 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/optimizers/sgd_optimizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from parser.neural.optimizers.base_optimizer import BaseOptimizer
25 |
26 | #***************************************************************
27 | class SGDOptimizer(BaseOptimizer):
28 | """"""
29 |
30 | #=============================================================
31 | def _apply_dense(self, cache):
32 | """"""
33 |
34 | g_t = cache['g_t']
35 | cache['s_t'] = self.learning_rate * g_t
36 | return cache
37 |
38 | #=============================================================
39 | def _apply_sparse(self, cache):
40 | """"""
41 |
42 | g_t, idxs = cache['g_t'], cache['idxs']
43 | idxs, idxs_ = tf.unique(idxs)
44 | g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs))
45 |
46 | cache['g_t'] = g_t_
47 | cache['idxs'] = idxs
48 | cache['s_t'] = self.learning_rate * g_t_
49 |
50 | return cache
51 |
52 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/recur_cells/.directory:
--------------------------------------------------------------------------------
1 | [Dolphin]
2 | Timestamp=2016,10,21,3,50,28
3 | Version=3
4 | ViewMode=1
5 | VisibleRoles=Details_text,Details_size,Details_date,Details_wordCount,Details_lineCount,CustomizedDetails
6 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/recur_cells/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from rnn_cell import RNNCell
19 | from gru_cell import GRUCell
20 | from cif_lstm_cell import CifLSTMCell
21 | from lstm_cell import LSTMCell
22 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/recur_cells/base_cell.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from parser.configurable import Configurable
25 |
26 | #***************************************************************
27 | class BaseCell(Configurable):
28 | """"""
29 |
30 | #=============================================================
31 | def __init__(self, output_size, *args, **kwargs):
32 | """"""
33 |
34 | self._output_size = output_size
35 | input_size = kwargs.pop('input_size', self._output_size)
36 | self.moving_params = kwargs.pop('moving_params', None)
37 | super(BaseCell, self).__init__(*args, **kwargs)
38 | self._input_size = input_size if input_size is not None else self.output_size
39 |
40 | #=============================================================
41 | def __call__(self, inputs, state, scope=None):
42 | """"""
43 |
44 | raise NotImplementedError()
45 |
46 | #=============================================================
47 | def zero_state(self, batch_size, dtype):
48 | """"""
49 |
50 | zero_state = tf.get_variable('Zero_state',
51 | shape=self.state_size,
52 | dtype=dtype,
53 | initializer=tf.zeros_initializer())
54 | state = tf.reshape(tf.tile(zero_state, tf.stack([batch_size])), tf.stack([batch_size, self.state_size]))
55 | state.set_shape([None, self.state_size])
56 | return state
57 |
58 | #=============================================================
59 | @property
60 | def input_size(self):
61 | return self._input_size
62 | @property
63 | def output_size(self):
64 | return self._output_size
65 | @property
66 | def state_size(self):
67 | raise NotImplementedError()
68 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/recur_cells/cif_lstm_cell.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from parser.neural.recur_cells.base_cell import BaseCell
25 | from parser.neural.linalg import linear
26 | from parser.neural.functions import gate
27 |
28 | #***************************************************************
29 | class CifLSTMCell(BaseCell):
30 | """"""
31 |
32 | #=============================================================
33 | def __call__(self, inputs, state, scope=None):
34 | """"""
35 |
36 | with tf.variable_scope(scope or type(self).__name__):
37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1)
38 | input_list = [inputs, hidden_tm1]
39 | lin = linear(input_list,
40 | self.output_size,
41 | add_bias=True,
42 | n_splits=3,
43 | moving_params=self.moving_params)
44 | cell_act, update_act, output_act = lin
45 |
46 | cell_tilde_t = cell_act
47 | update_gate = gate(update_act-self.forget_bias)
48 | output_gate = gate(output_act)
49 | cell_t = update_gate * cell_tilde_t + (1-update_gate) * cell_tm1
50 | hidden_tilde_t = self.recur_func(cell_t)
51 | hidden_t = hidden_tilde_t * output_gate
52 |
53 | return hidden_t, tf.concat([cell_t, hidden_t], 1)
54 |
55 | #=============================================================
56 | @property
57 | def state_size(self):
58 | return self.output_size * 2
59 |
60 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/recur_cells/gru_cell.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from parser.neural.recur_cells.base_cell import BaseCell
25 | from parser.neural.linalg import linear
26 | from parser.neural.functions import gate
27 |
28 | #***************************************************************
29 | class GRUCell(BaseCell):
30 | """"""
31 |
32 | #=============================================================
33 | def __call__(self, inputs, state, scope=None):
34 | """"""
35 |
36 | with tf.variable_scope(scope or type(self).__name__):
37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1)
38 | input_list = [inputs, hidden_tm1]
39 | with tf.variable_scope('Gates'):
40 |         gates = linear(input_list,
41 | self.output_size,
42 | add_bias=True,
43 | n_splits=2,
44 | moving_params=self.moving_params)
45 | update_act, reset_act = gates
46 | update_gate = gate(update_act-self.forget_bias)
47 | reset_gate = gate(reset_act)
48 | reset_state = reset_gate * hidden_tm1
49 | input_list = [inputs, reset_state]
50 | with tf.variable_scope('Candidate'):
51 | hidden_act = linear(input_list,
52 | self.output_size,
53 | add_bias=True,
54 | moving_params=self.moving_params)
55 | hidden_tilde = self.recur_func(hidden_act)
56 | cell_t = update_gate * cell_tm1 + (1-update_gate) * hidden_tilde
57 | return cell_t, tf.concat([cell_t, cell_t], 1)
58 |
59 | #=============================================================
60 | @property
61 | def state_size(self):
62 | return self.output_size * 2
63 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/recur_cells/lstm_cell.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from parser.neural.recur_cells.base_cell import BaseCell
25 | from parser.neural.linalg import linear
26 | from parser.neural.functions import gate, tanh
27 |
28 | #***************************************************************
29 | class LSTMCell(BaseCell):
30 | """"""
31 |
32 | #=============================================================
33 | def __call__(self, inputs, state, scope=None):
34 | """"""
35 |
36 | with tf.variable_scope(scope or type(self).__name__):
37 | cell_tm1, hidden_tm1 = tf.split(state, 2, axis=1)
38 | input_list = [inputs, hidden_tm1]
39 | lin = linear(input_list,
40 | self.output_size,
41 | add_bias=True,
42 | n_splits=4,
43 | moving_params=self.moving_params)
44 | cell_act, input_act, forget_act, output_act = lin
45 |
46 | cell_tilde_t = tanh(cell_act)
47 | input_gate = gate(input_act)
48 | forget_gate = gate(forget_act-self.forget_bias)
49 | output_gate = gate(output_act)
50 | cell_t = input_gate * cell_tilde_t + (1-forget_gate) * cell_tm1
51 | hidden_tilde_t = self.recur_func(cell_t)
52 | hidden_t = hidden_tilde_t * output_gate
53 |
54 | return hidden_t, tf.concat([cell_t, hidden_t], 1)
55 |
56 | #=============================================================
57 | @property
58 | def state_size(self):
59 | return self.output_size * 2
60 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/recur_cells/rnn_cell.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import tensorflow as tf
23 |
24 | from parser.neural.recur_cells.base_cell import BaseCell
25 | from parser.neural.linalg import linear
26 |
27 | #***************************************************************
28 | class RNNCell(BaseCell):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, inputs, state, scope=None):
33 | """"""
34 |
35 | with tf.variable_scope(scope or type(self).__name__):
36 | inputs_list = [inputs, state]
37 | hidden_act = linear(inputs_list,
38 | self.output_size,
39 | add_bias=True,
40 | moving_params=self.moving_params)
41 | hidden = self.recur_func(hidden_act)
42 | return hidden, hidden
43 |
44 | #=============================================================
45 | @property
46 | def state_size(self):
47 | return self.output_size
48 |
49 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/neural/rnn.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """RNN helpers for TensorFlow models."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | import parser.neural.linalg as linalg
26 |
27 | #===============================================================
28 | def birnn(cell, inputs, sequence_length, initial_state_fw=None, initial_state_bw=None, ff_keep_prob=1., recur_keep_prob=1., dtype=tf.float32, scope=None):
29 | """"""
30 |
31 | # Forward direction
32 | with tf.variable_scope(scope or 'BiRNN_FW') as fw_scope:
33 | output_fw, output_state_fw = rnn(cell, inputs, sequence_length, initial_state_fw, ff_keep_prob, recur_keep_prob, dtype, scope=fw_scope)
34 |
35 | # Backward direction
36 | rev_inputs = tf.reverse_sequence(inputs, sequence_length, 1, 0)
37 | with tf.variable_scope(scope or 'BiRNN_BW') as bw_scope:
38 | output_bw, output_state_bw = rnn(cell, rev_inputs, sequence_length, initial_state_bw, ff_keep_prob, recur_keep_prob, dtype, scope=bw_scope)
39 | output_bw = tf.reverse_sequence(output_bw, sequence_length, 1, 0)
40 | # Concat each of the forward/backward outputs
41 | outputs = tf.concat([output_fw, output_bw], 2)
42 |
43 | return outputs, tf.tuple([output_state_fw, output_state_bw])
44 |
45 | #===============================================================
46 | def rnn(cell, inputs, sequence_length=None, initial_state=None, ff_keep_prob=1., recur_keep_prob=1., dtype=tf.float32, scope=None):
47 | """"""
48 |
49 | inputs = tf.transpose(inputs, [1, 0, 2]) # (B,T,D) => (T,B,D)
50 |
51 | parallel_iterations = 32
52 | if sequence_length is not None:
53 | sequence_length = tf.to_int32(sequence_length)
54 |
55 | with tf.variable_scope(scope or 'RNN') as varscope:
56 | #if varscope.caching_device is None:
57 | # varscope.set_caching_device(lambda op: op.device)
58 | input_shape = tf.shape(inputs)
59 | time_steps, batch_size, _ = tf.unstack(input_shape, 3)
60 | const_time_steps, const_batch_size, const_depth = inputs.get_shape().as_list()
61 |
62 | if initial_state is not None:
63 | state = initial_state
64 | else:
65 | if not dtype:
66 | raise ValueError('If no initial_state is provided, dtype must be.')
67 | state = cell.zero_state(batch_size, dtype)
68 |
69 | zero_output = tf.zeros(tf.stack([batch_size, cell.output_size]), inputs.dtype)
70 | if sequence_length is not None:
71 | min_sequence_length = tf.reduce_min(sequence_length)
72 | max_sequence_length = tf.reduce_max(sequence_length)
73 |
74 | time = tf.constant(0, dtype=tf.int32, name='time')
75 |
76 | output_ta = tf.TensorArray(dtype=inputs.dtype,
77 | size=time_steps,
78 | tensor_array_name='dynamic_rnn_output')
79 |
80 | input_ta = tf.TensorArray(dtype=inputs.dtype,
81 | size=time_steps,
82 | tensor_array_name='dynamic_rnn_input')
83 |
84 | if ff_keep_prob < 1:
85 | noise_shape = tf.stack([1, batch_size, const_depth])
86 | inputs = tf.nn.dropout(inputs, ff_keep_prob, noise_shape=noise_shape)
87 |
88 | if recur_keep_prob < 1:
89 | ones = tf.ones(tf.stack([batch_size, cell.output_size]))
90 | state_dropout = tf.nn.dropout(ones, recur_keep_prob)
91 | state_dropout = tf.concat([ones] * (cell.state_size // cell.output_size - 1) + [state_dropout], 1)
92 | else:
93 | state_dropout = 1
94 |
95 | input_ta = input_ta.unstack(inputs)
96 |
97 | #-----------------------------------------------------------
98 | def _time_step(time, state, output_ta_t):
99 | """"""
100 |
101 | input_t = input_ta.read(time)
102 |
103 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
104 | def _empty_update():
105 | return zero_output, state
106 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
107 | def _call_cell():
108 | return cell(input_t, state * state_dropout)
109 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
110 | def _maybe_copy_some_through():
111 | new_output, new_state = _call_cell()
112 |
113 | return tf.cond(
114 | time < min_sequence_length,
115 | lambda: (new_output, new_state),
116 | lambda: (tf.where(time >= sequence_length, zero_output, new_output),
117 | tf.where(time >= sequence_length, state, new_state)))
118 | #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
119 |
120 | if sequence_length is not None:
121 | output, new_state = tf.cond(
122 | time >= max_sequence_length,
123 | _empty_update,
124 | _maybe_copy_some_through)
125 | else:
126 | (output, new_state) = _call_cell()
127 |
128 | output_ta_t = output_ta_t.write(time, output)
129 |
130 | return (time + 1, new_state, output_ta_t)
131 | #-----------------------------------------------------------
132 |
133 | _, final_state, output_final_ta = tf.while_loop(
134 | cond=lambda time, _1, _2: time < time_steps,
135 | body=_time_step,
136 | loop_vars=(time, state, output_ta),
137 | parallel_iterations=parallel_iterations)
138 |
139 | final_outputs = output_final_ta.stack()
140 |
141 | outputs = tf.transpose(final_outputs, [1, 0, 2]) # (T,B,D) => (B,T,D)
142 | return outputs, final_state
143 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/scripts/compression_ratio.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import re
24 | import argparse
25 | import codecs
26 | from backports import lzma
27 |
28 | import numpy as np
29 | from numpy.linalg import inv
30 | import matplotlib.pyplot as plt
31 | from collections import Counter
32 |
33 | #***************************************************************
34 | if __name__ == '__main__':
35 | """"""
36 |
37 | parser = argparse.ArgumentParser()
38 | parser.add_argument('-k', '--k_trials', type=int, default=100)
39 | parser.add_argument('-n', '--n_words', type=int, default=5000)
40 | parser.add_argument('files', nargs='+')
41 |
42 | args = parser.parse_args()
43 | type_counter = Counter()
44 | for filename in args.files:
45 | with codecs.open(filename, encoding='utf-8', errors='ignore') as f:
46 | for line in f:
47 | line = line.strip()
48 | if line:
49 | if not re.match('#|[0-9]+[-.][0-9]+', line):
50 | type_counter[line.split('\t')[1]] += 1
51 |
52 | types = type_counter.keys()
53 | total = sum(type_counter.values())
54 | probs = [type_counter[type_] / total for type_ in types]
55 |
56 | trials = []
57 | n_words = min(args.n_words, len(types)) or len(types)
58 | for _ in xrange(args.k_trials):
59 | chosen_types = np.random.choice(types, size=n_words, replace=False, p=probs)
60 | with codecs.open('uncompressed.txt', 'w', encoding='utf-8', errors='ignore') as f:
61 | f.write('\n'.join(chosen_types))
62 | with lzma.open('compressed.txt.xz', 'wb') as f:
63 | f.write('\n'.join(chosen_types).encode('utf-8', 'ignore'))
64 | trials.append(os.path.getsize('compressed.txt.xz')/os.path.getsize('uncompressed.txt'))
65 | os.remove('uncompressed.txt')
66 | os.remove('compressed.txt.xz')
67 | print(np.mean(trials))
68 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/scripts/count_nonprojective.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import re
24 | import argparse
25 |
26 | import numpy as np
27 | from collections import defaultdict
28 |
29 | #***************************************************************
30 | class DepTree:
31 | """"""
32 |
33 | #=============================================================
34 | def __init__(self, buff):
35 | """"""
36 |
37 | self._head2deps = defaultdict(list)
38 | self._dep2head = dict()
39 | self._str = []
40 | for line in buff:
41 | dep_idx = int(line[0])
42 | head_idx = int(line[6])
43 | self.head2deps[head_idx].append(dep_idx)
44 | self.dep2head[dep_idx] = head_idx
45 | self._str.append(line[1])
46 | return
47 |
48 | #=============================================================
49 | def count_nonprojective(self):
50 | """"""
51 |
52 | nonproj = []
53 | for dep in self:
54 | head = self.dep2head[dep]
55 | span_min = min(dep, head)
56 | span_max = max(dep, head)
57 |       for mid_dep in xrange(span_min+1, span_max):  # scan the words strictly inside the arc's span
58 |         mid_head = self.dep2head[mid_dep]
59 |         if mid_head < span_min or mid_head > span_max:  # an inner word is headed outside the span: the arcs cross
60 |           crossing = True
61 |           break
62 |       else:  # loop ended without a break, so no crossing arc was found
63 |         crossing = False
64 |       nonproj.append(int(crossing))
65 | return nonproj
66 |
67 | #=============================================================
68 | @property
69 | def head2deps(self):
70 | return self._head2deps
71 | @property
72 | def dep2head(self):
73 | return self._dep2head
74 |
75 | #=============================================================
76 | def __iter__(self):
77 | return (dep for dep in self.dep2head)
78 | def __len__(self):
79 | return len(self.dep2head)
80 | def __str__(self):
81 | return ' '.join(self._str)+'\n'
82 |
83 | #***************************************************************
84 | if __name__ == '__main__':
85 | """"""
86 |
87 | parser = argparse.ArgumentParser()
88 | parser.add_argument('files', nargs='+')
89 |
90 | args = parser.parse_args()
91 | for filename in args.files:
92 | lang = re.search('([-\w]*)-ud', filename).group(1)
93 | nonproj = []
94 | with open(filename) as f:
95 | buff = []
96 | for line in f:
97 | line = line.strip()
98 | if line:
99 | if not re.match('#|[0-9]+[-.][0-9]+', line):
100 | buff.append(line.split('\t'))
101 | else:
102 | tree = DepTree(buff)
103 | nonproj.extend(tree.count_nonprojective())
104 | buff = []
105 | print(lang, np.mean(nonproj)*100)
106 |
--------------------------------------------------------------------------------
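Usage sketch for the script above (the file name is hypothetical, but it must contain a "<language>-ud" substring, which the script extracts with a regular expression):

    python parser/scripts/count_nonprojective.py en-ud-dev.conllu

For each file, the script prints the language code and the percentage of dependencies that are non-projective, i.e. arcs spanning at least one word whose own head lies outside the arc's span.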
/StanfordBiaffineParser-v2/parser/scripts/heaps_law.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import re
24 | import argparse
25 |
26 | import numpy as np
27 | from numpy.linalg import inv
28 | import matplotlib.pyplot as plt
29 | from collections import defaultdict
30 |
31 | #***************************************************************
32 | if __name__ == '__main__':
33 | """"""
34 |
35 | parser = argparse.ArgumentParser()
36 | parser.add_argument('files', nargs='+')
37 |
38 | args = parser.parse_args()
39 | words = []
40 | types = set()
41 | n_types = []
42 | for filename in args.files:
43 | with open(filename) as f:
44 | for line in f:
45 | line = line.strip()
46 | if line:
47 | if not re.match('#|[0-9]+[-.][0-9]+', line):
48 | words.append(line.split('\t')[1])
49 | np.random.shuffle(words)
50 | for word in words:
51 | types.add(word)
52 | n_types.append(len(types))
53 |
54 |   K = 1   # initial guess for the log-coefficient in Heaps' law V = exp(K) * N**b
55 |   b = .75 # initial guess for the Heaps exponent
56 |   y = n_types
57 |   logy = np.log(y)
58 |   x = np.arange(len(n_types))+1
59 |   logx = np.log(x)
60 |   d2ell = np.array([[1, np.mean(logx)],[np.mean(logx), np.mean(logx**2)]])  # Hessian of the least-squares objective in (K, b)
61 |   d2ellinv = inv(d2ell)
62 |   ell = np.mean((logy - b*logx-K)**2 / 2)
63 |   dell = np.array([np.mean(K+b*logx-logy), np.mean((K+b*logx-logy)*logx)])  # gradient of the objective
64 |   updates = d2ellinv.dot(dell)
65 |   K -= updates[0]  # one Newton step, exact for a quadratic objective
66 |   b -= updates[1]
67 |   print(b)  # fitted Heaps exponent
68 | #K_ = 5
69 | #b_ = .74
70 | #for i in xrange(20):
71 | # ell = np.mean((y - K_*x**b_)**2 / 2)
72 | # K_ -= 2*np.mean((K_*x**b_-y)*x**b_) / np.mean(x**(2*b_))
73 | # b_ -= 2*np.mean((K_*x**b_-y)*K_*x**b_*logx) / np.mean((2*K_*x**b_ - y)*K_*x**b_*logx**2)
74 | # print(ell, K_, b_)
75 | #plt.figure()
76 | #plt.grid()
77 | #plt.plot(x, y)
78 | #plt.plot(x, np.exp(b*logx+K))
79 | #plt.show()
80 | #plt.figure()
81 | #plt.grid()
82 | #plt.plot(x, logy - b*logx-K)
83 | #plt.show()
84 | #plt.figure()
85 | #plt.grid()
86 | #plt.plot(x, y - K_*x**b_)
87 | #plt.show()
88 |
--------------------------------------------------------------------------------
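Note on the fit above (a sketch of the reasoning, not part of the repository): the script estimates Heaps' law, V(N) ~ exp(K) * N**b, where V(N) is the number of distinct word types observed after N running tokens. Taking logs gives log V ~ K + b*log N; since the resulting squared-error objective is quadratic in (K, b), the single Newton step in the code (Hessian d2ell, gradient dell) lands exactly on the least-squares estimate, and the printed value is the fitted exponent b.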
/StanfordBiaffineParser-v2/parser/scripts/reinsert_compounds.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import sys
7 | import codecs
8 |
9 | input_file = sys.argv[2]   # original CoNLL-style file, still containing comment, multi-word ("1-2") and empty-node ("1.1") lines
10 | output_file = sys.argv[1]  # parser output; rewritten in place with the multi-word token lines reinserted
11 |
12 | lines = []
13 |
14 | with codecs.open(output_file, encoding='utf-8') as f:
15 | for line in f:
16 | lines.append(line)
17 |
18 | with codecs.open(input_file, encoding='utf-8') as f:
19 | with codecs.open(output_file, 'w', encoding='utf-8') as fout:
20 | i = 0
21 | for line in f:
22 | line = line.strip()
23 |
24 | if len(line) == 0:
25 | fout.write(lines[i])
26 | i += 1
27 | continue
28 |
29 | if line[0] == '#':
30 | continue
31 |
32 | line = line.split('\t')
33 | if '.' in line[0]:
34 | continue
35 |
36 | if '-' in line[0]:
37 | fout.write('%s\n' % ('\t'.join(line)))
38 | continue
39 |
40 | fout.write(lines[i])
41 | i += 1
42 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/trash/retrained_vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import codecs
24 | from collections import Counter
25 |
26 | import numpy as np
27 | import scipy.linalg as la
28 | import tensorflow as tf
29 |
30 | from parser.vocabs.base_vocab import BaseVocab
31 |
32 | #***************************************************************
33 | class RetrainedVocab(BaseVocab):
34 | """"""
35 |
36 | #=============================================================
37 | def __init__(self, pretrained_vocab, *args, **kwargs):
38 | """"""
39 |
40 | super(RetrainedVocab, self).__init__(*args, **kwargs)
41 |
42 | self._pretrained_vocab = pretrained_vocab
43 | return
44 |
45 | #=============================================================
46 | def __call__(self):
47 | """"""
48 |
49 | embed_size = self.embed_size
50 | row_idxs = tf.placeholder(tf.int32, shape=(None,), name='row_idxs')
51 | col_idxs = tf.placeholder(tf.int32, shape=(None,), name='col_idxs')
52 | S, U, _ = tf.svd(self.pretrained_vocab.embeddings)
53 | self.embeddings = U[:,:embed_size] * S[:embed_size]
54 |
55 | old_rows = tf.gather(self.pretrained_vocab.embeddings, row_idxs)
56 | old_cols = tf.gather(self.pretrained_vocab.embeddings, col_idxs)
57 | new_rows = tf.gather(self.embeddings, row_idxs)
58 | new_cols = tf.gather(self.embeddings, col_idxs)
59 | old_matmul = tf.matmul(old_rows, old_cols, transpose_b=True)
60 | new_matmul = tf.matmul(new_rows, new_cols, transpose_b=True)
61 |
62 | if self.embed_loss == 'cross_entropy':
63 | old_matmul = tf.expand_dims(tf.nn.softmax(old_matmul), axis=1)
64 | new_matmul = tf.expand_dims(tf.nn.softmax(new_matmul), axis=2)
65 | loss = -tf.reduce_sum(tf.matmul(old_matmul, tf.log(new_matmul))) / tf.to_float(tf.shape(row_idxs)[0])
66 | elif self.embed_loss == 'l2_loss':
67 | loss = tf.reduce_sum((old_matmul - new_matmul)**2 / 2) / tf.to_float(tf.shape(row_idxs)[0])
68 | else:
69 | raise ValueError('embed_loss must be in "(cross_entropy, l2_loss)"')
70 |
71 | return {'row_idxs': row_idxs,
72 | 'col_idxs': col_idxs,
73 | 'loss': loss}
74 |
75 | #=============================================================
76 | def dump(self):
77 | """"""
78 |
79 | matrix = self.embeddings.eval()
80 |     with codecs.open(self.name+'.txt', 'w', encoding='utf-8') as f:
81 |       for idx in xrange(self.START_IDX, len(self)):
82 |         f.write('%s %s\n' % (self[idx], ' '.join(str(v) for v in matrix[idx])))
83 | return
84 |
85 | #=============================================================
86 | @property
87 | def pretrained_vocab(self):
88 | return self._pretrained_vocab
89 |
90 | #=============================================================
91 | def __setattr__(self, name, value):
92 | if name == '_pretrained_vocab':
93 | self._str2idx = value._str2idx
94 | self._idx2str = value._idx2str
95 | self._counts = value._counts
96 | super(RetrainedVocab, self).__setattr__(name, value)
97 |
98 | #***************************************************************
99 | if __name__ == '__main__':
100 | """"""
101 |
102 | from parser import Configurable
103 | from parser.vocabs import PretrainedVocab
104 | configurable = Configurable(retrained_vocab={'embed_loss':'cross_entropy', 'retrained_embed_size':50})
105 | pretrained_vocab = PretrainedVocab.from_configurable(configurable)
106 | retrained_vocab = RetrainedVocab.from_vocab(pretrained_vocab)
107 |   retrain_loss = retrained_vocab()  # __call__ takes no arguments
108 | print('RetrainedVocab passes')
109 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/trash/weighted_mean.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import numpy as np
23 | import tensorflow as tf
24 |
25 | from parser.neural.models import NN
26 |
27 | #***************************************************************
28 | class WeightedMean(NN):
29 | """"""
30 |
31 | #=============================================================
32 | def __call__(self, vocab, output_size, moving_params=None):
33 | """"""
34 |
35 | inputs = tf.placeholder(tf.int32, shape=(None,None), name='inputs-%s' % self.name)
36 |
37 | self.tokens_to_keep = tf.to_float(tf.greater(inputs, vocab.PAD))
38 | self.sequence_lengths = tf.reduce_sum(self.tokens_to_keep, axis=1, keep_dims=True)
39 | self.n_tokens = tf.reduce_sum(self.sequence_lengths)
40 | self.batch_size = tf.shape(inputs)[0]
41 | self.bucket_size = tf.shape(inputs)[1]
42 | self.moving_params = moving_params
43 |
44 | embeddings = vocab.embedding_lookup(inputs, moving_params=self.moving_params)
45 | weighted_embeddings = self.linear_attention(embeddings)
46 | mlp = self.MLP(weighted_embeddings, self.mlp_size)
47 | lin = self.linear(mlp, output_size)
48 |
49 | return {'output': lin, 'inputs': inputs}
50 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/__init__.py:
--------------------------------------------------------------------------------
1 | from index_vocab import IndexVocab, DepVocab, HeadVocab
2 | from pretrained_vocab import PretrainedVocab
3 | from token_vocab import TokenVocab, WordVocab, LemmaVocab, TagVocab, XTagVocab, RelVocab
4 | from subtoken_vocab import SubtokenVocab, CharVocab
5 | from ngram_vocab import NgramVocab
6 | from multivocab import Multivocab
7 | from ngram_multivocab import NgramMultivocab
8 |
9 | __all__ = [
10 | 'DepVocab',
11 | 'HeadVocab',
12 | 'PretrainedVocab',
13 | 'WordVocab',
14 | 'LemmaVocab',
15 | 'TagVocab',
16 | 'XTagVocab',
17 | 'RelVocab',
18 | 'CharVocab',
19 | 'NgramVocab',
20 | 'Multivocab',
21 | 'NgramMultivocab'
22 | ]
23 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/base_vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import re
24 | from collections import Counter
25 |
26 | import numpy as np
27 | import tensorflow as tf
28 |
29 | import parser.neural.linalg as linalg
30 | from parser import Configurable
31 |
32 | #***************************************************************
33 | class BaseVocab(Configurable):
34 | """"""
35 |
36 | #=============================================================
37 | def __init__(self, *args, **kwargs):
38 | """"""
39 |
40 | super(BaseVocab, self).__init__(*args, **kwargs)
41 |
42 | self._cased = super(BaseVocab, self).cased
43 | self._special_tokens = super(BaseVocab, self).special_tokens
44 | self._special_tokens_set = set(self._special_tokens)
45 | self._set_special_tokens()
46 | # NOTE: __setattr__ turns these into dicts
47 | self._str2idx = zip(self.special_tokens, range(len(self.special_tokens)))
48 | self._idx2str = zip(range(len(self.special_tokens)), self.special_tokens)
49 | self._tok2idx = self._str2idx
50 | self._counts = None
51 | self._embeddings = None
52 | # NOTE this placeholder stores the token data indices
53 | # I.e. the token's index in the word/tag/glove embedding matrix
54 | # CharVocab will by default be "char"
55 | self.placeholder = None
56 |
57 | #=============================================================
58 | def _set_special_tokens(self):
59 | pattern = re.compile('\W+', re.UNICODE)
60 | for i, token in enumerate(self.special_tokens):
61 | token = token.lstrip('<')
62 | token = token.rstrip('>')
63 | token = token.upper()
64 | token = pattern.sub('', token)
65 | assert token not in self.__dict__
66 | self.__dict__[token] = i
67 | return
68 |
69 | #=============================================================
70 | @classmethod
71 | def from_vocab(cls, vocab, *args, **kwargs):
72 | """"""
73 |
74 | args += (vocab,)
75 | return cls.from_configurable(vocab, *args, **kwargs)
76 |
77 | #=============================================================
78 | def generate_placeholder(self):
79 | """"""
80 |
81 | if self.placeholder is None:
82 | self.placeholder = tf.placeholder(tf.int32, shape=[None, None], name=self.name)
83 | return self.placeholder
84 |
85 | #=============================================================
86 | def __call__(self, placeholder=None, moving_params=None):
87 | """"""
88 |
89 | placeholder = self.generate_placeholder() if placeholder is None else placeholder
90 | embeddings = self.embeddings if moving_params is None else moving_params.average(self.embeddings)
91 | return tf.nn.embedding_lookup(embeddings, placeholder)
92 |
93 | #=============================================================
94 | def setup(self):
95 | """"""
96 |
97 | self.placeholder = None
98 | return
99 |
100 | #=============================================================
101 | def set_feed_dict(self, data, feed_dict):
102 | """"""
103 |
104 | feed_dict[self.placeholder] = data
105 | return
106 |
107 | #=============================================================
108 | def load(self):
109 | raise NotImplementedError()
110 | def dump(self):
111 | raise NotImplementedError()
112 | def count(self):
113 | raise NotImplementedError()
114 |
115 | #=============================================================
116 | def strings(self):
117 | return self._str2idx.keys()
118 | def indices(self):
119 | return self._str2idx.values()
120 | def iteritems(self):
121 | return self._str2idx.iteritems()
122 | def most_common(self, n=None):
123 | return self._counts.most_common(n)
124 | def index(self, token):
125 | if not self.cased and token not in self._special_tokens_set:
126 | token = token.lower()
127 | return self._tok2idx.get(token, self.UNK)
128 |
129 | #=============================================================
130 | @property
131 | def depth(self):
132 | return None
133 | @property
134 | def special_tokens(self):
135 | return self._special_tokens
136 | @property
137 | def cased(self):
138 | return self._cased
139 | @property
140 | def counts(self):
141 | return self._counts
142 | @property
143 | def embeddings(self):
144 | return self._embeddings
145 | #@embeddings.setter
146 | #def embeddings(self, matrix):
147 | # if matrix.shape[1] != self.embed_size:
148 | # raise ValueError("Matrix shape[1] of %d doesn't match expected shape of %d" % (matrix.shape[1], self.embed_size))
149 | # with tf.device('/cpu:0'):
150 | # with tf.variable_scope(self.name.title()):
151 | # self._embeddings = tf.Variable(matrix, name='Embeddings', dtype=tf.float32, trainable=True)
152 | # return
153 |
154 | #=============================================================
155 | def __getitem__(self, key):
156 | if isinstance(key, basestring):
157 | if not self.cased and key not in self._special_tokens_set:
158 | key = key.lower()
159 | return self._str2idx.get(key, self.UNK)
160 | elif isinstance(key, (int, long, np.int32, np.int64)):
161 | return self._idx2str.get(key, self.special_tokens[self.UNK])
162 | elif hasattr(key, '__iter__'):
163 | return [self[k] for k in key]
164 | else:
165 | raise ValueError('key to BaseVocab.__getitem__ must be (iterable of) string or integer')
166 | return
167 |
168 | def __setitem__(self, key, value):
169 | if isinstance(key, basestring):
170 | if not self.cased and key not in self._special_tokens_set:
171 | key = key.lower()
172 | self._str2idx[key] = value
173 | self._idx2str[value] = key
174 | elif isinstance(key, (int, long)):
175 | if not self.cased and value not in self._special_tokens_set:
176 | value = value.lower()
177 | self._idx2str[key] = value
178 | self._str2idx[value] = key
179 | elif hasattr(key, '__iter__') and hasattr(value, '__iter__'):
180 | for k, v in zip(key, value):
181 | self[k] = v
182 | else:
183 | raise ValueError('keys and values to BaseVocab.__setitem__ must be (iterable of) string or integer')
184 |
185 | def __contains__(self, key):
186 | if isinstance(key, basestring):
187 | if not self.cased and key not in self._special_tokens_set:
188 | key = key.lower()
189 | return key in self._str2idx
190 | elif isinstance(key, (int, long)):
191 | return key in self._idx2str
192 | else:
193 | raise ValueError('key to BaseVocab.__contains__ must be string or integer')
194 | return
195 |
196 | def __len__(self):
197 | return len(self._str2idx)
198 |
199 | def __iter__(self):
200 | return (key for key in sorted(self._str2idx, key=self._str2idx.get))
201 |
202 | def __setattr__(self, name, value):
203 | if name in ('_str2idx', '_idx2str', '_str2idxs'):
204 | value = dict(value)
205 | elif name == '_counts':
206 | value = Counter(value)
207 | super(BaseVocab, self).__setattr__(name, value)
208 | return
209 |
210 | #***************************************************************
211 | if __name__ == '__main__':
212 | """"""
213 |
214 | base_vocab = BaseVocab()
215 | print('BaseVocab passes')
216 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/index_vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import re
24 | import sys
25 | from collections import Counter
26 |
27 | import numpy as np
28 | import tensorflow as tf
29 |
30 | from parser import Configurable
31 |
32 | __all__ = ['DepVocab', 'HeadVocab']
33 |
34 | #***************************************************************
35 | class IndexVocab(Configurable):
36 | """"""
37 |
38 | ROOT = 0
39 |
40 | #=============================================================
41 | def __init__(self, *args, **kwargs):
42 | """"""
43 |
44 | super(IndexVocab, self).__init__(*args, **kwargs)
45 | self.placeholder = None
46 |
47 | #=============================================================
48 | def generate_placeholder(self):
49 | """"""
50 |
51 | if self.placeholder is None:
52 | self.placeholder = tf.placeholder(tf.int32, shape=[None, None], name=self.name)
53 | return self.placeholder
54 |
55 | #=============================================================
56 | def set_feed_dict(self, data, feed_dict):
57 | """"""
58 |
59 | feed_dict[self.placeholder] = data
60 | return
61 |
62 | #=============================================================
63 | def setup(self):
64 | self.placeholder = None
65 | return
66 |
67 | #=============================================================
68 | def index(self, token):
69 | return 0 if token == '_' else int(token)
70 |
71 | #=============================================================
72 | @property
73 | def depth(self):
74 | return None
75 | @property
76 | def conll_idx(self):
77 | return self._conll_idx
78 |
79 | #=============================================================
80 | def __getitem__(self, key):
81 | if isinstance(key, basestring):
82 | return int(key)
83 | elif isinstance(key, (int, long, np.int32, np.int64)):
84 | return str(key)
85 | elif hasattr(key, '__iter__'):
86 | return [self[k] for k in key]
87 | else:
88 | raise ValueError('key to BaseVocab.__getitem__ must be (iterable of) string or integer')
89 | return
90 |
91 | #***************************************************************
92 | class DepVocab(IndexVocab):
93 | _conll_idx = 0
94 | class HeadVocab(IndexVocab):
95 | _conll_idx = 6
96 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/multivocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import re
24 | import codecs
25 | from collections import Counter
26 |
27 | import numpy as np
28 | import tensorflow as tf
29 |
30 | from parser import Configurable
31 | from parser.neural import linalg
32 | from parser.vocabs import TokenVocab, SubtokenVocab
33 |
34 | __all__ = ['Multivocab']
35 |
36 | #***************************************************************
37 | class Multivocab(Configurable):
38 | """"""
39 |
40 | #=============================================================
41 | def __init__(self, vocabs, *args, **kwargs):
42 | """"""
43 |
44 | super(Multivocab, self).__init__(*args, **kwargs)
45 |
46 | self._vocabs = vocabs
47 | self._set_special_tokens()
48 | # NOTE Don't forget to run index_tokens() after adding test/validation files!
49 | self.placeholder = None
50 | return
51 |
52 | #=============================================================
53 | def __call__(self, placeholder=None, moving_params=None):
54 | """"""
55 | # TODO check to see if a word is all unk, and if so, replace it with a random vector
56 |
57 | embeddings = [vocab(moving_params=moving_params) for vocab in self]
58 | return tf.add_n(embeddings)
59 |
60 | #=============================================================
61 | def setup(self):
62 | """"""
63 |
64 | self.placeholder = None
65 | for vocab in self:
66 | vocab.setup()
67 | return
68 |
69 | #=============================================================
70 | def generate_placeholder(self):
71 | """"""
72 |
73 | if self.placeholder is None:
74 | self.placeholder = tf.stack([vocab.generate_placeholder() for vocab in self], axis=2)
75 | return self.placeholder
76 |
77 | #=============================================================
78 | def _set_special_tokens(self):
79 | pattern = re.compile('\W+', re.UNICODE)
80 | self._special_tokens = zip(*[vocab.special_tokens for vocab in self])
81 | for i, token in enumerate(self.special_tokens):
82 | n = len(token)
83 | assert len(set(token)) == 1
84 | token = token[0]
85 | token = token.lstrip('<')
86 | token = token.rstrip('>')
87 | token = token.upper()
88 | token = pattern.sub('', token)
89 | assert token not in self.__dict__
90 | self.__dict__[token] = tuple(i for _ in xrange(n))
91 | return
92 |
93 | #=============================================================
94 | def add_files(self, conll_files):
95 | """"""
96 |
97 | conll_files = list(conll_files)
98 | token_vocabs = []
99 | for vocab in self:
100 | if hasattr(vocab, 'token_vocab'):
101 | if vocab.token_vocab not in token_vocabs:
102 | vocab.token_vocab.count(conll_files)
103 | token_vocabs.append(vocab.token_vocab)
104 | return
105 |
106 | #=============================================================
107 | def index_tokens(self):
108 | """"""
109 |
110 | for vocab in self:
111 | if hasattr(vocab, 'index_tokens'):
112 | vocab.index_tokens()
113 | return
114 |
115 | #=============================================================
116 | def set_feed_dict(self, data, feed_dict):
117 | """"""
118 |
119 | for i, vocab in enumerate(self):
120 | vocab.set_feed_dict(data[:,:,i], feed_dict)
121 | return
122 |
123 | #=============================================================
124 | def index(self, token):
125 | return tuple(vocab.index(token) for vocab in self)
126 |
127 | #=============================================================
128 | @property
129 | def depth(self):
130 | return len(self)
131 | @property
132 | def special_tokens(self):
133 | return self._special_tokens
134 | @property
135 | def conll_idx(self):
136 | return self._conll_idx
137 |
138 | #=============================================================
139 | def __iter__(self):
140 | return (vocab for vocab in self._vocabs)
141 | def __getitem__(self, key):
142 | return self._vocabs[key]
143 | def __len__(self):
144 | return len(self._vocabs)
145 | def __setattr__(self, key, value):
146 | if key == '_vocabs':
147 | conll_idxs = set([vocab.conll_idx for vocab in value if hasattr(vocab, 'conll_idx')])
148 | assert len(conll_idxs) == 1
149 | self._conll_idx = list(conll_idxs)[0]
150 | super(Multivocab, self).__setattr__(key, value)
151 |
152 | #***************************************************************
153 | if __name__ == '__main__':
154 | """"""
155 |
156 | from parser.vocabs import PretrainedVocab, WordVocab, CharVocab, Multivocab
157 |
158 | configurable = Configurable()
159 | token_vocab = WordVocab.from_configurable(configurable)
160 | pretrained_vocab = PretrainedVocab.from_vocab(token_vocab)
161 | subtoken_vocab = CharVocab.from_vocab(token_vocab)
162 | multivocab = Multivocab.from_configurable(configurable, [pretrained_vocab, token_vocab, subtoken_vocab])
163 | multivocab.add_files(configurable.valid_files)
164 | multivocab.index_tokens()
165 | print("Indices for '': %s" % str(multivocab.index('')))
166 | print("Indices for 'the': %s" % str(multivocab.index('the')))
167 | print("Indices for 'The': %s" % str(multivocab.index('The')))
168 | print('Multivocab passes')
169 |
170 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/ngram_multivocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import sys
24 | import codecs
25 | from collections import Counter
26 |
27 | import numpy as np
28 | import tensorflow as tf
29 |
30 | from parser import Configurable, Multibucket
31 | from parser.vocabs.base_vocab import BaseVocab
32 | from parser.vocabs import SubtokenVocab, NgramVocab, Multivocab
33 | from parser.misc.bucketer import Bucketer
34 | from parser.neural import linalg as la  # assumed to provide random_where(); `la` was otherwise undefined below
35 | __all__ = ['NgramMultivocab']
36 |
37 | #***************************************************************
38 | class NgramMultivocab(Multivocab, SubtokenVocab):
39 | """"""
40 |
41 | #=============================================================
42 | def __init__(self, token_vocab, *args, **kwargs):
43 | """"""
44 |
45 | super(BaseVocab, self).__init__(*args, **kwargs)
46 | self._cased = super(BaseVocab, self).cased
47 |
48 | SubtokenVocab.__setattr__(self, '_token_vocab', token_vocab)
49 | self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name)
50 | self._vocabs = [NgramVocab.from_vocab(self.token_vocab, i+1, cased=self.cased) for i in xrange(self.max_n)]
51 | self._special_tokens = super(BaseVocab, self).special_tokens
52 | self._special_tokens_set = set(self._special_tokens)
53 | SubtokenVocab._set_special_tokens(self)
54 | self._tok2idx = {}
55 |
56 | for vocab in self:
57 | assert vocab.token_vocab is self.token_vocab
58 | return
59 |
60 | #=============================================================
61 | def add_files(self, conll_files):
62 | """"""
63 |
64 | self.token_vocab.count(conll_files)
65 | return
66 |
67 | #=============================================================
68 | def index_tokens(self):
69 | """"""
70 |
71 | n_buckets = self.n_buckets
72 | tok2idxs = {token: [vocab.subtoken_indices(token) for vocab in self] for token in self.token_vocab.counts}
73 | with Bucketer.from_configurable(self, self.n_buckets, name='bucketer-%s'%self.name) as bucketer:
74 | splits = bucketer.compute_splits(len(indices[0]) for indices in tok2idxs.values())
75 | bucketer.plot()
76 | with self.multibucket.open(splits, depth=len(self)):
77 | for index, special_token in enumerate(self.special_tokens):
78 | self.tok2idx[special_token] = self.multibucket.add([[index]*len(self)])
79 | for token, _ in self.sorted_counts(self.token_vocab.counts):
80 | indices = tok2idxs[token]
81 | sequence = [[indices[i][j] for i in xrange(len(indices)) if j < len(indices[i])] for j in xrange(len(indices[0]))]
82 | self.tok2idx[token] = self.multibucket.add(sequence)
83 | return
84 |
85 | #=============================================================
86 | def __call__(self, placeholder, keep_prob=None, moving_params=None):
87 | return SubtokenVocab.__call__(self, placeholder, keep_prob=keep_prob, moving_params=moving_params)
88 |
89 | def index(self, token):
90 | return SubtokenVocab.index(self, token)
91 |
92 | def generate_placeholder(self):
93 | return SubtokenVocab.generate_placeholder(self)
94 |
95 | #=============================================================
96 | def embedding_lookup(self, placeholders, embed_keep_prob=None, moving_params=None):
97 | """"""
98 |
99 | if moving_params is None:
100 | shape = tf.shape(placeholders)
101 | shape = tf.stack([shape[0], 1, shape[2]])
102 | placeholders = la.random_where(embed_keep_prob, placeholders, self.UNK, shape=shape)
103 | embeddings = [vocab.embedding_lookup(placeholders[:,:,i], embed_keep_prob=1, moving_params=moving_params) for i, vocab in enumerate(self)]
104 | return tf.stack(embeddings, axis=2)
105 |
106 | #=============================================================
107 | def __iter__(self):
108 | return (vocab for vocab in self._vocabs)
109 | def __getitem__(self, key):
110 | return self._vocabs[key]
111 | def __len__(self):
112 | return len(self._vocabs)
113 |
114 | #***************************************************************
115 | if __name__ == '__main__':
116 | """"""
117 |
118 | from parser import Configurable
119 | from parser.vocabs import WordVocab, NgramMultivocab
120 |
121 | configurable = Configurable()
122 | token_vocab = WordVocab.from_configurable(configurable)
123 | ngram_multivocab = NgramMultivocab.from_vocab(token_vocab)
124 | ngram_multivocab.add_files(configurable.valid_files)
125 | ngram_multivocab.index_tokens()
126 | print("Indices for '': %s" % str(ngram_multivocab.index('')))
127 | print("Indices for 'the': %s" % str(ngram_multivocab.index('the')))
128 | print("Indices for 'The': %s" % str(ngram_multivocab.index('The')))
129 | print('NgramMultivocab passes')
130 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/ngram_vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import codecs
24 | from collections import Counter
25 |
26 | import numpy as np
27 | import tensorflow as tf
28 |
29 | from parser.vocabs import TokenVocab, SubtokenVocab, CharVocab
30 | from parser import Multibucket
31 |
32 | __all__ = ['NgramVocab']
33 |
34 | #***************************************************************
35 | class NgramVocab(SubtokenVocab):
36 | """"""
37 |
38 | #=============================================================
39 | def __init__(self, n, token_vocab, *args, **kwargs):
40 | """"""
41 |
42 | recount = kwargs.pop('recount', False)
43 | initialize_zero = kwargs.pop('initialize_zero', False)
44 | super(TokenVocab, self).__init__(*args, **kwargs)
45 |
46 | self._n = n
47 | self._token_vocab = token_vocab
48 | self._token_counts = Counter()
49 | self._subtoken_vocab = CharVocab.from_vocab(self.token_vocab)
50 | self._multibucket = Multibucket.from_configurable(self, embed_model=self.embed_model, name=self.name)
51 |
52 | if recount:
53 | self.count()
54 | else:
55 | if os.path.isfile(self.filename):
56 | self.load()
57 | else:
58 | self.count()
59 | self.dump()
60 | self.index_vocab()
61 |
62 | embed_dims = [len(self), self.embed_size]
63 | if initialize_zero:
64 | self.embeddings = np.zeros(embed_dims)
65 | else:
66 | self.embeddings = np.random.randn(*embed_dims)
67 | return
68 |
69 | #=============================================================
70 | def count(self):
71 | """"""
72 |
73 | special_tokens = set(self.token_vocab.special_tokens)
74 | for token in self.token_vocab:
75 | if token not in special_tokens:
76 | idxs = self.subtoken_vocab.subtoken_indices(token)
77 | idxs = [self.subtoken_vocab.START] + idxs + [self.subtoken_vocab.STOP]
78 | if len(idxs) > self.n:
79 | for i in xrange(len(idxs) - self.n):
80 | subtoken = ''.join(self.subtoken_vocab[idxs[i:i+self.n]])
81 | self.counts[subtoken] += 1
82 | self.token_counts[subtoken] += self.token_vocab.counts[token]
83 | return
84 |
85 | #=============================================================
86 | def subtoken_indices(self, token):
87 | """"""
88 |
89 | idxs = self.subtoken_vocab.subtoken_indices(token)
90 | idxs = [self.subtoken_vocab.START] + idxs + [self.subtoken_vocab.STOP]
91 | if len(idxs) <= self.n:
92 | return [self.PAD]
93 | else:
94 | subtokens = []
95 | for i in xrange(len(idxs) - self.n):
96 | subtokens.append(''.join(self.subtoken_vocab[idxs[i:i+self.n]]))
97 | return self[subtokens]
98 |
99 | #=============================================================
100 | @property
101 | def n(self):
102 | return self._n
103 | @property
104 | def subtoken_vocab(self):
105 | return self._subtoken_vocab
106 | @property
107 | def name(self):
108 | return '%d-%s' % (self.n, super(NgramVocab, self).name)
109 |
110 | #=============================================================
111 | def __setattr__(self, name, value):
112 | if name == '_subtoken_vocab':
113 | self._conll_idx = value.conll_idx
114 | if self.cased is None:
115 | self._cased = value.cased
116 | elif self.cased != value.cased:
117 | cls = value.__class__
118 | value = cls.from_configurable(value, value.token_vocab,
119 | cased=self.cased,
120 | recount=True)
121 | super(NgramVocab, self).__setattr__(name, value)
122 | return
123 |
124 | #***************************************************************
125 | if __name__ == '__main__':
126 | """"""
127 |
128 | from parser import Configurable
129 | from parser.vocabs import WordVocab, CharVocab, NgramVocab
130 |
131 | configurable = Configurable()
132 | token_vocab = WordVocab.from_configurable(configurable, 1)
133 | if os.path.isfile('saves/defaults/2-ngrams.txt'):
134 | os.remove('saves/defaults/2-ngrams.txt')
135 | ngram_vocab = NgramVocab.from_vocab(token_vocab, 2)
136 | ngram_vocab = NgramVocab.from_vocab(token_vocab, 2)
137 | ngram_vocab.token_vocab.count(conll_files = configurable.valid_files)
138 | ngram_vocab.index_tokens()
139 | ngram_vocab.fit_to_zipf()
140 | print('NgramVocab passes')
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/pretrained_vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import codecs
24 | import gzip
25 | import warnings
26 | try:
27 | from backports import lzma
28 | except:
29 | warnings.warn('Install backports.lzma for xz support')
30 | from collections import Counter
31 |
32 | import numpy as np
33 | import tensorflow as tf
34 |
35 | import parser.neural.linalg as linalg
36 | from parser.vocabs.base_vocab import BaseVocab
37 |
38 | #***************************************************************
39 | class PretrainedVocab(BaseVocab):
40 | """"""
41 |
42 | #=============================================================
43 | def __init__(self, token_vocab, *args, **kwargs):
44 | """"""
45 |
46 | super(PretrainedVocab, self).__init__(*args, **kwargs)
47 |
48 | self._token_vocab = token_vocab
49 |
50 | self.load()
51 | self.count()
52 | return
53 |
54 | #=============================================================
55 | def __call__(self, placeholder=None, moving_params=None):
56 | """"""
57 |
58 | embeddings = super(PretrainedVocab, self).__call__(placeholder, moving_params=moving_params)
59 | # (n x b x d') -> (n x b x d)
60 | with tf.variable_scope(self.name.title()):
61 | matrix = linalg.linear(embeddings, self.token_embed_size, moving_params=moving_params)
62 | if moving_params is None:
63 | with tf.variable_scope('Linear', reuse=True):
64 | weights = tf.get_variable('Weights')
65 |         tf.losses.add_loss(tf.nn.l2_loss(tf.matmul(tf.transpose(weights), weights) - tf.eye(self.token_embed_size)))  # penalize W^T W - I, nudging the projection toward orthonormal columns
66 | return matrix
67 | #return embeddings # changed in saves2/test8
68 |
69 | #=============================================================
70 | def setup(self):
71 | """"""
72 |
73 | self.placeholder = None
74 | with tf.device('/cpu:0'):
75 | with tf.variable_scope(self.name.title()):
76 | self._embeddings = tf.Variable(self._embeddings_array, name='Embeddings', dtype=tf.float32, trainable=False)
77 | return
78 |
79 | #=============================================================
80 | def load(self):
81 | """"""
82 |
83 | embeddings = []
84 | cur_idx = len(self.special_tokens)
85 | max_rank = self.max_rank
86 | if self.filename.endswith('.xz'):
87 | open_func = lzma.open
88 | else:
89 | open_func = codecs.open
90 | with open_func(self.filename, 'rb') as f:
91 | reader = codecs.getreader('utf-8')(f, errors='ignore')
92 | if self.skip_header == True:
93 | reader.readline()
94 | for line_num, line in enumerate(reader):
95 | if (not max_rank) or line_num < max_rank:
96 | line = line.rstrip().split(' ')
97 | if len(line) > 1:
98 | embeddings.append(np.array(line[1:], dtype=np.float32))
99 | self[line[0]] = cur_idx
100 | cur_idx += 1
101 | else:
102 | break
103 | try:
104 | embeddings = np.stack(embeddings)
105 | embeddings = np.pad(embeddings, ( (len(self.special_tokens),0), (0,0) ), 'constant')
106 | self._embeddings_array = np.stack(embeddings)
107 | self._embed_size = self._embeddings_array.shape[1]
108 | except:
109 | shapes = set([embedding.shape for embedding in embeddings])
110 | raise ValueError("Couldn't stack embeddings with shapes in %s" % shapes)
111 | return
112 |
113 | #=============================================================
114 | def count(self):
115 | """"""
116 |
117 | if self.token_vocab is not None:
118 | zipf = self.token_vocab.fit_to_zipf(plot=False)
119 | zipf_freqs = zipf.predict(np.arange(len(self))+1)
120 | else:
121 | zipf_freqs = -np.log(np.arange(len(self))+1)
122 | zipf_counts = zipf_freqs / np.min(zipf_freqs)
123 | for count, token in zip(zipf_counts, self.strings()):
124 | self.counts[token] = int(count)
125 | return
126 |
127 | #=============================================================
128 | @property
129 | def token_vocab(self):
130 | return self._token_vocab
131 | @property
132 | def token_embed_size(self):
133 | return (self.token_vocab or self).embed_size
134 | @property
135 | def embeddings(self):
136 | return super(PretrainedVocab, self).embeddings
137 | #@embeddings.setter
138 | #def embeddings(self, matrix):
139 | # self._embed_size = matrix.shape[1]
140 | # with tf.device('/cpu:0'):
141 | # with tf.variable_scope(self.name.title()):
142 | # self._embeddings = tf.Variable(matrix, name='Embeddings', trainable=False)
143 | # return
144 |
145 | #***************************************************************
146 | if __name__ == '__main__':
147 | """"""
148 |
149 | pretrained_vocab = PretrainedVocab(None)
150 | print('PretrainedVocab passes')
151 |
--------------------------------------------------------------------------------
/StanfordBiaffineParser-v2/parser/vocabs/token_vocab.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: UTF-8 -*-
3 |
4 | # Copyright 2016 Timothy Dozat
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 | import os
23 | import re
24 | import codecs
25 | from collections import Counter
26 |
27 | import numpy as np
28 | import tensorflow as tf
29 |
30 | from parser.vocabs.base_vocab import BaseVocab
31 | from parser.misc.zipf import Zipf
32 |
33 | __all__ = ['WordVocab', 'LemmaVocab', 'TagVocab', 'XTagVocab', 'RelVocab']
34 |
35 | #***************************************************************
36 | class TokenVocab(BaseVocab):
37 | """"""
38 |
39 | #=============================================================
40 | def __init__(self, *args, **kwargs):
41 | """"""
42 |
43 | recount = kwargs.pop('recount', False)
44 | initialize_zero = kwargs.pop('initialize_zero', True)
45 | super(TokenVocab, self).__init__(*args, **kwargs)
46 |
47 | if recount:
48 | self.count()
49 | else:
50 | if os.path.isfile(self.filename):
51 | self.load()
52 | else:
53 | self.count()
54 | self.dump()
55 | self.index_vocab()
56 |
57 | embed_dims = [len(self), self.embed_size]
58 | if initialize_zero:
59 | self._embeddings_array = np.zeros(embed_dims)
60 | else:
61 | self._embeddings_array = np.random.randn(*embed_dims)
62 | return
63 |
64 | #=============================================================
65 | def setup(self):
66 | """"""
67 |
68 | self.placeholder = None
69 | del self._embeddings
70 | with tf.device('/cpu:0'):
71 | with tf.variable_scope(self.name.title()):
72 | self._embeddings = tf.Variable(self._embeddings_array, name='Embeddings', dtype=tf.float32, trainable=True)
73 | return
74 |
75 |
76 | #=============================================================
77 | def count(self, conll_files=None):
78 | """"""
79 |
80 | if conll_files is None:
81 | conll_files = self.train_files
82 |
83 | for conll_file in conll_files:
84 | with codecs.open(conll_file, encoding='utf-8', errors='ignore') as f:
85 | for line_num, line in enumerate(f):
86 | try:
87 | line = line.strip()
88 | if line and not line.startswith('#'):
89 | line = line.split('\t')
90 | assert len(line) == 10
91 | token = line[self.conll_idx]
92 | if not self.cased:
93 | token = token.lower()
94 | self.counts[token] += 1
95 | except:
96 | raise ValueError('File %s is misformatted at line %d' % (conll_file, line_num+1))
97 | return
98 |
99 | #=============================================================
100 | def load(self):
101 | """"""
102 |
103 | with codecs.open(self.filename, encoding='utf-8') as f:
104 | for line_num, line in enumerate(f):
105 | try:
106 | line = line.rstrip()
107 | if line:
108 | line = line.split('\t')
109 | token, count = line
110 | self.counts[token] = int(count)
111 | except:
112 |           raise ValueError('File %s is misformatted at line %d' % (self.filename, line_num+1))
113 | return
114 |
115 | #=============================================================
116 | def dump(self):
117 | """"""
118 |
119 | with codecs.open(self.filename, 'w', encoding='utf-8') as f:
120 | for word, count in self.sorted_counts(self.counts):
121 | f.write('%s\t%d\n' % (word, count))
122 | return
123 |
124 | #=============================================================
125 | def index_vocab(self):
126 | """"""
127 |
128 | for token, count in self.sorted_counts(self.counts):
129 | if ((count >= self.min_occur_count) and
130 | token not in self and
131 | (not self.max_rank or len(self) < self.max_rank)):
132 | self[token] = len(self)
133 | return
134 |
135 | #=============================================================
136 | def fit_to_zipf(self, plot=True):
137 | """"""
138 |
139 | zipf = Zipf.from_configurable(self, self.counts, name='zipf-%s'%self.name)
140 | if plot:
141 | zipf.plot()
142 | return zipf
143 |
144 | #=============================================================
145 | @staticmethod
146 | def sorted_counts(counts):
147 | return sorted(counts.most_common(), key=lambda x: (-x[1], x[0]))
148 |
149 | #=============================================================
150 | @property
151 | def conll_idx(self):
152 | return self._conll_idx
153 |
154 | #***************************************************************
155 | class WordVocab(TokenVocab):
156 | _conll_idx = 1
157 | class LemmaVocab(WordVocab):
158 | _conll_idx = 2
159 | class TagVocab(TokenVocab):
160 | _conll_idx = 3
161 | class XTagVocab(TagVocab):
162 | _conll_idx = 4
163 | class RelVocab(TokenVocab):
164 | _conll_idx = 7
165 |
166 | #***************************************************************
167 | if __name__ == '__main__':
168 | """"""
169 |
170 | from parser import Configurable
171 | from parser.vocabs import PretrainedVocab, TokenVocab, WordVocab
172 |
173 | configurable = Configurable()
174 | if os.path.isfile('saves/defaults/words.txt'):
175 | os.remove('saves/defaults/words.txt')
176 | token_vocab = WordVocab.from_configurable(configurable, 1)
177 | token_vocab = WordVocab.from_configurable(configurable, 1)
178 | token_vocab.fit_to_zipf()
179 | #pretrained_vocab = PretrainedVocab.from_vocab(token_vocab)
180 | #assert min(pretrained_vocab.counts.values()) == 1
181 | print('TokenVocab passed')
182 |
--------------------------------------------------------------------------------
/convert_NLP4J_to_CoNLL.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os
3 | import sys
4 |
5 | #Convert NLP4J's 9-column output into CoNLL's 10-column format
6 | def convert(inputFile):
7 | writer = open(inputFile + ".conll", "w")
8 | for line in open(inputFile, "r").readlines():
9 | eles = line.strip().split()
10 | if len(eles) == 0:
11 | writer.write("\n")
12 | else:
13 | eles[4] = "_"
14 | eles.insert(4, eles[3])
15 | eles[8] = "_"
16 | eles[9] = "_"
17 | writer.write("\t".join(eles) + "\n")
18 |
19 | writer.close()
20 |
21 | if __name__ == "__main__":
22 | convert(sys.argv[1])
--------------------------------------------------------------------------------
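A worked example of the conversion above (a sketch, not part of the repository; the NLP4J token line is hypothetical). NLP4J's 9 tab-separated fields become 10 CoNLL columns: the POS tag is duplicated so it fills both POS columns, and the FEATS column and the last two columns are blanked out.

    # hypothetical 9-column NLP4J output line
    eles = "3\ttyrosine\ttyrosine\tNN\tfts=x\t4\tnn\t_\tO".split("\t")
    eles[4] = "_"            # clear the FEATS field
    eles.insert(4, eles[3])  # copy the POS tag into the second POS column
    eles[8] = "_"            # blank the ninth column
    eles[9] = "_"            # blank the tenth column
    print("\t".join(eles))
    # 3  tyrosine  tyrosine  NN  NN  _  4  nn  _  _   (tab-separated)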
/data/sentence_segmented.txt:
--------------------------------------------------------------------------------
1 | Induction of tyrosine phosphorylation and T-cell activation by vanadate peroxide, an inhibitor of protein tyrosine phosphatases.
2 | Rapid tyrosine phosphorylation of key cellular proteins is a crucial event in the transduction of activation signals to T-lymphocytes.
3 | The regulatory role of protein tyrosine phosphatases (PTPases) in this process was explored by studying the effects of a powerful PTPase inhibitor, vanadate peroxide (pervanadate), on the activation cascade of Jurkat human leukaemic T-cells.
--------------------------------------------------------------------------------
/data/tokenized_sentence_segmented.txt:
--------------------------------------------------------------------------------
1 | Induction of tyrosine phosphorylation and T-cell activation by vanadate peroxide , an inhibitor of protein tyrosine phosphatases .
2 | Rapid tyrosine phosphorylation of key cellular proteins is a crucial event in the transduction of activation signals to T-lymphocytes .
3 | The regulatory role of protein tyrosine phosphatases ( PTPases ) in this process was explored by studying the effects of a powerful PTPase inhibitor , vanadate peroxide ( pervanadate ) , on the activation cascade of Jurkat human leukaemic T-cells .
--------------------------------------------------------------------------------
/data/tokenized_sentence_segmented.txt.column:
--------------------------------------------------------------------------------
1 | 1 Induction _ _ _ _ _ _ _ _
2 | 2 of _ _ _ _ _ _ _ _
3 | 3 tyrosine _ _ _ _ _ _ _ _
4 | 4 phosphorylation _ _ _ _ _ _ _ _
5 | 5 and _ _ _ _ _ _ _ _
6 | 6 T-cell _ _ _ _ _ _ _ _
7 | 7 activation _ _ _ _ _ _ _ _
8 | 8 by _ _ _ _ _ _ _ _
9 | 9 vanadate _ _ _ _ _ _ _ _
10 | 10 peroxide _ _ _ _ _ _ _ _
11 | 11 , _ _ _ _ _ _ _ _
12 | 12 an _ _ _ _ _ _ _ _
13 | 13 inhibitor _ _ _ _ _ _ _ _
14 | 14 of _ _ _ _ _ _ _ _
15 | 15 protein _ _ _ _ _ _ _ _
16 | 16 tyrosine _ _ _ _ _ _ _ _
17 | 17 phosphatases _ _ _ _ _ _ _ _
18 | 18 . _ _ _ _ _ _ _ _
19 |
20 | 1 Rapid _ _ _ _ _ _ _ _
21 | 2 tyrosine _ _ _ _ _ _ _ _
22 | 3 phosphorylation _ _ _ _ _ _ _ _
23 | 4 of _ _ _ _ _ _ _ _
24 | 5 key _ _ _ _ _ _ _ _
25 | 6 cellular _ _ _ _ _ _ _ _
26 | 7 proteins _ _ _ _ _ _ _ _
27 | 8 is _ _ _ _ _ _ _ _
28 | 9 a _ _ _ _ _ _ _ _
29 | 10 crucial _ _ _ _ _ _ _ _
30 | 11 event _ _ _ _ _ _ _ _
31 | 12 in _ _ _ _ _ _ _ _
32 | 13 the _ _ _ _ _ _ _ _
33 | 14 transduction _ _ _ _ _ _ _ _
34 | 15 of _ _ _ _ _ _ _ _
35 | 16 activation _ _ _ _ _ _ _ _
36 | 17 signals _ _ _ _ _ _ _ _
37 | 18 to _ _ _ _ _ _ _ _
38 | 19 T-lymphocytes _ _ _ _ _ _ _ _
39 | 20 . _ _ _ _ _ _ _ _
40 |
41 | 1 The _ _ _ _ _ _ _ _
42 | 2 regulatory _ _ _ _ _ _ _ _
43 | 3 role _ _ _ _ _ _ _ _
44 | 4 of _ _ _ _ _ _ _ _
45 | 5 protein _ _ _ _ _ _ _ _
46 | 6 tyrosine _ _ _ _ _ _ _ _
47 | 7 phosphatases _ _ _ _ _ _ _ _
48 | 8 ( _ _ _ _ _ _ _ _
49 | 9 PTPases _ _ _ _ _ _ _ _
50 | 10 ) _ _ _ _ _ _ _ _
51 | 11 in _ _ _ _ _ _ _ _
52 | 12 this _ _ _ _ _ _ _ _
53 | 13 process _ _ _ _ _ _ _ _
54 | 14 was _ _ _ _ _ _ _ _
55 | 15 explored _ _ _ _ _ _ _ _
56 | 16 by _ _ _ _ _ _ _ _
57 | 17 studying _ _ _ _ _ _ _ _
58 | 18 the _ _ _ _ _ _ _ _
59 | 19 effects _ _ _ _ _ _ _ _
60 | 20 of _ _ _ _ _ _ _ _
61 | 21 a _ _ _ _ _ _ _ _
62 | 22 powerful _ _ _ _ _ _ _ _
63 | 23 PTPase _ _ _ _ _ _ _ _
64 | 24 inhibitor _ _ _ _ _ _ _ _
65 | 25 , _ _ _ _ _ _ _ _
66 | 26 vanadate _ _ _ _ _ _ _ _
67 | 27 peroxide _ _ _ _ _ _ _ _
68 | 28 ( _ _ _ _ _ _ _ _
69 | 29 pervanadate _ _ _ _ _ _ _ _
70 | 30 ) _ _ _ _ _ _ _ _
71 | 31 , _ _ _ _ _ _ _ _
72 | 32 on _ _ _ _ _ _ _ _
73 | 33 the _ _ _ _ _ _ _ _
74 | 34 activation _ _ _ _ _ _ _ _
75 | 35 cascade _ _ _ _ _ _ _ _
76 | 36 of _ _ _ _ _ _ _ _
77 | 37 Jurkat _ _ _ _ _ _ _ _
78 | 38 human _ _ _ _ _ _ _ _
79 | 39 leukaemic _ _ _ _ _ _ _ _
80 | 40 T-cells _ _ _ _ _ _ _ _
81 | 41 . _ _ _ _ _ _ _ _
82 |
83 |
--------------------------------------------------------------------------------
/get_ColumnFormat.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os
3 | import sys
4 |
5 | #Convert word-segmented corpus into 10-column format for dependency parsing
6 | def convert(inputFilePath):
7 | writer = open(inputFilePath + ".column", "w")
8 | lines = open(inputFilePath, "r").readlines()
9 | for line in lines:
10 | tok = line.strip().split()
11 | if not tok or line.strip() == '':
12 | writer.write("\n")
13 | else:
14 | count = 0
15 | for word in tok:
16 | count += 1
17 | writer.write(str(count) + "\t" + word + "\t" + '\t'.join(['_'] * 8) + "\n")
18 | writer.write("\n")
19 | writer.close()
20 |
21 | if __name__ == "__main__":
22 | convert(sys.argv[1])
23 | pass
--------------------------------------------------------------------------------
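Usage note for the script above: it turns a tokenized, sentence-segmented file (one sentence per line) into the 10-column format used as parser input. For example,

    python get_ColumnFormat.py data/tokenized_sentence_segmented.txt

writes data/tokenized_sentence_segmented.txt.column, the file shown earlier: each token on its own line as index, word form, and eight underscore placeholders, with a blank line after each sentence.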
/jPTDP-v1/README.md:
--------------------------------------------------------------------------------
1 | jPTDP: Neural network models for joint POS tagging and dependency parsing
2 |
3 | See [https://github.com/datquocnguyen/jPTDP](https://github.com/datquocnguyen/jPTDP) for more details.
--------------------------------------------------------------------------------